[PowerPC] Exploit xxspltiw and xxspltidp instructions

Exploits the VSX Vector Splat Immediate Word and VSX Vector Splat Immediate Double Precision instructions:

    xxspltiw XT,IMM32
    xxspltidp XT,IMM32

Differential Revision: https://reviews.llvm.org/D82911
2020-07-01 14:16:27 -05:00 · 2020-07-01 14:16:27 -05:00 · c5b4f03b53
parent 0670f855a7
commit c5b4f03b53
6 changed files with 539 additions and 28 deletions
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@ -1473,6 +1473,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
  case PPCISD::STFIWX:          return "PPCISD::STFIWX";
  case PPCISD::VPERM:           return "PPCISD::VPERM";
  case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
+  case PPCISD::XXSPLTI_SP_TO_DP:
+    return "PPCISD::XXSPLTI_SP_TO_DP";
  case PPCISD::VECINSERT:       return "PPCISD::VECINSERT";
  case PPCISD::XXPERMDI:        return "PPCISD::XXPERMDI";
  case PPCISD::VECSHL:          return "PPCISD::VECSHL";
@ -8966,9 +8968,9 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
 // Vector related lowering.
 //

-/// BuildSplatI - Build a canonical splati of Val with an element size of
-/// SplatSize.  Cast the result to VT.
-static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
+/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
+/// element size of SplatSize. Cast the result to VT.
+static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
                                      SelectionDAG &DAG, const SDLoc &dl) {
  static const MVT VTys[] = { // canonical VT to use for each size.
    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
@ -8976,9 +8978,11 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,

  EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];

-  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
-  if (Val == -1)
+  // For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.
+  if (Val == ((1LU << (SplatSize * 8)) - 1)) {
    SplatSize = 1;
+    Val = 0xFF;
+  }

  EVT CanonicalVT = VTys[SplatSize-1];

@ -9113,6 +9117,34 @@ static const SDValue *getNormalLoadInput(const SDValue &Op) {
  return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
 }

+// Try to narrow ArgAPFloat to IEEE single precision. The conversion is done on
+// a copy, and the copy is written back to ArgAPFloat only when the conversion
+// loses no information and the result is not a denormal number. Returns true
+// when ArgAPFloat was replaced and false when it was left untouched.
+bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
+  APFloat Narrowed = ArgAPFloat;
+  bool Inexact = true;
+  Narrowed.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+                   &Inexact);
+  // Reject anything that is not exactly representable as a normal single.
+  if (Inexact || Narrowed.isDenormal())
+    return false;
+  ArgAPFloat = Narrowed;
+  return true;
+}
+
+// Interpret the 64-bit ArgAPInt as the bit pattern of an IEEE double and try
+// to narrow it to a non-denormal IEEE single without losing information. On
+// success ArgAPInt is overwritten with the bits of the single precision value
+// (as returned by APFloat::bitcastToAPInt) and true is returned; otherwise
+// ArgAPInt is left untouched and false is returned.
+bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
+  assert(ArgAPInt.getBitWidth() == 64 &&
+         "Expected the bit pattern of a 64-bit double");
+  // Build the APFloat straight from the bits instead of going through a host
+  // `double`, so the outcome does not depend on how the host treats the value
+  // (e.g. NaN bit patterns).
+  APFloat APFloatDp(APFloat::IEEEdouble(), ArgAPInt);
+  if (!convertToNonDenormSingle(APFloatDp))
+    return false;
+  ArgAPInt = APFloatDp.bitcastToAPInt();
+  return true;
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.  If we CAN select this case, and if it
 // selects to a single instruction, return Op.  Otherwise, if we can codegen
@ -9232,9 +9264,23 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
-  if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
-                             HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
-      SplatBitSize > 32) {
+  bool BVNIsConstantSplat =
+      BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                           HasAnyUndefs, 0, !Subtarget.isLittleEndian());
+
+  // If it is a splat of a double, check if we can shrink it to a 32 bit
+  // non-denormal float which when converted back to double gives us the same
+  // double. This is to exploit the XXSPLTIDP instruction.
+  if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() &&
+      (SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) &&
+      convertToNonDenormSingle(APSplatBits)) {
+    SDValue SplatNode = DAG.getNode(
+        PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
+        DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
+    return DAG.getBitcast(Op.getValueType(), SplatNode);
+  }
+
+  if (!BVNIsConstantSplat || SplatBitSize > 32) {

    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
    // Handle load-and-splat patterns as we have instructions that will do this
@ -9273,8 +9319,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
    return SDValue();
  }

-  unsigned SplatBits = APSplatBits.getZExtValue();
-  unsigned SplatUndef = APSplatUndef.getZExtValue();
+  uint64_t SplatBits = APSplatBits.getZExtValue();
+  uint64_t SplatUndef = APSplatUndef.getZExtValue();
  unsigned SplatSize = SplatBitSize / 8;

  // First, handle single instruction cases.
@ -9289,17 +9335,30 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
    return Op;
  }

-  // We have XXSPLTIB for constant splats one byte wide
-  // FIXME: SplatBits is an unsigned int being cast to an int while passing it
-  // as an argument to BuildSplatiI. Given SplatSize == 1 it is okay here.
+  // We have XXSPLTIW for constant splats four bytes wide.
+  // Given vector length is a multiple of 4, 2-byte splats can be replaced
+  // with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to
+  // make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
+  // turned into a 4-byte splat of 0xABABABAB.
+  if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
+    return getCanonicalConstSplat((SplatBits |= SplatBits << 16), SplatSize * 2,
+                                  Op.getValueType(), DAG, dl);
+
+  if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
+    return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
+                                  dl);
+
+  // We have XXSPLTIB for constant splats one byte wide.
  if (Subtarget.hasP9Vector() && SplatSize == 1)
-    return BuildSplatI(SplatBits, SplatSize, Op.getValueType(), DAG, dl);
+    return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
+                                  dl);

  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
  int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
                    (32-SplatBitSize));
  if (SextVal >= -16 && SextVal <= 15)
-    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
+    return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,
+                                  dl);

  // Two instruction sequences.

@ -9330,7 +9389,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
  // for fneg/fabs.
  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
    // Make -1 and vspltisw -1:
-    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
+    SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl);

    // Make the VSLW intrinsic, computing 0x8000_0000.
    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
@ -9358,7 +9417,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,

    // vsplti + shl self.
    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
-      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
        Intrinsic::ppc_altivec_vslw
@ -9369,7 +9428,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,

    // vsplti + srl self.
    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
-      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
        Intrinsic::ppc_altivec_vsrw
@ -9380,7 +9439,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,

    // vsplti + sra self.
    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
-      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
        Intrinsic::ppc_altivec_vsraw
@ -9392,7 +9451,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
    // vsplti + rol self.
    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
-      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
        Intrinsic::ppc_altivec_vrlw
@ -9403,19 +9462,19 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,

    // t = vsplti c, result = vsldoi t, t, 1
    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
-      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 2
    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
-      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 3
    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
-      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
@ -10817,9 +10876,9 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::v4i32) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

-    SDValue Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG, dl);
-    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.
-
+    SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl);
+    // +16 as shift amt.
+    SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl);
    SDValue RHSSwap =   // = vrlw RHS, 16
      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);

@ -16239,6 +16298,13 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
    return false;
  case MVT::f32:
  case MVT::f64:
+    if (Subtarget.hasPrefixInstrs()) {
+      // With prefixed instructions, we can materialize anything that can be
+      // represented with a 32-bit immediate, not just positive zero.
+      APFloat APFloatOfImm = Imm;
+      return convertToNonDenormSingle(APFloatOfImm);
+    }
+    LLVM_FALLTHROUGH;
  case MVT::ppcf128:
    return Imm.isPosZero();
  }
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@ -97,6 +97,11 @@ namespace llvm {
    ///
    XXSPLT,

+    /// XXSPLTI_SP_TO_DP - The PPC VSX splat instructions for immediates for
+    /// converting immediate single precision numbers to double precision
+    /// vector or scalar.
+    XXSPLTI_SP_TO_DP,
+
    /// VECINSERT - The PPC vector insert instruction
    ///
    VECINSERT,
@ -1273,6 +1278,9 @@ namespace llvm {
  bool isIntS16Immediate(SDNode *N, int16_t &Imm);
  bool isIntS16Immediate(SDValue Op, int16_t &Imm);

+  bool convertToNonDenormSingle(APInt &ArgAPInt);
+  bool convertToNonDenormSingle(APFloat &ArgAPFloat);
+
 } // end namespace llvm

 #endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@ -50,6 +50,10 @@ def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>,
  SDTCisVec<1>, SDTCisInt<2>
 ]>;

+def SDT_PPCSpToDp : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>,
+  SDTCisInt<1>
+]>;
+
 def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
  SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
 ]>;
@ -194,6 +198,7 @@ def PPCaddiDtprelL   : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;

 def PPCvperm     : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
 def PPCxxsplt    : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
+def PPCxxspltidp : SDNode<"PPCISD::XXSPLTI_SP_TO_DP", SDT_PPCSpToDp, []>;
 def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>;
 def PPCxxpermdi  : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
 def PPCvecshl    : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
@ -326,6 +331,23 @@ def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>;
 // PowerPC specific transformation functions and pattern fragments.
 //

+// Matches a floating point immediate other than positive zero that can be
+// narrowed to a non-denormal single precision value without losing
+// information.
+def nzFPImmAsi32 : PatLeaf<(fpimm), [{
+  if (N->isExactlyValue(+0.0))
+    return false;
+  APFloat SingleCandidate = N->getValueAPF();
+  return convertToNonDenormSingle(SingleCandidate);
+}]>;
+
+// Narrow the floating point immediate to single precision and return the
+// resulting bit pattern as an i32 target constant. Only meant for immediates
+// that are known to convert successfully (e.g. ones matched by nzFPImmAsi32);
+// a failed conversion would leave the wider bit pattern in place.
+def getFPAs32BitInt : SDNodeXForm<fpimm, [{
+  APFloat APFloatOfN = N->getValueAPF();
+  bool Converted = convertToNonDenormSingle(APFloatOfN);
+  assert(Converted &&
+         "Immediate cannot be represented as a non-denormal single");
+  (void)Converted;
+  return CurDAG->getTargetConstant(APFloatOfN.bitcastToAPInt().getZExtValue(),
+                                   SDLoc(N), MVT::i32);
+}]>;
+
 def SHL32 : SDNodeXForm<imm, [{
  // Transformation function: 31 - imm
  return getI32Imm(31 - N->getZExtValue(), SDLoc(N));
@ -392,6 +414,7 @@ def immZExt16  : PatLeaf<(imm), [{
 def immNonAllOneAnyExt8 : ImmLeaf<i32, [{
  return (isInt<8>(Imm) && (Imm != -1)) || (isUInt<8>(Imm) && (Imm != 0xFF));
 }]>;
+// An i32 immediate that is neither zero nor all ones (-1).
+def i32immNonAllOneNonZero : ImmLeaf<i32, [{
+  return Imm != 0 && Imm != -1;
+}]>;
 def immSExt5NonZero : ImmLeaf<i32, [{ return Imm && isInt<5>(Imm); }]>;

 // imm16Shifted* - These match immediates where the low 16-bits are zero.  There
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@ -704,7 +704,8 @@ let Predicates = [PrefixInstrs] in {
  def XXSPLTIDP : 8RR_DForm_IMM32_XT6<32, 2, (outs vsrc:$XT),
                                      (ins i32imm:$IMM32),
                                      "xxspltidp $XT, $IMM32", IIC_VecGeneral,
-                                      []>;
+                                      [(set v2f64:$XT,
+                                            (PPCxxspltidp i32:$IMM32))]>;
  def XXSPLTI32DX :
      8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
                             (ins vsrc:$XTi, i1imm:$IX, i32imm:$IMM32),
@ -822,3 +823,17 @@ let Predicates = [IsISA3_1] in {
  def : Pat<(v2i64 (int_ppc_vsx_xxgenpcvdm v2i64:$VRB, imm:$IMM)),
            (v2i64 (COPY_TO_REGCLASS (XXGENPCVDM $VRB, imm:$IMM), VRRC))>;
 }
+
+// Selection patterns for the prefixed splat-immediate instructions.
+let Predicates = [PrefixInstrs], AddedComplexity = 400 in {
+ // Word splat of a 32-bit immediate that is neither zero nor all ones.
+ def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
+                                i32immNonAllOneNonZero:$A,
+                                i32immNonAllOneNonZero:$A,
+                                i32immNonAllOneNonZero:$A)),
+           (v4i32 (XXSPLTIW imm:$A))>;
+ // Scalar f32/f64 immediates that fit a non-denormal single precision value.
+ foreach ScalarTy = [f32, f64] in
+  def : Pat<(ScalarTy nzFPImmAsi32:$A),
+            (COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)),
+                              VSFRC)>;
+}
--- a/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm-CPload-pcrel.ll
@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | FileCheck %s \
+; RUN:     --check-prefix=CHECK-NOPCREL
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN:     -mattr=-pcrelative-memops -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
+; RUN:     FileCheck %s --check-prefix=CHECK-NOPCREL
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN:     -ppc-asm-full-reg-names -target-abi=elfv2 -mcpu=pwr10 < %s | \
+; RUN:     FileCheck %s
+
+define dso_local <2 x double> @testDoubleToDoubleFail() local_unnamed_addr {
+; CHECK-LABEL: testDoubleToDoubleFail:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plxv vs34, .LCPI0_0@PCREL(0), 1
+; CHECK-NEXT:    blr
+;
+; CHECK-NOPCREL-LABEL: testDoubleToDoubleFail:
+; CHECK-NOPCREL:       # %bb.0: # %entry
+; CHECK-NOPCREL-NEXT:    addis r3, r2, .LCPI0_0@toc@ha
+; CHECK-NOPCREL-NEXT:    addi r3, r3, .LCPI0_0@toc@l
+; CHECK-NOPCREL-NEXT:    lxvx vs34, 0, r3
+; CHECK-NOPCREL-NEXT:    blr
+
+entry:
+  ret <2 x double> <double 3.423300e+02, double 3.423300e+02>
+}
+
+define dso_local <2 x double> @testFloatDenormToDouble() local_unnamed_addr {
+; CHECK-LABEL: testFloatDenormToDouble:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plxv vs34, .LCPI1_0@PCREL(0), 1
+; CHECK-NEXT:    blr
+;
+; CHECK-NOPCREL-LABEL: testFloatDenormToDouble:
+; CHECK-NOPCREL:       # %bb.0: # %entry
+; CHECK-NOPCREL-NEXT:    addis r3, r2, .LCPI1_0@toc@ha
+; CHECK-NOPCREL-NEXT:    addi r3, r3, .LCPI1_0@toc@l
+; CHECK-NOPCREL-NEXT:    lxvx vs34, 0, r3
+; CHECK-NOPCREL-NEXT:    blr
+
+entry:
+  ret <2 x double> <double 0x380B38FB80000000, double 0x380B38FB80000000>
+}
+
+define dso_local <2 x double> @testDoubleToDoubleNaNFail() local_unnamed_addr {
+; CHECK-LABEL: testDoubleToDoubleNaNFail:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plxv vs34, .LCPI2_0@PCREL(0), 1
+; CHECK-NEXT:    blr
+;
+; CHECK-NOPCREL-LABEL: testDoubleToDoubleNaNFail:
+; CHECK-NOPCREL:       # %bb.0: # %entry
+; CHECK-NOPCREL-NEXT:    addis r3, r2, .LCPI2_0@toc@ha
+; CHECK-NOPCREL-NEXT:    addi r3, r3, .LCPI2_0@toc@l
+; CHECK-NOPCREL-NEXT:    lxvx vs34, 0, r3
+; CHECK-NOPCREL-NEXT:    blr
+
+entry:
+  ret <2 x double> <double 0xFFFFFFFFFFFFFFF0, double 0xFFFFFFFFFFFFFFF0>
+}
+
+define dso_local double @testDoubleNonRepresentableScalar() local_unnamed_addr {
+; CHECK-LABEL: testDoubleNonRepresentableScalar:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plfd f1, .LCPI3_0@PCREL(0), 1
+; CHECK-NEXT:    blr
+;
+; CHECK-NOPCREL-LABEL: testDoubleNonRepresentableScalar:
+; CHECK-NOPCREL:       # %bb.0: # %entry
+; CHECK-NOPCREL-NEXT:    addis r3, r2, .LCPI3_0@toc@ha
+; CHECK-NOPCREL-NEXT:    lfd f1, .LCPI3_0@toc@l(r3)
+; CHECK-NOPCREL-NEXT:    blr
+
+entry:
+  ret double 3.423300e+02
+}
+
+define dso_local float @testFloatDenormScalar() local_unnamed_addr {
+; CHECK-LABEL: testFloatDenormScalar:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plfs f1, .LCPI4_0@PCREL(0), 1
+; CHECK-NEXT:    blr
+;
+; CHECK-NOPCREL-LABEL: testFloatDenormScalar:
+; CHECK-NOPCREL:       # %bb.0: # %entry
+; CHECK-NOPCREL-NEXT:    addis r3, r2, .LCPI4_0@toc@ha
+; CHECK-NOPCREL-NEXT:    lfs f1, .LCPI4_0@toc@l(r3)
+; CHECK-NOPCREL-NEXT:    blr
+
+entry:
+  ret float 0x380B38FB80000000
+}
+
+define dso_local double @testFloatDenormToDoubleScalar() local_unnamed_addr {
+; CHECK-LABEL: testFloatDenormToDoubleScalar:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    plfs f1, .LCPI5_0@PCREL(0), 1
+; CHECK-NEXT:    blr
+;
+; CHECK-NOPCREL-LABEL: testFloatDenormToDoubleScalar:
+; CHECK-NOPCREL:       # %bb.0: # %entry
+; CHECK-NOPCREL-NEXT:    addis r3, r2, .LCPI5_0@toc@ha
+; CHECK-NOPCREL-NEXT:    lfs f1, .LCPI5_0@toc@l(r3)
+; CHECK-NOPCREL-NEXT:    blr
+
+entry:
+  ret double 0x380B38FB80000000
+}
--- a/llvm/test/CodeGen/PowerPC/p10-splatImm.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm.ll
@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | FileCheck %s
+
+define dso_local <4 x i32> @testZero() local_unnamed_addr {
+; CHECK-LABEL: testZero:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x i32> zeroinitializer
+}
+
+define dso_local <4 x float> @testZeroF() local_unnamed_addr {
+; CHECK-LABEL: testZeroF:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x float> zeroinitializer
+}
+
+define dso_local <4 x i32> @testAllOneS() local_unnamed_addr {
+; CHECK-LABEL: testAllOneS:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv vs34, vs34, vs34
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+}
+
+define dso_local <4 x i32> @test5Bit() local_unnamed_addr {
+; CHECK-LABEL: test5Bit:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vspltisw v2, 7
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+}
+
+define dso_local <16 x i8> @test1ByteChar() local_unnamed_addr {
+; CHECK-LABEL: test1ByteChar:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltib vs34, 7
+; CHECK-NEXT:    blr
+
+entry:
+  ret <16 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+}
+
+define dso_local <4 x i32> @test1ByteSplatInt() local_unnamed_addr {
+; Here the word splat of 0xABABABAB (-1414812757) can be done with a byte
+; splat of 0xAB (171) using xxspltib, which avoids the use of xxspltiw.
+; CHECK-LABEL: test1ByteSplatInt:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltib vs34, 171
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x i32> <i32 -1414812757, i32 -1414812757, i32 -1414812757, i32 -1414812757>
+}
+
+define dso_local <4 x i32> @test5Bit2Ins() local_unnamed_addr {
+; Splats within the range [-32,31] can be done using two vsplti[bhw]
+; instructions, but we prefer the xxspltiw instruction to them.
+; CHECK-LABEL: test5Bit2Ins:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltiw vs34, 16
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x i32> <i32 16, i32 16, i32 16, i32 16>
+}
+
+define dso_local <4 x float> @testFloatNegZero() local_unnamed_addr {
+; 0.0f is not the same as -0.0f; this test splats -0.0f.
+; CHECK-LABEL: testFloatNegZero:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltiw vs34, -2147483648
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
+}
+
+define dso_local <4 x float> @testFloat() local_unnamed_addr {
+; CHECK-LABEL: testFloat:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltiw vs34, 1135323709
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x float> <float 0x40757547A0000000, float 0x40757547A0000000, float 0x40757547A0000000, float 0x40757547A0000000>
+}
+
+define dso_local <4 x float> @testIntToFloat() local_unnamed_addr {
+; CHECK-LABEL: testIntToFloat:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltiw vs34, 1135312896
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x float> <float 3.430000e+02, float 3.430000e+02, float 3.430000e+02, float 3.430000e+02>
+}
+
+define dso_local <4 x i32> @testUndefInt() local_unnamed_addr {
+; CHECK-LABEL: testUndefInt:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltiw vs34, 18
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x i32> <i32 18, i32 undef, i32 undef, i32 18>
+}
+
+define dso_local <4 x float> @testUndefIntToFloat() local_unnamed_addr {
+; CHECK-LABEL: testUndefIntToFloat:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltiw vs34, 1135312896
+; CHECK-NEXT:    blr
+
+entry:
+  ret <4 x float> <float 3.430000e+02, float undef, float undef, float 3.430000e+02>
+}
+
+define dso_local <2 x i64> @testPseudo8Byte() local_unnamed_addr {
+; CHECK-LABEL: testPseudo8Byte:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltiw vs34, -1430532899
+; CHECK-NEXT:    blr
+
+entry:
+  ret <2 x i64> <i64 -6144092014192636707, i64 -6144092014192636707>
+}
+
+define dso_local <8 x i16> @test2Byte() local_unnamed_addr {
+; CHECK-LABEL: test2Byte:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltiw vs34, 1179666
+; CHECK-NEXT:    blr
+
+entry:
+  ret <8 x i16> <i16 18, i16 18, i16 18, i16 18, i16 18, i16 18, i16 18, i16 18>
+}
+
+define dso_local <8 x i16> @test2ByteUndef() local_unnamed_addr {
+; CHECK-LABEL: test2ByteUndef:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltiw vs34, 1179666
+; CHECK-NEXT:    blr
+
+entry:
+  ret <8 x i16> <i16 18, i16 undef, i16 18, i16 18, i16 18, i16 undef, i16 18, i16 18>
+}
+
+define dso_local <2 x double> @testFloatToDouble() local_unnamed_addr {
+; CHECK-LABEL: testFloatToDouble:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltidp vs34, 1135290941
+; CHECK-NEXT:    blr
+
+entry:
+  ret <2 x double> <double 0x40756547A0000000, double 0x40756547A0000000>
+}
+
+define dso_local <2 x double> @testDoubleLower4ByteZero() local_unnamed_addr {
+; The expanded double has zeros in its low 32 bits. Mishandling the 64-bit
+; value returned by APInt::getZExtValue (for example, storing it in an
+; `unsigned` instead of a `uint64_t`) would truncate it and may cause this
+; test to fail.
+; CHECK-LABEL: testDoubleLower4ByteZero:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltidp vs34, 1093664768
+; CHECK-NEXT:    blr
+
+entry:
+  ret <2 x double> <double 1.100000e+01, double 1.100000e+01>
+}
+
+define dso_local <2 x double> @testDoubleToDoubleZero() local_unnamed_addr {
+; Should be using canonicalized form to splat zero and use shorter instructions
+; than xxspltidp.
+; CHECK-LABEL: testDoubleToDoubleZero:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor vs34, vs34, vs34
+; CHECK-NEXT:    blr
+
+entry:
+  ret <2 x double> zeroinitializer
+}
+
+define dso_local <2 x double> @testDoubleToDoubleNegZero() local_unnamed_addr {
+; CHECK-LABEL: testDoubleToDoubleNegZero:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltidp vs34, -2147483648
+; CHECK-NEXT:    blr
+
+entry:
+  ret <2 x double> <double -0.000000e+00, double -0.000000e+00>
+}
+
+define dso_local <2 x double> @testDoubleToDoubleNaN() local_unnamed_addr {
+; CHECK-LABEL: testDoubleToDoubleNaN:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltidp vs34, -16
+; CHECK-NEXT:    blr
+
+entry:
+  ret <2 x double> <double 0xFFFFFFFE00000000, double 0xFFFFFFFE00000000>
+}
+
+define dso_local <2 x double> @testDoubleToDoubleInfinity() local_unnamed_addr {
+; CHECK-LABEL: testDoubleToDoubleInfinity:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltidp vs34, 2139095040
+; CHECK-NEXT:    blr
+
+entry:
+  ret <2 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000>
+}
+
+define dso_local <2 x double> @testFloatToDoubleNaN() local_unnamed_addr {
+; CHECK-LABEL: testFloatToDoubleNaN:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltidp vs34, -1
+; CHECK-NEXT:    blr
+
+entry:
+  ret <2 x double> <double 0xFFFFFFFFE0000000, double 0xFFFFFFFFE0000000>
+}
+
+define dso_local <2 x double> @testFloatToDoubleInfinity() local_unnamed_addr {
+; CHECK-LABEL: testFloatToDoubleInfinity:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltidp vs34, 2139095040
+; CHECK-NEXT:    blr
+
+entry:
+  ret <2 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000>
+}
+
+define dso_local float @testFloatScalar() local_unnamed_addr {
+; CHECK-LABEL: testFloatScalar:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltidp vs1, 1135290941
+; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; CHECK-NEXT:    blr
+
+entry:
+  ret float 0x40756547A0000000
+}
+
+define dso_local float @testFloatZeroScalar() local_unnamed_addr {
+; CHECK-LABEL: testFloatZeroScalar:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor f1, f1, f1
+; CHECK-NEXT:    blr
+
+entry:
+  ret float 0.000000e+00
+}
+
+define dso_local double @testDoubleRepresentableScalar() local_unnamed_addr {
+; CHECK-LABEL: testDoubleRepresentableScalar:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxspltidp vs1, 1135290941
+; CHECK-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; CHECK-NEXT:    blr
+
+entry:
+  ret double 0x40756547A0000000
+}
+
+define dso_local double @testDoubleZeroScalar() local_unnamed_addr {
+; CHECK-LABEL: testDoubleZeroScalar:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor f1, f1, f1
+; CHECK-NEXT:    blr
+
+entry:
+  ret double 0.000000e+00
+}