From df1cb520df72d28eb3283931a0a86ec06ea4e536 Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Tue, 29 Nov 2016 16:11:34 +0000 Subject: [PATCH] [PowerPC] Improvements for BUILD_VECTOR Vol. 1 This patch corresponds to review: https://reviews.llvm.org/D25912 This is the first patch in a series of 4 that improve the lowering and combining for BUILD_VECTOR nodes on PowerPC. llvm-svn: 288152 --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 93 +++-- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 1 + llvm/lib/Target/PowerPC/PPCInstrVSX.td | 325 ++++++++++++++++-- .../PowerPC/p8-scalar_vector_conversions.ll | 8 +- .../PowerPC/power9-moves-and-splats.ll | 18 +- llvm/test/CodeGen/PowerPC/vsx.ll | 8 +- 6 files changed, 375 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8232132948f7..261e060600d8 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -563,10 +563,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); - if (Subtarget.hasP8Altivec()) - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); - if (Subtarget.hasVSX()) - setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); // Altivec does not contain unordered floating-point compare instructions setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand); @@ -676,6 +672,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FABS, MVT::v4f32, Legal); setOperationAction(ISD::FABS, MVT::v2f64, Legal); + if (Subtarget.hasDirectMove()) + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); + addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass); } @@ -688,9 +688,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); } - - if (Subtarget.isISA3_0() && Subtarget.hasDirectMove()) - setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); } if (Subtarget.hasQPX()) { @@ -7129,14 +7126,55 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT, return DAG.getNode(ISD::BITCAST, dl, VT, T); } -static bool isNonConstSplatBV(BuildVectorSDNode *BVN, EVT Type) { - if (BVN->isConstant() || BVN->getValueType(0) != Type) +/// Do we have an efficient pattern in a .td file for this node? +/// +/// \param V - pointer to the BuildVectorSDNode being matched +/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves? +/// +/// There are some patterns where it is beneficial to keep a BUILD_VECTOR +/// node as a BUILD_VECTOR node rather than expanding it. The patterns where +/// the opposite is true (expansion is beneficial) are: +/// - The node builds a vector out of integers that are not 32 or 64-bits +/// - The node builds a vector out of constants +/// - The node is a "load-and-splat" +/// In all other cases, we will choose to keep the BUILD_VECTOR. +static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V, + bool HasDirectMove) { + EVT VecVT = V->getValueType(0); + bool RightType = VecVT == MVT::v2f64 || VecVT == MVT::v4f32 || + (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32)); + if (!RightType) return false; - auto OpZero = BVN->getOperand(0); - for (int i = 1, e = BVN->getNumOperands(); i < e; i++) - if (BVN->getOperand(i) != OpZero) + + bool IsSplat = true; + bool IsLoad = false; + SDValue Op0 = V->getOperand(0); + + // This function is called in a block that confirms the node is not a constant + // splat. So a constant BUILD_VECTOR here means the vector is built out of + // different constants. + if (V->isConstant()) + return false; + for (int i = 0, e = V->getNumOperands(); i < e; ++i) { + if (V->getOperand(i).isUndef()) return false; - return true; + // We want to expand nodes that represent load-and-splat even if the + // loaded value is a floating point truncation or conversion to int. + if (V->getOperand(i).getOpcode() == ISD::LOAD || + (V->getOperand(i).getOpcode() == ISD::FP_ROUND && + V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || + (V->getOperand(i).getOpcode() == ISD::FP_TO_SINT && + V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) || + (V->getOperand(i).getOpcode() == ISD::FP_TO_UINT && + V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD)) + IsLoad = true; + // If the operands are different or the input is not a load and has more + // uses than just this BV node, then it isn't a splat. + if (V->getOperand(i) != Op0 || + (!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode()))) + IsSplat = false; + } + return !(IsSplat && IsLoad); } // If this is a case we can't handle, return null and let the default @@ -7261,14 +7299,11 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, !Subtarget.isLittleEndian()) || SplatBitSize > 32) { - // We can splat a non-const value on CPU's that implement ISA 3.0 - // in two ways: LXVWSX (load and splat) and MTVSRWS(move and splat). - auto OpZero = BVN->getOperand(0); - bool CanLoadAndSplat = OpZero.getOpcode() == ISD::LOAD && - BVN->isOnlyUserOf(OpZero.getNode()); - if (Subtarget.isISA3_0() && !CanLoadAndSplat && - (isNonConstSplatBV(BVN, MVT::v4i32) || - isNonConstSplatBV(BVN, MVT::v2i64))) + // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be + // lowered to VSX instructions under certain conditions. + // Without VSX, there is no pattern more efficient than expanding the node. + if (Subtarget.hasVSX() && + haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove())) return Op; return SDValue(); } @@ -7290,8 +7325,20 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, } // We have XXSPLTIB for constant splats one byte wide - if (Subtarget.isISA3_0() && Op.getValueType() == MVT::v16i8) + if (Subtarget.hasP9Vector() && SplatSize == 1) { + // This is a splat of 1-byte elements with some elements potentially undef. + // Rather than trying to match undef in the SDAG patterns, ensure that all + // elements are the same constant. + if (HasAnyUndefs || ISD::isBuildVectorAllOnes(BVN)) { + SmallVector Ops(16, DAG.getConstant(SplatBits, + dl, MVT::i32)); + SDValue NewBV = DAG.getBuildVector(MVT::v16i8, dl, Ops); + if (Op.getValueType() != MVT::v16i8) + return DAG.getBitcast(Op.getValueType(), NewBV); + return NewBV; + } return Op; + } // If the sign extended value is in the range [-16,15], use VSPLTI[bhw]. int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >> @@ -7539,7 +7586,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // If the source for the shuffle is a scalar_to_vector that came from a // 32-bit load, it will have used LXVWSX so we don't need to splat again. - if (Subtarget.isISA3_0() && + if (Subtarget.hasP9Vector() && ((isLittleEndian && SplatIdx == 3) || (!isLittleEndian && SplatIdx == 0))) { SDValue Src = V1.getOperand(0); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 5f10e0ccd1c5..074fa815293e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -327,6 +327,7 @@ def immZExt16 : PatLeaf<(imm), [{ return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue(); }], LO16>; def immSExt8 : ImmLeaf(Imm); }]>; +def immSExt5NonZero : ImmLeaf(Imm); }]>; // imm16Shifted* - These match immediates where the low 16-bits are zero. There // are two forms: imm16ShiftedSExt and imm16ShiftedZExt. These two forms are diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index daf22a13c444..f203c67a651d 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -570,18 +570,38 @@ let Uses = [RM] in { (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpsxds $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfctidz f64:$XB))]>; + let isCodeGenOnly = 1 in + def XSCVDPSXDSs : XX2Form<60, 344, + (outs vssrc:$XT), (ins vssrc:$XB), + "xscvdpsxds $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfctidz f32:$XB))]>; def XSCVDPSXWS : XX2Form<60, 88, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpsxws $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfctiwz f64:$XB))]>; + let isCodeGenOnly = 1 in + def XSCVDPSXWSs : XX2Form<60, 88, + (outs vssrc:$XT), (ins vssrc:$XB), + "xscvdpsxws $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfctiwz f32:$XB))]>; def XSCVDPUXDS : XX2Form<60, 328, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpuxds $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfctiduz f64:$XB))]>; + let isCodeGenOnly = 1 in + def XSCVDPUXDSs : XX2Form<60, 328, + (outs vssrc:$XT), (ins vssrc:$XB), + "xscvdpuxds $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfctiduz f32:$XB))]>; def XSCVDPUXWS : XX2Form<60, 72, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvdpuxws $XT, $XB", IIC_VecFP, [(set f64:$XT, (PPCfctiwuz f64:$XB))]>; + let isCodeGenOnly = 1 in + def XSCVDPUXWSs : XX2Form<60, 72, + (outs vssrc:$XT), (ins vssrc:$XB), + "xscvdpuxws $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfctiwuz f32:$XB))]>; def XSCVSPDP : XX2Form<60, 329, (outs vsfrc:$XT), (ins vsfrc:$XB), "xscvspdp $XT, $XB", IIC_VecFP, []>; @@ -624,13 +644,15 @@ let Uses = [RM] in { "xvcvspsxds $XT, $XB", IIC_VecFP, []>; def XVCVSPSXWS : XX2Form<60, 152, (outs vsrc:$XT), (ins vsrc:$XB), - "xvcvspsxws $XT, $XB", IIC_VecFP, []>; + "xvcvspsxws $XT, $XB", IIC_VecFP, + [(set v4i32:$XT, (fp_to_sint v4f32:$XB))]>; def XVCVSPUXDS : XX2Form<60, 392, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvspuxds $XT, $XB", IIC_VecFP, []>; def XVCVSPUXWS : XX2Form<60, 136, (outs vsrc:$XT), (ins vsrc:$XB), - "xvcvspuxws $XT, $XB", IIC_VecFP, []>; + "xvcvspuxws $XT, $XB", IIC_VecFP, + [(set v4i32:$XT, (fp_to_uint v4f32:$XB))]>; def XVCVSXDDP : XX2Form<60, 504, (outs vsrc:$XT), (ins vsrc:$XB), "xvcvsxddp $XT, $XB", IIC_VecFP, @@ -661,7 +683,8 @@ let Uses = [RM] in { [(set v2f64:$XT, (int_ppc_vsx_xvcvuxwdp v4i32:$XB))]>; def XVCVUXWSP : XX2Form<60, 168, (outs vsrc:$XT), (ins vsrc:$XB), - "xvcvuxwsp $XT, $XB", IIC_VecFP, []>; + "xvcvuxwsp $XT, $XB", IIC_VecFP, + [(set v4f32:$XT, (uint_to_fp v4i32:$XB))]>; // Rounding Instructions def XSRDPI : XX2Form<60, 73, @@ -1207,6 +1230,8 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. def : Pat<(f64 (extloadf32 xoaddr:$src)), (COPY_TO_REGCLASS (LXSSPX xoaddr:$src), VSFRC)>; + def : Pat<(f32 (fpround (extloadf32 xoaddr:$src))), + (f32 (LXSSPX xoaddr:$src))>; def : Pat<(f64 (fpextend f32:$src)), (COPY_TO_REGCLASS $src, VSFRC)>; @@ -1384,7 +1409,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. } // AddedComplexity = 400 } // HasP8Vector -let UseVSXReg = 1 in { +let UseVSXReg = 1, AddedComplexity = 400 in { let Predicates = [HasDirectMove] in { // VSX direct move instructions def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT), @@ -1730,6 +1755,7 @@ def VectorExtractions { dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC); } +let AddedComplexity = 400 in { // v4f32 scalar <-> vector conversions (BE) let Predicates = [IsBigEndian, HasP8Vector] in { def : Pat<(v4f32 (scalar_to_vector f32:$A)), @@ -1971,15 +1997,16 @@ def : Pat<(f64 (bitconvert i64:$S)), (f64 (MTVSRD $S))>; } +// Materialize a zero-vector of long long +def : Pat<(v2i64 immAllZerosV), + (v2i64 (XXLXORz))>; +} + def AlignValues { dag F32_TO_BE_WORD1 = (v4f32 (XXSLDWI (XSCVDPSPN $B), (XSCVDPSPN $B), 3)); dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC); } -// Materialize a zero-vector of long long -def : Pat<(v2i64 immAllZerosV), - (v2i64 (XXLXORz))>; - // The following VSX instructions were introduced in Power ISA 3.0 def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">; let AddedComplexity = 400, Predicates = [HasP9Vector] in { @@ -2474,23 +2501,8 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), (v4f32 (LXVWSX xoaddr:$src))>; - def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), - (v4i32 (MTVSRWS $A))>; - def : Pat<(v16i8 (build_vector immSExt8:$A, immSExt8:$A, immSExt8:$A, - immSExt8:$A, immSExt8:$A, immSExt8:$A, - immSExt8:$A, immSExt8:$A, immSExt8:$A, - immSExt8:$A, immSExt8:$A, immSExt8:$A, - immSExt8:$A, immSExt8:$A, immSExt8:$A, - immSExt8:$A)), - (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>; - def : Pat<(v16i8 immAllOnesV), - (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; - def : Pat<(v8i16 immAllOnesV), - (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; - def : Pat<(v4i32 immAllOnesV), - (v4i32 (XXSPLTIB 255))>; - def : Pat<(v2i64 immAllOnesV), - (v2i64 (XXSPLTIB 255))>; + def : Pat<(v4f32 (scalar_to_vector (f32 (fpround (extloadf32 xoaddr:$src))))), + (v4f32 (LXVWSX xoaddr:$src))>; // Build vectors from i8 loads def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)), @@ -2631,6 +2643,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (f64 (COPY_TO_REGCLASS (VEXTSB2Ds $A), VSFRC))>; def : Pat<(f64 (PPCVexts f64:$A, 2)), (f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>; + let isPseudo = 1 in { def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src), "#DFLOADf32", @@ -2647,18 +2660,260 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { } def : Pat<(f64 (extloadf32 iaddr:$src)), (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>; + def : Pat<(f32 (fpround (extloadf32 iaddr:$src))), + (f32 (DFLOADf32 iaddr:$src))>; } // end HasP9Vector, AddedComplexity -let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in { -def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)), - (v2i64 (MTVSRDD $rB, $rA))>; -def : Pat<(i64 (extractelt v2i64:$A, 0)), - (i64 (MFVSRLD $A))>; +// Integer extend helper dags 32 -> 64 +def AnyExts { + dag A = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32); + dag B = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $B, sub_32); + dag C = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $C, sub_32); + dag D = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $D, sub_32); } -let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in { -def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)), - (v2i64 (MTVSRDD $rB, $rA))>; -def : Pat<(i64 (extractelt v2i64:$A, 1)), - (i64 (MFVSRLD $A))>; +def DblToFlt { + dag A0 = (f32 (fpround (f64 (extractelt v2f64:$A, 0)))); + dag A1 = (f32 (fpround (f64 (extractelt v2f64:$A, 1)))); + dag B0 = (f32 (fpround (f64 (extractelt v2f64:$B, 0)))); + dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1)))); +} +def FltToIntLoad { + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 xoaddr:$A))))); +} +def FltToUIntLoad { + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (extloadf32 xoaddr:$A))))); +} +def FltToLongLoad { + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); +} +def FltToULongLoad { + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); +} +def FltToLong { + dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A)))); +} +def FltToULong { + dag A = (i64 (PPCmfvsr (PPCfctiduz (fpextend f32:$A)))); +} +def DblToInt { + dag A = (i32 (PPCmfvsr (f64 (PPCfctiwz f64:$A)))); +} +def DblToUInt { + dag A = (i32 (PPCmfvsr (f64 (PPCfctiwuz f64:$A)))); +} +def DblToLong { + dag A = (i64 (PPCmfvsr (f64 (PPCfctidz f64:$A)))); +} +def DblToULong { + dag A = (i64 (PPCmfvsr (f64 (PPCfctiduz f64:$A)))); +} +def DblToIntLoad { + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); +} +def DblToUIntLoad { + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); +} +def DblToLongLoad { + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); +} +def DblToULongLoad { + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (load xoaddr:$A))))); +} + +// FP merge dags (for f32 -> v4f32) +def MrgFP { + dag AC = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $A, VSRC), + (COPY_TO_REGCLASS $C, VSRC), 0)); + dag BD = (XVCVDPSP (XXPERMDI (COPY_TO_REGCLASS $B, VSRC), + (COPY_TO_REGCLASS $D, VSRC), 0)); + dag ABhToFlt = (XVCVDPSP (XXPERMDI $A, $B, 0)); + dag ABlToFlt = (XVCVDPSP (XXPERMDI $A, $B, 3)); + dag BAhToFlt = (XVCVDPSP (XXPERMDI $B, $A, 0)); + dag BAlToFlt = (XVCVDPSP (XXPERMDI $B, $A, 3)); +} + +// Patterns for BUILD_VECTOR nodes. +def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">; +let AddedComplexity = 400 in { + + let Predicates = [HasVSX] in { + // Build vectors of floating point converted to i32. + def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.A, + DblToInt.A, DblToInt.A)), + (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS $A), VSRC), 1))>; + def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.A, + DblToUInt.A, DblToUInt.A)), + (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS $A), VSRC), 1))>; + def : Pat<(v2i64 (build_vector DblToLong.A, DblToLong.A)), + (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), + (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), 0))>; + def : Pat<(v2i64 (build_vector DblToULong.A, DblToULong.A)), + (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), + (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>; + def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)), + (v4i32 (XXSPLTW (COPY_TO_REGCLASS + (XSCVDPSXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>; + def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), + (v4i32 (XXSPLTW (COPY_TO_REGCLASS + (XSCVDPUXWSs (LXSSPX xoaddr:$A)), VSRC), 1))>; + def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)), + (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>; + + // Build vectors of floating point converted to i64. + def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)), + (v2i64 (XXPERMDIs (COPY_TO_REGCLASS (XSCVDPSXDSs $A), VSFRC), 0))>; + def : Pat<(v2i64 (build_vector FltToULong.A, FltToULong.A)), + (v2i64 (XXPERMDIs (COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>; + def : Pat<(v2i64 (scalar_to_vector DblToLongLoad.A)), + (v2i64 (XVCVDPSXDS (LXVDSX xoaddr:$A)))>; + def : Pat<(v2i64 (scalar_to_vector DblToULongLoad.A)), + (v2i64 (XVCVDPUXDS (LXVDSX xoaddr:$A)))>; + } + + let Predicates = [HasVSX, NoP9Vector] in { + // Load-and-splat with fp-to-int conversion (using X-Form VSX loads). + def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)), + (v4i32 (XXSPLTW (COPY_TO_REGCLASS + (XSCVDPSXWS (LXSDX xoaddr:$A)), VSRC), 1))>; + def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)), + (v4i32 (XXSPLTW (COPY_TO_REGCLASS + (XSCVDPUXWS (LXSDX xoaddr:$A)), VSRC), 1))>; + def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)), + (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS + (LXSSPX xoaddr:$A), VSFRC)), 0))>; + def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)), + (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS + (LXSSPX xoaddr:$A), VSFRC)), 0))>; + } + + // Big endian, available on all targets with VSX + let Predicates = [IsBigEndian, HasVSX] in { + def : Pat<(v2f64 (build_vector f64:$A, f64:$B)), + (v2f64 (XXPERMDI + (COPY_TO_REGCLASS $A, VSRC), + (COPY_TO_REGCLASS $B, VSRC), 0))>; + + def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)), + (VMRGEW MrgFP.AC, MrgFP.BD)>; + def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, + DblToFlt.B0, DblToFlt.B1)), + (v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>; + } + + let Predicates = [IsLittleEndian, HasVSX] in { + // Little endian, available on all targets with VSX + def : Pat<(v2f64 (build_vector f64:$A, f64:$B)), + (v2f64 (XXPERMDI + (COPY_TO_REGCLASS $B, VSRC), + (COPY_TO_REGCLASS $A, VSRC), 0))>; + + def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)), + (VMRGEW MrgFP.AC, MrgFP.BD)>; + def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1, + DblToFlt.B0, DblToFlt.B1)), + (v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>; + } + + let Predicates = [HasDirectMove] in { + // Endianness-neutral constant splat on P8 and newer targets. The reason + // for this pattern is that on targets with direct moves, we don't expand + // BUILD_VECTOR nodes for v4i32. + def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A, + immSExt5NonZero:$A, immSExt5NonZero:$A)), + (v4i32 (VSPLTISW imm:$A))>; + } + + let Predicates = [IsBigEndian, HasDirectMove, NoP9Vector] in { + // Big endian integer vectors using direct moves. + def : Pat<(v2i64 (build_vector i64:$A, i64:$B)), + (v2i64 (XXPERMDI + (COPY_TO_REGCLASS (MTVSRD $A), VSRC), + (COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>; + def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), + (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), + (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC), 0), + (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC), + (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC), 0))>; + def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), + (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>; + } + + let Predicates = [IsLittleEndian, HasDirectMove, NoP9Vector] in { + // Little endian integer vectors using direct moves. + def : Pat<(v2i64 (build_vector i64:$A, i64:$B)), + (v2i64 (XXPERMDI + (COPY_TO_REGCLASS (MTVSRD $B), VSRC), + (COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>; + def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), + (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC), + (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC), 0), + (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC), + (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 0))>; + def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), + (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>; + } + + let Predicates = [HasP9Vector] in { + // Endianness-neutral patterns for const splats with ISA 3.0 instructions. + def : Pat<(v4i32 (scalar_to_vector i32:$A)), + (v4i32 (MTVSRWS $A))>; + def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)), + (v4i32 (MTVSRWS $A))>; + def : Pat<(v16i8 (build_vector immSExt8:$A, immSExt8:$A, immSExt8:$A, + immSExt8:$A, immSExt8:$A, immSExt8:$A, + immSExt8:$A, immSExt8:$A, immSExt8:$A, + immSExt8:$A, immSExt8:$A, immSExt8:$A, + immSExt8:$A, immSExt8:$A, immSExt8:$A, + immSExt8:$A)), + (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>; + def : Pat<(v16i8 immAllOnesV), + (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; + def : Pat<(v8i16 immAllOnesV), + (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>; + def : Pat<(v4i32 immAllOnesV), + (v4i32 (XXSPLTIB 255))>; + def : Pat<(v2i64 immAllOnesV), + (v2i64 (XXSPLTIB 255))>; + def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)), + (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>; + def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), + (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; + def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)), + (v4i32 (XXSPLTW (COPY_TO_REGCLASS + (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)), + (v4i32 (XXSPLTW (COPY_TO_REGCLASS + (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)), + (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS + (DFLOADf32 iaddr:$A), + VSFRC)), 0))>; + def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)), + (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS + (DFLOADf32 iaddr:$A), + VSFRC)), 0))>; + } + + let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in { + def : Pat<(i64 (extractelt v2i64:$A, 1)), + (i64 (MFVSRLD $A))>; + // Better way to build integer vectors if we have MTVSRDD. Big endian. + def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)), + (v2i64 (MTVSRDD $rB, $rA))>; + def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), + (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC), + (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC))>; + } + + let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in { + def : Pat<(i64 (extractelt v2i64:$A, 0)), + (i64 (MFVSRLD $A))>; + // Better way to build integer vectors if we have MTVSRDD. Little endian. + def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)), + (v2i64 (MTVSRDD $rB, $rA))>; + def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)), + (VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC), + (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC))>; + } } diff --git a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll index 979dd4b5d02a..1f317992a3b7 100644 --- a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll +++ b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll @@ -46,10 +46,10 @@ entry: %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ret <4 x i32> %splat.splat -; CHECK: sldi [[REG1:[0-9]+]], 3, 32 -; CHECK: mtvsrd {{[0-9]+}}, [[REG1]] -; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3 -; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]] +; CHECK: mtvsrwz [[REG1:[0-9]+]], 3 +; CHECK: xxspltw 34, [[REG1]] +; CHECK-LE: mtvsrwz [[REG1:[0-9]+]], 3 +; CHECK-LE: xxspltw 34, [[REG1]] } ; Function Attrs: nounwind diff --git a/llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll b/llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll index fa4f0320c928..68995de702c2 100644 --- a/llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll +++ b/llvm/test/CodeGen/PowerPC/power9-moves-and-splats.ll @@ -10,15 +10,9 @@ entry: ; The FIXME below is due to the lowering for BUILD_VECTOR needing a re-vamp ; which will happen in a subsequent patch. ; CHECK-LABEL: test1 -; FIXME: mtvsrdd 34, 4, 3 -; CHECK: mtvsrd {{[0-9]+}}, 3 -; CHECK: mtvsrd {{[0-9]+}}, 4 -; CHECK: xxmrgld +; CHECK: mtvsrdd 34, 4, 3 ; CHECK-BE-LABEL: test1 -; FIXME-BE: mtvsrdd 34, 3, 4 -; CHECK-BE: mtvsrd {{[0-9]+}}, 4 -; CHECK-BE: mtvsrd {{[0-9]+}}, 3 -; CHECK-BE: xxmrghd +; CHECK-BE: mtvsrdd 34, 3, 4 %vecins = insertelement <2 x i64> undef, i64 %a, i32 0 %vecins1 = insertelement <2 x i64> %vecins, i64 %b, i32 1 ret <2 x i64> %vecins1 @@ -162,10 +156,14 @@ define <4 x i32> @test14(<4 x i32> %a, i32* nocapture readonly %b) { entry: ; CHECK-LABEL: test14 ; CHECK: lwz [[LD:[0-9]+]], -; CHECK: mtvsrws 34, [[LD]] +; FIXME: mtvsrws 34, [[LD]] +; CHECK: mtvsrws [[SPLT:[0-9]+]], [[LD]] +; CHECK: xxspltw 34, [[SPLT]], 3 ; CHECK-BE-LABEL: test14 ; CHECK-BE: lwz [[LD:[0-9]+]], -; CHECK-BE: mtvsrws 34, [[LD]] +; FIXME: mtvsrws 34, [[LD]] +; CHECK-BE: mtvsrws [[SPLT:[0-9]+]], [[LD]] +; CHECK-BE: xxspltw 34, [[SPLT]], 0 %0 = load i32, i32* %b, align 4 %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0 %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll index b15dc4bede11..f1a165c33581 100644 --- a/llvm/test/CodeGen/PowerPC/vsx.ll +++ b/llvm/test/CodeGen/PowerPC/vsx.ll @@ -1096,9 +1096,7 @@ define <2 x double> @test69(<2 x i16> %a) { ; CHECK-LE: mtvsrwa ; CHECK-LE: xscvsxddp ; CHECK-LE: xscvsxddp -; CHECK-LE: xxspltd -; CHECK-LE: xxspltd -; CHECK-LE: xxmrgld +; CHECK-LE: xxmrghd ; CHECK-LE: blr } @@ -1121,9 +1119,7 @@ define <2 x double> @test70(<2 x i8> %a) { ; CHECK-LE: mtvsrwa ; CHECK-LE: xscvsxddp ; CHECK-LE: xscvsxddp -; CHECK-LE: xxspltd -; CHECK-LE: xxspltd -; CHECK-LE: xxmrgld +; CHECK-LE: xxmrghd ; CHECK-LE: blr }