From 1d1cf30b738b88a58919221cc419109ba9519a61 Mon Sep 17 00:00:00 2001
From: Justin Hibbits
Date: Mon, 17 Jun 2019 03:15:23 +0000
Subject: [PATCH] PowerPC: Optimize SPE double parameter calling setup

Summary:
SPE passes doubles the same as soft-float, in register pairs as i32 types.
This is all handled by the target-independent layer. However, this is not
optimal when splitting or reforming the doubles, as it stores to the stack
and loads back from it, on either side.

For instance, to pass a double argument to a function, assuming the double
value is in r5, the sequence currently looks like this:

    evstdd 5, X(1)
    lwz 3, X(1)
    lwz 4, X+4(1)

Likewise, to form a double into r5 from args in r3 and r4:

    stw 3, X(1)
    stw 4, X+4(1)
    evldd 5, X(1)

This optimizes the fence to use SPE instructions. Now, to pass a double to a
function:

    mr 4, 5
    evmergehi 3, 5, 5

And to form a double into r5 from args in r3 and r4:

    evmergelo 5, 3, 4

This is comparable to the way that gcc generates the double splits.

This also fixes a bug with expanding builtins to libcalls, where the
LowerCallTo() code path was generating intermediate illegal type nodes.

Reviewers: nemanjai, hfinkel, joerg Subscribers: kbarton, jfb, jsji, llvm-commits Differential Revision: https://reviews.llvm.org/D54583 llvm-svn: 363526 --- llvm/lib/Target/PowerPC/PPCCallingConv.cpp | 54 +++++++++ llvm/lib/Target/PowerPC/PPCCallingConv.td | 7 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 119 +++++++++++++------- llvm/lib/Target/PowerPC/PPCISelLowering.h | 17 +-- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 12 ++ llvm/lib/Target/PowerPC/PPCInstrSPE.td | 12 +- llvm/test/CodeGen/PowerPC/spe.ll | 8 +- 7 files changed, 175 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.cpp b/llvm/lib/Target/PowerPC/PPCCallingConv.cpp index ecf1872e8fcb..77cdf5c939dc 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.cpp +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.cpp @@ -105,4 +105,58 @@ static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT, return false; } +// Split F64 arguments into two 32-bit consecutive registers. +static bool CC_PPC32_SPE_CustomSplitFP64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + static const MCPhysReg HiRegList[] = { PPC::R3, PPC::R5, PPC::R7, PPC::R9 }; + static const MCPhysReg LoRegList[] = { PPC::R4, PPC::R6, PPC::R8, PPC::R10 }; + + // Try to get the first register. 
+ unsigned Reg = State.AllocateReg(HiRegList); + if (!Reg) + return false; + + unsigned i; + for (i = 0; i < sizeof(HiRegList) / sizeof(HiRegList[0]); ++i) + if (HiRegList[i] == Reg) + break; + + unsigned T = State.AllocateReg(LoRegList[i]); + (void)T; + assert(T == LoRegList[i] && "Could not allocate register"); + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + +// Same as above, but for return values, so only allocate for R3 and R4 +static bool CC_PPC32_SPE_RetF64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + static const MCPhysReg HiRegList[] = { PPC::R3 }; + static const MCPhysReg LoRegList[] = { PPC::R4 }; + + // Try to get the first register. + unsigned Reg = State.AllocateReg(HiRegList, LoRegList); + if (!Reg) + return false; + + unsigned i; + for (i = 0; i < sizeof(HiRegList) / sizeof(HiRegList[0]); ++i) + if (HiRegList[i] == Reg) + break; + + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i], + LocVT, LocInfo)); + return true; +} + #include "PPCGenCallingConv.inc" diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td index de8b2b0986b0..ee367214dd7a 100644 --- a/llvm/lib/Target/PowerPC/PPCCallingConv.td +++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td @@ -91,7 +91,7 @@ def RetCC_PPC : CallingConv<[ CCIfSubtarget<"hasSPE()", CCIfType<[f32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>, CCIfSubtarget<"hasSPE()", - CCIfType<[f64], CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>, + CCIfType<[f64], CCCustom<"CC_PPC32_SPE_RetF64">>>, // For P9, f128 are passed in vector registers. 
CCIfType<[f128], @@ -182,6 +182,9 @@ def CC_PPC32_SVR4_Common : CallingConv<[ CCIfType<[i32], CCIfSplit>>>, + CCIfType<[f64], + CCIfSubtarget<"hasSPE()", + CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>, CCIfSplit>>>, @@ -202,7 +205,7 @@ def CC_PPC32_SVR4_Common : CallingConv<[ CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>, CCIfType<[f64], CCIfSubtarget<"hasSPE()", - CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>, + CCCustom<"CC_PPC32_SPE_CustomSplitFP64">>>, CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>, diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 9f7914a3b8ef..ddeec794952f 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1269,22 +1269,6 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, return Align; } -unsigned PPCTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const { - if (Subtarget.hasSPE() && VT == MVT::f64) - return 2; - return PPCTargetLowering::getNumRegisters(Context, VT); -} - -MVT PPCTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const { - if (Subtarget.hasSPE() && VT == MVT::f64) - return MVT::i32; - return PPCTargetLowering::getRegisterType(Context, VT); -} - bool PPCTargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } @@ -1402,6 +1386,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::QBFLT: return "PPCISD::QBFLT"; case PPCISD::QVLFSb: return "PPCISD::QVLFSb"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; + case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; + case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; case PPCISD::FP_EXTEND_LH: return "PPCISD::FP_EXTEND_LH"; @@ 
-3427,7 +3413,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( // Reserve space for the linkage area on the stack. unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); CCInfo.AllocateStack(LinkageSize, PtrByteSize); - if (useSoftFloat() || hasSPE()) + if (useSoftFloat()) CCInfo.PreAnalyzeFormalArguments(Ins); CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4); @@ -3460,7 +3446,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( if (Subtarget.hasVSX()) RC = &PPC::VSFRCRegClass; else if (Subtarget.hasSPE()) - RC = &PPC::SPERCRegClass; + // SPE passes doubles in GPR pairs. + RC = &PPC::GPRCRegClass; else RC = &PPC::F8RCRegClass; break; @@ -3484,13 +3471,26 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4( break; } - // Transform the arguments stored in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, - ValVT == MVT::i1 ? MVT::i32 : ValVT); - - if (ValVT == MVT::i1) - ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); + SDValue ArgValue; + // Transform the arguments stored in physical registers into + // virtual ones. + if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) { + assert(i + 1 < e && "No second half of double precision argument"); + unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC); + unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC); + SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32); + SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32); + if (!Subtarget.isLittleEndian()) + std::swap (ArgValueLo, ArgValueHi); + ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo, + ArgValueHi); + } else { + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, + ValVT == MVT::i1 ? 
MVT::i32 : ValVT); + if (ValVT == MVT::i1) + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue); + } InVals.push_back(ArgValue); } else { @@ -5135,10 +5135,27 @@ SDValue PPCTargetLowering::LowerCallResult( CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Val = DAG.getCopyFromReg(Chain, dl, - VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); + SDValue Val; + + if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) { + SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Lo.getValue(1); + InFlag = Lo.getValue(2); + VA = RVLocs[++i]; // skip ahead to next loc + SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, + InFlag); + Chain = Hi.getValue(1); + InFlag = Hi.getValue(2); + if (!Subtarget.isLittleEndian()) + std::swap (Lo, Hi); + Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi); + } else { + Val = DAG.getCopyFromReg(Chain, dl, + VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); @@ -5459,12 +5476,15 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( bool seenFloatArg = false; // Walk the register/memloc assignments, inserting copies/loads. 
- for (unsigned i = 0, j = 0, e = ArgLocs.size(); + // i - Tracks the index into the list of registers allocated for the call + // RealArgIdx - Tracks the index into the list of actual function arguments + // j - Tracks the index into the list of byval arguments + for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size(); i != e; - ++i) { + ++i, ++RealArgIdx) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = OutVals[i]; - ISD::ArgFlagsTy Flags = Outs[i].Flags; + SDValue Arg = OutVals[RealArgIdx]; + ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags; if (Flags.isByVal()) { // Argument is an aggregate which is passed by value, thus we need to @@ -5513,7 +5533,17 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( if (VA.isRegLoc()) { seenFloatArg |= VA.getLocVT().isFloatingPoint(); // Put argument in a physical register. - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) { + bool IsLE = Subtarget.isLittleEndian(); + SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(IsLE ? 0 : 1, dl)); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0))); + SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(IsLE ? 1 : 0, dl)); + RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(), + SVal.getValue(0))); + } else + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { // Put argument in the parameter list area of the current stack frame. assert(VA.isMemLoc()); @@ -6781,11 +6811,11 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector RetOps(1, Chain); // Copy the result values into the output registers. 
- for (unsigned i = 0; i != RVLocs.size(); ++i) { + for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = OutVals[i]; + SDValue Arg = OutVals[RealResIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); @@ -6800,8 +6830,21 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; } - - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); + if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) { + bool isLittleEndian = Subtarget.isLittleEndian(); + // Legalize ret f64 -> ret 2 x i32. + SDValue SVal = + DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl)); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, + DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl)); + Flag = Chain.getValue(1); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag); + } else + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index f443a87311dc..2c0ac80d95bc 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -194,6 +194,15 @@ namespace llvm { /// Direct move of 2 consecutive GPR to a VSX register. BUILD_FP128, + /// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and + /// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is + /// unsupported for this target. + /// Merge 2 GPRs to a single SPE register. 
+ BUILD_SPE64, + + /// Extract SPE register component, second argument is high or low. + EXTRACT_SPE, + /// Extract a subvector from signed integer vector and convert to FP. /// It is primarily used to convert a (widened) illegal integer vector /// type to a legal floating point vector type. @@ -908,14 +917,6 @@ namespace llvm { unsigned JTI, MCContext &Ctx) const override; - unsigned getNumRegistersForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const override; - - MVT getRegisterTypeForCallingConv(LLVMContext &Context, - CallingConv:: ID CC, - EVT VT) const override; - private: struct ReuseLoadInfo { SDValue Ptr; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 6b6787f35216..5f9805938165 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -230,6 +230,18 @@ def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128", SDTCisSameAs<1,2>]>, []>; +def PPCbuild_spe64: SDNode<"PPCISD::BUILD_SPE64", + SDTypeProfile<1, 2, + [SDTCisVT<0, f64>, SDTCisVT<1,i32>, + SDTCisVT<2,i32>]>, + []>; + +def PPCextract_spe : SDNode<"PPCISD::EXTRACT_SPE", + SDTypeProfile<1, 2, + [SDTCisVT<0, i32>, SDTCisVT<1, f64>, + SDTCisPtrTy<2>]>, + []>; + // These are target-independent nodes, but have target-specific formats. 
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/llvm/lib/Target/PowerPC/PPCInstrSPE.td index 31093de223fa..935c3044ae47 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrSPE.td +++ b/llvm/lib/Target/PowerPC/PPCInstrSPE.td @@ -511,7 +511,7 @@ def EVLWWSPLATX : EVXForm_1<792, (outs sperc:$RT), (ins memrr:$src), def EVMERGEHI : EVXForm_1<556, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB), "evmergehi $RT, $RA, $RB", IIC_VecGeneral, []>; -def EVMERGELO : EVXForm_1<557, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB), +def EVMERGELO : EVXForm_1<557, (outs sperc:$RT), (ins gprc:$RA, gprc:$RB), "evmergelo $RT, $RA, $RB", IIC_VecGeneral, []>; def EVMERGEHILO : EVXForm_1<558, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB), "evmergehilo $RT, $RA, $RB", IIC_VecGeneral, []>; @@ -886,4 +886,14 @@ def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)), (SELECT_SPE (CRANDC $lhs, $rhs), $tval, $fval)>; def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)), (SELECT_SPE (CRXOR $lhs, $rhs), $tval, $fval)>; + + +def : Pat<(f64 (PPCbuild_spe64 i32:$rB, i32:$rA)), + (f64 (COPY_TO_REGCLASS (EVMERGELO $rA, $rB), SPERC))>; + +def : Pat<(i32 (PPCextract_spe f64:$rA, 1)), + (i32 (EXTRACT_SUBREG (EVMERGEHI $rA, $rA), sub_32))>; +def : Pat<(i32 (PPCextract_spe f64:$rA, 0)), + (i32 (EXTRACT_SUBREG $rA, sub_32))>; + } diff --git a/llvm/test/CodeGen/PowerPC/spe.ll b/llvm/test/CodeGen/PowerPC/spe.ll index 8603f45dabb4..06915ceb2dba 100644 --- a/llvm/test/CodeGen/PowerPC/spe.ll +++ b/llvm/test/CodeGen/PowerPC/spe.ll @@ -472,10 +472,8 @@ entry: ; CHECK-LABEL: test_dselect ; CHECK: andi. 
; CHECK: bc -; CHECK: evldd -; CHECK: b -; CHECK: evldd -; CHECK: evstdd +; CHECK: evor +; CHECK: evmergehi ; CHECK: blr } @@ -519,7 +517,7 @@ entry: %1 = call i32 asm sideeffect "efdctsi $0, $1", "=d,d"(double %0) ret i32 %1 ; CHECK-LABEL: test_dasmconst -; CHECK: evldd +; CHECK: evmergelo ; CHECK: #APP ; CHECK: efdctsi ; CHECK: #NO_APP