From 5840f7197d058371c01fd7ac9ad932a3ab2ced6a Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Mon, 8 Nov 2021 13:41:46 -0600 Subject: [PATCH] [PowerPC] Respect rounding mode in the back end Currently, the floating point instructions that depend on rounding mode are correctly marked in the PPC back end with an implicit use of the RM register. Similarly, instructions that explicitly define the register are marked with an implicit def of the same register. So for the most part, RM-using code won't be moved across RM-setting instructions. However, calls are not marked as RM-setting instructions so code can be moved across calls. This is generally desired, but so is the ability to turn off this behaviour with an appropriate option - and -frounding-math really should be that option. This patch provides a set of call instructions (for direct and indirect calls) that are marked with an implicit def of the RM register. These will be used for calls that are marked with the strictfp attribute. 
Differential revision: https://reviews.llvm.org/D111433 --- llvm/lib/Target/PowerPC/P10InstrResources.td | 6 +- llvm/lib/Target/PowerPC/P9InstrResources.td | 10 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 77 +++++++---- llvm/lib/Target/PowerPC/PPCISelLowering.h | 8 ++ llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 63 +++++++++ llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 9 +- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 61 +++++++++ llvm/lib/Target/PowerPC/PPCInstrVSX.td | 25 ++-- .../PowerPC/cse-despite-rounding-mode.ll | 127 +++++++++++++++++ .../CodeGen/PowerPC/respect-rounding-mode.ll | 128 ++++++++++++++++++ .../vector-constrained-fp-intrinsics.ll | 4 +- 11 files changed, 474 insertions(+), 44 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll create mode 100644 llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td index 5c040f31db02..f43ba00ec373 100644 --- a/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -312,7 +312,7 @@ def : InstRW<[P10W_BR_2C, P10W_DISP_ANY], (instrs BCLR, BCLRn, BDNZLR, BDNZLR8, BDNZLRm, BDNZLRp, BDZLR, BDZLR8, BDZLRm, BDZLRp, gBCLR, BCLRL, BCLRLn, BDNZLRL, BDNZLRLm, BDNZLRLp, BDZLRL, BDZLRLm, BDZLRLp, gBCLRL, - BL, BL8, BL8_NOP, BL8_NOP_TLS, BL8_NOTOC, BL8_NOTOC_TLS, BL8_TLS, BL8_TLS_, BLR, BLR8, BLRL, BL_NOP, BL_TLS + BL, BL8, BL8_NOP, BL8_NOP_RM, BL8_NOP_TLS, BL8_NOTOC, BL8_NOTOC_RM, BL8_NOTOC_TLS, BL8_RM, BL8_TLS, BL8_TLS_, BLR, BLR8, BLRL, BL_NOP, BL_NOP_RM, BL_RM, BL_TLS )>; // 2 Cycles Branch operations, 1 input operands @@ -320,9 +320,9 @@ def : InstRW<[P10W_BR_2C, P10W_DISP_ANY, P10BR_Read], (instrs B, BCC, BCCA, BCCCTR, BCCCTR8, BCCCTRL, BCCCTRL8, BCCL, BCCLA, BCCLR, BCCLRL, CTRL_DEP, TAILB, TAILB8, BA, TAILBA, TAILBA8, - BC, BCTR, BCTR8, BCTRL, BCTRL8, BCTRL8_LDinto_toc, BCTRL_LWZinto_toc, BCn, BDNZ, BDNZ8, BDNZm, BDNZp, BDZ, BDZ8, BDZm, BDZp, 
TAILBCTR, TAILBCTR8, gBC, gBCat, + BC, BCTR, BCTR8, BCTRL, BCTRL8, BCTRL8_LDinto_toc, BCTRL8_LDinto_toc_RM, BCTRL8_RM, BCTRL_LWZinto_toc, BCTRL_LWZinto_toc_RM, BCTRL_RM, BCn, BDNZ, BDNZ8, BDNZm, BDNZp, BDZ, BDZ8, BDZm, BDZp, TAILBCTR, TAILBCTR8, gBC, gBCat, BCL, BCLalways, BCLn, BDNZL, BDNZLm, BDNZLp, BDZL, BDZLm, BDZLp, gBCL, gBCLat, - BLA, BLA8, BLA8_NOP + BLA, BLA8, BLA8_NOP, BLA8_NOP_RM, BLA8_RM, BLA_RM )>; // 2 Cycles Branch operations, 3 input operands diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td index fe7487ad3d00..c4f4a2b3d796 100644 --- a/llvm/lib/Target/PowerPC/P9InstrResources.td +++ b/llvm/lib/Target/PowerPC/P9InstrResources.td @@ -1302,15 +1302,15 @@ def : InstRW<[P9_BR_2C, DISP_BR_1C], (instregex "BCCTR(L)?(8)?(n)?$"), (instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"), (instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"), - (instregex "BL(_TLS|_NOP)?$"), - (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"), - (instregex "BLA(8|8_NOP)?$"), + (instregex "BL(_TLS|_NOP)?(_RM)?$"), + (instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?(_RM)?$"), + (instregex "BLA(8|8_NOP)?(_RM)?$"), (instregex "BLR(8|L)?$"), (instregex "TAILB(A)?(8)?$"), (instregex "TAILBCTR(8)?$"), (instregex "gBC(A|Aat|CTR|CTRL|L|LA|LAat|LR|LRL|Lat|at)?$"), (instregex "BCLR(L)?(n)?$"), - (instregex "BCTR(L)?(8)?$"), + (instregex "BCTR(L)?(8)?(_RM)?$"), B, BA, BC, @@ -1321,6 +1321,8 @@ def : InstRW<[P9_BR_2C, DISP_BR_1C], BCLn, BCTRL8_LDinto_toc, BCTRL_LWZinto_toc, + BCTRL8_LDinto_toc_RM, + BCTRL_LWZinto_toc_RM, BCn, CTRL_DEP )>; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index d8333023a0b5..a7b2fecf2eae 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1630,9 +1630,19 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::CALL: return "PPCISD::CALL"; case PPCISD::CALL_NOP: return 
"PPCISD::CALL_NOP"; case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC"; + case PPCISD::CALL_RM: + return "PPCISD::CALL_RM"; + case PPCISD::CALL_NOP_RM: + return "PPCISD::CALL_NOP_RM"; + case PPCISD::CALL_NOTOC_RM: + return "PPCISD::CALL_NOTOC_RM"; case PPCISD::MTCTR: return "PPCISD::MTCTR"; case PPCISD::BCTRL: return "PPCISD::BCTRL"; case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; + case PPCISD::BCTRL_RM: + return "PPCISD::BCTRL_RM"; + case PPCISD::BCTRL_LOAD_TOC_RM: + return "PPCISD::BCTRL_LOAD_TOC_RM"; case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG"; case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; @@ -5172,13 +5182,14 @@ static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) { } static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags, - const Function &Caller, - const SDValue &Callee, + const Function &Caller, const SDValue &Callee, const PPCSubtarget &Subtarget, - const TargetMachine &TM) { + const TargetMachine &TM, + bool IsStrictFPCall = false) { if (CFlags.IsTailCall) return PPCISD::TC_RETURN; + unsigned RetOpc = 0; // This is a call through a function pointer. if (CFlags.IsIndirect) { // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer accross @@ -5189,28 +5200,46 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags, // immediately followed by a load of the TOC pointer from the the stack save // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC // as it is not saved or used. - return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC - : PPCISD::BCTRL; - } - - if (Subtarget.isUsingPCRelativeCalls()) { + RetOpc = isTOCSaveRestoreRequired(Subtarget) ? 
PPCISD::BCTRL_LOAD_TOC + : PPCISD::BCTRL; + } else if (Subtarget.isUsingPCRelativeCalls()) { assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI."); - return PPCISD::CALL_NOTOC; + RetOpc = PPCISD::CALL_NOTOC; + } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI()) + // The ABIs that maintain a TOC pointer across calls need to have a nop + // immediately following the call instruction if the caller and callee may + // have different TOC bases. At link time if the linker determines the calls + // may not share a TOC base, the call is redirected to a trampoline inserted + // by the linker. The trampoline will (among other things) save the callers + // TOC pointer at an ABI designated offset in the linkage area and the + // linker will rewrite the nop to be a load of the TOC pointer from the + // linkage area into gpr2. + RetOpc = callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL + : PPCISD::CALL_NOP; + else + RetOpc = PPCISD::CALL; + if (IsStrictFPCall) { + switch (RetOpc) { + default: + llvm_unreachable("Unknown call opcode"); + case PPCISD::BCTRL_LOAD_TOC: + RetOpc = PPCISD::BCTRL_LOAD_TOC_RM; + break; + case PPCISD::BCTRL: + RetOpc = PPCISD::BCTRL_RM; + break; + case PPCISD::CALL_NOTOC: + RetOpc = PPCISD::CALL_NOTOC_RM; + break; + case PPCISD::CALL: + RetOpc = PPCISD::CALL_RM; + break; + case PPCISD::CALL_NOP: + RetOpc = PPCISD::CALL_NOP_RM; + break; + } } - - // The ABIs that maintain a TOC pointer accross calls need to have a nop - // immediately following the call instruction if the caller and callee may - // have different TOC bases. At link time if the linker determines the calls - // may not share a TOC base, the call is redirected to a trampoline inserted - // by the linker. The trampoline will (among other things) save the callers - // TOC pointer at an ABI designated offset in the linkage area and the linker - // will rewrite the nop to be a load of the TOC pointer from the linkage area - // into gpr2. 
- if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI()) - return callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL - : PPCISD::CALL_NOP; - - return PPCISD::CALL; + return RetOpc; } static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG, @@ -5506,7 +5535,7 @@ SDValue PPCTargetLowering::FinishCall( unsigned CallOpc = getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee, - Subtarget, DAG.getTarget()); + Subtarget, DAG.getTarget(), CB ? CB->isStrictFP() : false); if (!CFlags.IsIndirect) Callee = transformCallee(Callee, DAG, dl, Subtarget); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 450bc48ec143..34dce2c3172d 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -200,6 +200,14 @@ namespace llvm { /// and 64-bit AIX. BCTRL_LOAD_TOC, + /// The variants that implicitly define rounding mode for calls with + /// strictfp semantics. + CALL_RM, + CALL_NOP_RM, + CALL_NOTOC_RM, + BCTRL_RM, + BCTRL_LOAD_TOC_RM, + /// Return with a flag operand, matched by 'blr' RET_FLAG, diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 9d9e0e907382..e4a6a0e9d3f0 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -178,6 +178,39 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8], hasSideEffects = 0 in { } } +let isCall = 1, PPC970_Unit = 7, Defs = [LR8, RM], hasSideEffects = 0, + isCodeGenOnly = 1, Uses = [RM] in { + // Convenient aliases for call instructions + def BL8_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func), + "bl $func", IIC_BrB, []>; // See Pat patterns below. 
+ + def BLA8_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func), + "bla $func", IIC_BrB, [(PPCcall_rm (i64 imm:$func))]>; + def BL8_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func), + "bl $func\n\tnop", IIC_BrB, []>; + + def BLA8_NOP_RM : IForm_and_DForm_4_zero<18, 1, 1, 24, + (outs), (ins abscalltarget:$func), + "bla $func\n\tnop", IIC_BrB, + [(PPCcall_nop_rm (i64 imm:$func))]>; + let Predicates = [PCRelativeMemops] in { + // BL8_NOTOC means that the caller does not use the TOC pointer and if + // it does use R2 then it is just a caller saved register. Therefore it is + // safe to emit only the bl and not the nop for this instruction. The + // linker will not try to restore R2 after the call. + def BL8_NOTOC_RM : IForm<18, 0, 1, (outs), + (ins calltarget:$func), + "bl $func", IIC_BrB, []>; + } + let Uses = [CTR8, RM] in { + let isPredicable = 1 in + def BCTRL8_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl_rm)]>, + Requires<[In64BitMode]>; + } +} + let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in { def BCTRL8_LDinto_toc : @@ -188,6 +221,16 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, Requires<[In64BitMode]>; } +let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR8, X2, RM], Uses = [CTR8, RM], RST = 2 in { + def BCTRL8_LDinto_toc_RM : + XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs), + (ins memrix:$src), + "bctrl\n\tld 2, $src", IIC_BrB, + [(PPCbctrl_load_toc_rm iaddrX4:$src)]>, + Requires<[In64BitMode]>; +} + } // Interpretation64Bit // FIXME: Duplicating this for the asm parser should be unnecessary, but the @@ -214,12 +257,32 @@ def : Pat<(PPCcall_notoc (i64 tglobaladdr:$dst)), def : Pat<(PPCcall_notoc (i64 texternalsym:$dst)), (BL8_NOTOC texternalsym:$dst)>; +def : Pat<(PPCcall_rm (i64 tglobaladdr:$dst)), + (BL8_RM tglobaladdr:$dst)>; +def : Pat<(PPCcall_nop_rm (i64 tglobaladdr:$dst)), + (BL8_NOP_RM 
tglobaladdr:$dst)>; + +def : Pat<(PPCcall_rm (i64 texternalsym:$dst)), + (BL8_RM texternalsym:$dst)>; +def : Pat<(PPCcall_nop_rm (i64 texternalsym:$dst)), + (BL8_NOP_RM texternalsym:$dst)>; + +def : Pat<(PPCcall_notoc_rm (i64 tglobaladdr:$dst)), + (BL8_NOTOC_RM tglobaladdr:$dst)>; +def : Pat<(PPCcall_notoc_rm (i64 texternalsym:$dst)), + (BL8_NOTOC_RM texternalsym:$dst)>; + // Calls for AIX def : Pat<(PPCcall (i64 mcsym:$dst)), (BL8 mcsym:$dst)>; def : Pat<(PPCcall_nop (i64 mcsym:$dst)), (BL8_NOP mcsym:$dst)>; +def : Pat<(PPCcall_rm (i64 mcsym:$dst)), + (BL8_RM mcsym:$dst)>; +def : Pat<(PPCcall_nop_rm (i64 mcsym:$dst)), + (BL8_NOP_RM mcsym:$dst)>; + // Atomic operations // FIXME: some of these might be used with constant operands. This will result // in constant materialization instructions that may be redundant. We currently diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index b61933c99106..649a150866b4 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -2246,11 +2246,13 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, return true; } else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || OpC == PPC::BCTRL || - OpC == PPC::BCTRL8) { + OpC == PPC::BCTRL8 || OpC == PPC::BCTRL_RM || + OpC == PPC::BCTRL8_RM) { if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR) llvm_unreachable("Cannot predicate bctr[l] on the ctr register"); - bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8; + bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8 || + OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM; bool isPPC64 = Subtarget.isPPC64(); if (Pred[0].getImm() == PPC::PRED_BIT_SET) { @@ -2274,6 +2276,9 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI, MachineInstrBuilder(*MI.getParent()->getParent(), MI) .addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::Implicit) .addReg(isPPC64 ? 
PPC::LR8 : PPC::LR, RegState::ImplicitDefine); + if (OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM) + MachineInstrBuilder(*MI.getParent()->getParent(), MI) + .addReg(PPC::RM, RegState::ImplicitDefine); return true; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 3aaf5c389c8b..a65d0af09456 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -316,6 +316,24 @@ def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +// Call nodes for strictfp calls (that define RM). +def PPCcall_rm : SDNode<"PPCISD::CALL_RM", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCcall_nop_rm : SDNode<"PPCISD::CALL_NOP_RM", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCcall_notoc_rm : SDNode<"PPCISD::CALL_NOTOC_RM", SDT_PPCCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCbctrl_rm : SDNode<"PPCISD::BCTRL_RM", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def PPCbctrl_load_toc_rm : SDNode<"PPCISD::BCTRL_LOAD_TOC_RM", + SDTypeProfile<0, 1, []>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; @@ -1892,6 +1910,26 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { } } +let isCall = 1, PPC970_Unit = 7, Defs = [LR, RM], isCodeGenOnly = 1 in { + // Convenient aliases for call instructions + let Uses = [RM] in { + def BL_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func), + "bl $func", IIC_BrB, []>; // See Pat patterns below. 
+ def BLA_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func), + "bla $func", IIC_BrB, [(PPCcall_rm (i32 imm:$func))]>; + + def BL_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func), + "bl $func\n\tnop", IIC_BrB, []>; + } + let Uses = [CTR, RM] in { + let isPredicable = 1 in + def BCTRL_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), + "bctrl", IIC_BrB, [(PPCbctrl_rm)]>, + Requires<[In32BitMode]>; + } +} + let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in def TCRETURNdi :PPCEmitTimePseudo< (outs), (ins calltarget:$dst, i32imm:$offset), @@ -1918,6 +1956,14 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, } +let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, + Defs = [LR, R2, RM], Uses = [CTR, RM], RST = 2 in { + def BCTRL_LWZinto_toc_RM: + XLForm_2_ext_and_DForm_1<19, 528, 20, 0, 1, 32, (outs), + (ins memri:$src), "bctrl\n\tlwz 2, $src", IIC_BrB, + [(PPCbctrl_load_toc_rm iaddr:$src)]>, Requires<[In32BitMode]>; + +} let isCodeGenOnly = 1, hasSideEffects = 0 in { @@ -3435,6 +3481,12 @@ def : Pat<(PPCcall (i32 tglobaladdr:$dst)), def : Pat<(PPCcall (i32 texternalsym:$dst)), (BL texternalsym:$dst)>; +def : Pat<(PPCcall_rm (i32 tglobaladdr:$dst)), + (BL_RM tglobaladdr:$dst)>; + +def : Pat<(PPCcall_rm (i32 texternalsym:$dst)), + (BL_RM texternalsym:$dst)>; + // Calls for AIX only def : Pat<(PPCcall (i32 mcsym:$dst)), (BL mcsym:$dst)>; @@ -3445,6 +3497,15 @@ def : Pat<(PPCcall_nop (i32 mcsym:$dst)), def : Pat<(PPCcall_nop (i32 texternalsym:$dst)), (BL_NOP texternalsym:$dst)>; +def : Pat<(PPCcall_rm (i32 mcsym:$dst)), + (BL_RM mcsym:$dst)>; + +def : Pat<(PPCcall_nop_rm (i32 mcsym:$dst)), + (BL_NOP_RM mcsym:$dst)>; + +def : Pat<(PPCcall_nop_rm (i32 texternalsym:$dst)), + (BL_NOP_RM texternalsym:$dst)>; + def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm), (TCRETURNdi tglobaladdr:$dst, imm:$imm)>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 
506c7fa1bfd0..d2d5ca92ca1c 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -906,16 +906,13 @@ let hasSideEffects = 0 in { // Rounding Instructions respecting current rounding mode def XSRDPIC : XX2Form<60, 107, (outs vsfrc:$XT), (ins vsfrc:$XB), - "xsrdpic $XT, $XB", IIC_VecFP, - [(set f64:$XT, (fnearbyint f64:$XB))]>; + "xsrdpic $XT, $XB", IIC_VecFP, []>; def XVRDPIC : XX2Form<60, 235, (outs vsrc:$XT), (ins vsrc:$XB), - "xvrdpic $XT, $XB", IIC_VecFP, - [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>; + "xvrdpic $XT, $XB", IIC_VecFP, []>; def XVRSPIC : XX2Form<60, 171, (outs vsrc:$XT), (ins vsrc:$XB), - "xvrspic $XT, $XB", IIC_VecFP, - [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>; + "xvrspic $XT, $XB", IIC_VecFP, []>; // Max/Min Instructions let isCommutable = 1 in { def XSMAXDP : XX3Form<60, 160, @@ -2783,9 +2780,6 @@ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be ForceXForm:$src)), (LXVD2X ForceXForm:$s def : Pat<(f32 (any_fround f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; -def : Pat<(f32 (fnearbyint f32:$S)), - (f32 (COPY_TO_REGCLASS (XSRDPIC - (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; def : Pat<(f32 (any_ffloor f32:$S)), (f32 (COPY_TO_REGCLASS (XSRDPIM (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; @@ -2804,6 +2798,19 @@ def : Pat<(v4f32 (any_frint v4f32:$S)), (v4f32 (XVRSPIC $S))>; def : Pat<(f64 (any_frint f64:$S)), (f64 (XSRDPIC $S))>; def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>; +// Rounding without exceptions (nearbyint). Due to strange tblgen behaviour, +// these need to be defined after the any_frint versions so ISEL will correctly +// add the chain to the strict versions. 
+def : Pat<(f32 (fnearbyint f32:$S)), + (f32 (COPY_TO_REGCLASS (XSRDPIC + (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>; +def : Pat<(f64 (fnearbyint f64:$S)), + (f64 (XSRDPIC $S))>; +def : Pat<(v2f64 (fnearbyint v2f64:$S)), + (v2f64 (XVRDPIC $S))>; +def : Pat<(v4f32 (fnearbyint v4f32:$S)), + (v4f32 (XVRSPIC $S))>; + // Materialize a zero-vector of long long def : Pat<(v2i64 immAllZerosV), (v2i64 (XXLXORz))>; diff --git a/llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll b/llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll new file mode 100644 index 000000000000..ee449a06a479 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll @@ -0,0 +1,127 @@ +; The non-strictfp version of test/CodeGen/PowerPC/respect-rounding-mode.ll +; Without strictfp, CSE should be free to eliminate the repeated multiply +; and conversion instructions. +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 + +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8 + +define dso_local signext i32 @func1() local_unnamed_addr #0 { +entry: + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %0 = tail 
call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %0, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext3 = extractelement <2 x double> %1, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +declare void @directCall(...) 
local_unnamed_addr + +declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) + +declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) + +declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) + +declare void @exit(i32 signext) local_unnamed_addr + +define dso_local signext i32 @func2() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @getvector1(...) local_unnamed_addr + +declare <2 x double> @getvector2(...) 
local_unnamed_addr + +declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata) + +declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>) + +define dso_local signext i32 @func3() local_unnamed_addr #0 { +entry: + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %1, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + %2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %2() #0 + %3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext4 = extractelement <2 x double> %3, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +define dso_local signext i32 @func4() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, 
<2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + %1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %1() #0 + %mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata) + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll b/llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll new file mode 100644 index 000000000000..9698ed9821be --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/respect-rounding-mode.ll @@ -0,0 +1,128 @@ +; The strictfp version of test/CodeGen/PowerPC/cse-despite-rounding-mode.ll +; With strictfp, the MachineIR optimizations need to assume that a call +; can change the rounding mode and must not move/eliminate the repeated +; multiply/convert instructions in this test. 
+; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 + +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 +; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 +@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8 + +define dso_local signext i32 @func1() local_unnamed_addr #0 { +entry: + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %0 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %0, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext3 = extractelement <2 x double> %1, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata 
!"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +declare void @directCall(...) local_unnamed_addr + +declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) + +declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) + +declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata) + +declare void @exit(i32 signext) local_unnamed_addr + +define dso_local signext i32 @func2() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + tail call void bitcast (void (...)* @directCall to void ()*)() #0 + %mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @getvector1(...) local_unnamed_addr + +declare <2 x double> @getvector2(...) 
local_unnamed_addr + +declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata) + +declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>) + +define dso_local signext i32 @func3() local_unnamed_addr #0 { +entry: + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %1, i32 0 + %sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0 + %2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %2() #0 + %3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> , metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext4 = extractelement <2 x double> %3, i32 1 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @exit(i32 signext 2) #0 + unreachable + +if.end: ; preds = %entry + ret i32 %conv +} + +define dso_local signext i32 @func4() local_unnamed_addr #0 { +entry: + %call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0 + %call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0 + %0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %0() #0 + %mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, 
<2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %vecext = extractelement <2 x double> %mul, i32 0 + %cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0 + br i1 %cmp, label %cleanup, label %if.end + +if.end: ; preds = %entry + %1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8 + tail call void %1() #0 + %mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 + %2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0 + br label %cleanup + +cleanup: ; preds = %entry, %if.end + %retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ] + ret i32 %retval.0 +} + +declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata) + +attributes #0 = { nounwind strictfp } diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll index b378b0aabdb2..ce195b734cca 100644 --- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll @@ -4631,14 +4631,14 @@ entry: define <4 x double> @constrained_vector_rint_v4f64(<4 x double> %x) #0 { ; PC64LE-LABEL: constrained_vector_rint_v4f64: ; PC64LE: # %bb.0: # %entry -; PC64LE-NEXT: xvrdpic 34, 34 ; PC64LE-NEXT: xvrdpic 35, 35 +; PC64LE-NEXT: xvrdpic 34, 34 ; PC64LE-NEXT: blr ; ; PC64LE9-LABEL: constrained_vector_rint_v4f64: ; PC64LE9: # %bb.0: # %entry -; PC64LE9-NEXT: xvrdpic 34, 34 ; PC64LE9-NEXT: xvrdpic 35, 35 +; PC64LE9-NEXT: xvrdpic 34, 34 ; PC64LE9-NEXT: blr entry: %rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64(