[PowerPC] Respect rounding mode in the back end

Currently, the floating point instructions that depend on
rounding mode are correctly marked in the PPC back end with
an implicit use of the RM register. Similarly, instructions
that explicitly define the register are marked with an
implicit def of the same register. So for the most part,
RM-using code won't be moved across RM-setting instructions.

However, calls are not marked as RM-setting instructions so
code can be moved across calls. This is generally desired,
but so is the ability to turn off this behaviour with an
appropriate option - and -frounding-math really should be
that option.

This patch provides a set of call instructions (for direct
and indirect calls) that are marked with an implicit def of
the RM register. These will be used for calls that are marked
with the strictfp attribute.

Differential revision: https://reviews.llvm.org/D111433
This commit is contained in:
Nemanja Ivanovic 2021-11-08 13:41:46 -06:00
parent ba2ac9c97c
commit 5840f7197d
11 changed files with 474 additions and 44 deletions

View File

@ -312,7 +312,7 @@ def : InstRW<[P10W_BR_2C, P10W_DISP_ANY],
(instrs
BCLR, BCLRn, BDNZLR, BDNZLR8, BDNZLRm, BDNZLRp, BDZLR, BDZLR8, BDZLRm, BDZLRp, gBCLR,
BCLRL, BCLRLn, BDNZLRL, BDNZLRLm, BDNZLRLp, BDZLRL, BDZLRLm, BDZLRLp, gBCLRL,
BL, BL8, BL8_NOP, BL8_NOP_TLS, BL8_NOTOC, BL8_NOTOC_TLS, BL8_TLS, BL8_TLS_, BLR, BLR8, BLRL, BL_NOP, BL_TLS
BL, BL8, BL8_NOP, BL8_NOP_RM, BL8_NOP_TLS, BL8_NOTOC, BL8_NOTOC_RM, BL8_NOTOC_TLS, BL8_RM, BL8_TLS, BL8_TLS_, BLR, BLR8, BLRL, BL_NOP, BL_NOP_RM, BL_RM, BL_TLS
)>;
// 2 Cycles Branch operations, 1 input operands
@ -320,9 +320,9 @@ def : InstRW<[P10W_BR_2C, P10W_DISP_ANY, P10BR_Read],
(instrs
B, BCC, BCCA, BCCCTR, BCCCTR8, BCCCTRL, BCCCTRL8, BCCL, BCCLA, BCCLR, BCCLRL, CTRL_DEP, TAILB, TAILB8,
BA, TAILBA, TAILBA8,
BC, BCTR, BCTR8, BCTRL, BCTRL8, BCTRL8_LDinto_toc, BCTRL_LWZinto_toc, BCn, BDNZ, BDNZ8, BDNZm, BDNZp, BDZ, BDZ8, BDZm, BDZp, TAILBCTR, TAILBCTR8, gBC, gBCat,
BC, BCTR, BCTR8, BCTRL, BCTRL8, BCTRL8_LDinto_toc, BCTRL8_LDinto_toc_RM, BCTRL8_RM, BCTRL_LWZinto_toc, BCTRL_LWZinto_toc_RM, BCTRL_RM, BCn, BDNZ, BDNZ8, BDNZm, BDNZp, BDZ, BDZ8, BDZm, BDZp, TAILBCTR, TAILBCTR8, gBC, gBCat,
BCL, BCLalways, BCLn, BDNZL, BDNZLm, BDNZLp, BDZL, BDZLm, BDZLp, gBCL, gBCLat,
BLA, BLA8, BLA8_NOP
BLA, BLA8, BLA8_NOP, BLA8_NOP_RM, BLA8_RM, BLA_RM
)>;
// 2 Cycles Branch operations, 3 input operands

View File

@ -1302,15 +1302,15 @@ def : InstRW<[P9_BR_2C, DISP_BR_1C],
(instregex "BCCTR(L)?(8)?(n)?$"),
(instregex "BD(N)?Z(8|A|Am|Ap|m|p)?$"),
(instregex "BD(N)?ZL(A|Am|Ap|R|R8|RL|RLm|RLp|Rm|Rp|m|p)?$"),
(instregex "BL(_TLS|_NOP)?$"),
(instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?$"),
(instregex "BLA(8|8_NOP)?$"),
(instregex "BL(_TLS|_NOP)?(_RM)?$"),
(instregex "BL8(_TLS|_NOP|_NOP_TLS|_TLS_)?(_RM)?$"),
(instregex "BLA(8|8_NOP)?(_RM)?$"),
(instregex "BLR(8|L)?$"),
(instregex "TAILB(A)?(8)?$"),
(instregex "TAILBCTR(8)?$"),
(instregex "gBC(A|Aat|CTR|CTRL|L|LA|LAat|LR|LRL|Lat|at)?$"),
(instregex "BCLR(L)?(n)?$"),
(instregex "BCTR(L)?(8)?$"),
(instregex "BCTR(L)?(8)?(_RM)?$"),
B,
BA,
BC,
@ -1321,6 +1321,8 @@ def : InstRW<[P9_BR_2C, DISP_BR_1C],
BCLn,
BCTRL8_LDinto_toc,
BCTRL_LWZinto_toc,
BCTRL8_LDinto_toc_RM,
BCTRL_LWZinto_toc_RM,
BCn,
CTRL_DEP
)>;

View File

@ -1630,9 +1630,19 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::CALL: return "PPCISD::CALL";
case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC";
case PPCISD::CALL_RM:
return "PPCISD::CALL_RM";
case PPCISD::CALL_NOP_RM:
return "PPCISD::CALL_NOP_RM";
case PPCISD::CALL_NOTOC_RM:
return "PPCISD::CALL_NOTOC_RM";
case PPCISD::MTCTR: return "PPCISD::MTCTR";
case PPCISD::BCTRL: return "PPCISD::BCTRL";
case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
case PPCISD::BCTRL_RM:
return "PPCISD::BCTRL_RM";
case PPCISD::BCTRL_LOAD_TOC_RM:
return "PPCISD::BCTRL_LOAD_TOC_RM";
case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
@ -5172,13 +5182,14 @@ static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
}
static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
const Function &Caller,
const SDValue &Callee,
const Function &Caller, const SDValue &Callee,
const PPCSubtarget &Subtarget,
const TargetMachine &TM) {
const TargetMachine &TM,
bool IsStrictFPCall = false) {
if (CFlags.IsTailCall)
return PPCISD::TC_RETURN;
unsigned RetOpc = 0;
// This is a call through a function pointer.
if (CFlags.IsIndirect) {
// AIX and the 64-bit ELF ABIs need to maintain the TOC pointer accross
@ -5189,28 +5200,46 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
// immediately followed by a load of the TOC pointer from the the stack save
// slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
// as it is not saved or used.
return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
: PPCISD::BCTRL;
}
if (Subtarget.isUsingPCRelativeCalls()) {
RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
: PPCISD::BCTRL;
} else if (Subtarget.isUsingPCRelativeCalls()) {
assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
return PPCISD::CALL_NOTOC;
RetOpc = PPCISD::CALL_NOTOC;
} else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
// The ABIs that maintain a TOC pointer accross calls need to have a nop
// immediately following the call instruction if the caller and callee may
// have different TOC bases. At link time if the linker determines the calls
// may not share a TOC base, the call is redirected to a trampoline inserted
// by the linker. The trampoline will (among other things) save the callers
// TOC pointer at an ABI designated offset in the linkage area and the
// linker will rewrite the nop to be a load of the TOC pointer from the
// linkage area into gpr2.
RetOpc = callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
: PPCISD::CALL_NOP;
else
RetOpc = PPCISD::CALL;
if (IsStrictFPCall) {
switch (RetOpc) {
default:
llvm_unreachable("Unknown call opcode");
case PPCISD::BCTRL_LOAD_TOC:
RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
break;
case PPCISD::BCTRL:
RetOpc = PPCISD::BCTRL_RM;
break;
case PPCISD::CALL_NOTOC:
RetOpc = PPCISD::CALL_NOTOC_RM;
break;
case PPCISD::CALL:
RetOpc = PPCISD::CALL_RM;
break;
case PPCISD::CALL_NOP:
RetOpc = PPCISD::CALL_NOP_RM;
break;
}
}
// The ABIs that maintain a TOC pointer accross calls need to have a nop
// immediately following the call instruction if the caller and callee may
// have different TOC bases. At link time if the linker determines the calls
// may not share a TOC base, the call is redirected to a trampoline inserted
// by the linker. The trampoline will (among other things) save the callers
// TOC pointer at an ABI designated offset in the linkage area and the linker
// will rewrite the nop to be a load of the TOC pointer from the linkage area
// into gpr2.
if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
return callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
: PPCISD::CALL_NOP;
return PPCISD::CALL;
return RetOpc;
}
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
@ -5506,7 +5535,7 @@ SDValue PPCTargetLowering::FinishCall(
unsigned CallOpc =
getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
Subtarget, DAG.getTarget());
Subtarget, DAG.getTarget(), CB ? CB->isStrictFP() : false);
if (!CFlags.IsIndirect)
Callee = transformCallee(Callee, DAG, dl, Subtarget);

View File

@ -200,6 +200,14 @@ namespace llvm {
/// and 64-bit AIX.
BCTRL_LOAD_TOC,
/// The variants that implicitly define rounding mode for calls with
/// strictfp semantics.
CALL_RM,
CALL_NOP_RM,
CALL_NOTOC_RM,
BCTRL_RM,
BCTRL_LOAD_TOC_RM,
/// Return with a flag operand, matched by 'blr'
RET_FLAG,

View File

@ -178,6 +178,39 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8], hasSideEffects = 0 in {
}
}
let isCall = 1, PPC970_Unit = 7, Defs = [LR8, RM], hasSideEffects = 0,
isCodeGenOnly = 1, Uses = [RM] in {
// Convenient aliases for call instructions
def BL8_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func),
"bl $func", IIC_BrB, []>; // See Pat patterns below.
def BLA8_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
"bla $func", IIC_BrB, [(PPCcall_rm (i64 imm:$func))]>;
def BL8_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24,
(outs), (ins calltarget:$func),
"bl $func\n\tnop", IIC_BrB, []>;
def BLA8_NOP_RM : IForm_and_DForm_4_zero<18, 1, 1, 24,
(outs), (ins abscalltarget:$func),
"bla $func\n\tnop", IIC_BrB,
[(PPCcall_nop_rm (i64 imm:$func))]>;
let Predicates = [PCRelativeMemops] in {
// BL8_NOTOC means that the caller does not use the TOC pointer and if
// it does use R2 then it is just a caller saved register. Therefore it is
// safe to emit only the bl and not the nop for this instruction. The
// linker will not try to restore R2 after the call.
def BL8_NOTOC_RM : IForm<18, 0, 1, (outs),
(ins calltarget:$func),
"bl $func", IIC_BrB, []>;
}
let Uses = [CTR8, RM] in {
let isPredicable = 1 in
def BCTRL8_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
"bctrl", IIC_BrB, [(PPCbctrl_rm)]>,
Requires<[In64BitMode]>;
}
}
let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
Defs = [LR8, X2], Uses = [CTR8, RM], RST = 2 in {
def BCTRL8_LDinto_toc :
@ -188,6 +221,16 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
Requires<[In64BitMode]>;
}
let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
Defs = [LR8, X2, RM], Uses = [CTR8, RM], RST = 2 in {
def BCTRL8_LDinto_toc_RM :
XLForm_2_ext_and_DSForm_1<19, 528, 20, 0, 1, 58, 0, (outs),
(ins memrix:$src),
"bctrl\n\tld 2, $src", IIC_BrB,
[(PPCbctrl_load_toc_rm iaddrX4:$src)]>,
Requires<[In64BitMode]>;
}
} // Interpretation64Bit
// FIXME: Duplicating this for the asm parser should be unnecessary, but the
@ -214,12 +257,32 @@ def : Pat<(PPCcall_notoc (i64 tglobaladdr:$dst)),
def : Pat<(PPCcall_notoc (i64 texternalsym:$dst)),
(BL8_NOTOC texternalsym:$dst)>;
def : Pat<(PPCcall_rm (i64 tglobaladdr:$dst)),
(BL8_RM tglobaladdr:$dst)>;
def : Pat<(PPCcall_nop_rm (i64 tglobaladdr:$dst)),
(BL8_NOP_RM tglobaladdr:$dst)>;
def : Pat<(PPCcall_rm (i64 texternalsym:$dst)),
(BL8_RM texternalsym:$dst)>;
def : Pat<(PPCcall_nop_rm (i64 texternalsym:$dst)),
(BL8_NOP_RM texternalsym:$dst)>;
def : Pat<(PPCcall_notoc_rm (i64 tglobaladdr:$dst)),
(BL8_NOTOC_RM tglobaladdr:$dst)>;
def : Pat<(PPCcall_notoc_rm (i64 texternalsym:$dst)),
(BL8_NOTOC_RM texternalsym:$dst)>;
// Calls for AIX
def : Pat<(PPCcall (i64 mcsym:$dst)),
(BL8 mcsym:$dst)>;
def : Pat<(PPCcall_nop (i64 mcsym:$dst)),
(BL8_NOP mcsym:$dst)>;
def : Pat<(PPCcall_rm (i64 mcsym:$dst)),
(BL8_RM mcsym:$dst)>;
def : Pat<(PPCcall_nop_rm (i64 mcsym:$dst)),
(BL8_NOP_RM mcsym:$dst)>;
// Atomic operations
// FIXME: some of these might be used with constant operands. This will result
// in constant materialization instructions that may be redundant. We currently

View File

@ -2246,11 +2246,13 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
return true;
} else if (OpC == PPC::BCTR || OpC == PPC::BCTR8 || OpC == PPC::BCTRL ||
OpC == PPC::BCTRL8) {
OpC == PPC::BCTRL8 || OpC == PPC::BCTRL_RM ||
OpC == PPC::BCTRL8_RM) {
if (Pred[1].getReg() == PPC::CTR8 || Pred[1].getReg() == PPC::CTR)
llvm_unreachable("Cannot predicate bctr[l] on the ctr register");
bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8;
bool setLR = OpC == PPC::BCTRL || OpC == PPC::BCTRL8 ||
OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM;
bool isPPC64 = Subtarget.isPPC64();
if (Pred[0].getImm() == PPC::PRED_BIT_SET) {
@ -2274,6 +2276,9 @@ bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::Implicit)
.addReg(isPPC64 ? PPC::LR8 : PPC::LR, RegState::ImplicitDefine);
if (OpC == PPC::BCTRL_RM || OpC == PPC::BCTRL8_RM)
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
.addReg(PPC::RM, RegState::ImplicitDefine);
return true;
}

View File

@ -316,6 +316,24 @@ def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC",
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
// Call nodes for strictfp calls (that define RM).
def PPCcall_rm : SDNode<"PPCISD::CALL_RM", SDT_PPCCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
def PPCcall_nop_rm : SDNode<"PPCISD::CALL_NOP_RM", SDT_PPCCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
def PPCcall_notoc_rm : SDNode<"PPCISD::CALL_NOTOC_RM", SDT_PPCCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
def PPCbctrl_rm : SDNode<"PPCISD::BCTRL_RM", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
def PPCbctrl_load_toc_rm : SDNode<"PPCISD::BCTRL_LOAD_TOC_RM",
SDTypeProfile<0, 1, []>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
def retflag : SDNode<"PPCISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@ -1892,6 +1910,26 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
}
}
let isCall = 1, PPC970_Unit = 7, Defs = [LR, RM], isCodeGenOnly = 1 in {
// Convenient aliases for call instructions
let Uses = [RM] in {
def BL_RM : IForm<18, 0, 1, (outs), (ins calltarget:$func),
"bl $func", IIC_BrB, []>; // See Pat patterns below.
def BLA_RM : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
"bla $func", IIC_BrB, [(PPCcall_rm (i32 imm:$func))]>;
def BL_NOP_RM : IForm_and_DForm_4_zero<18, 0, 1, 24,
(outs), (ins calltarget:$func),
"bl $func\n\tnop", IIC_BrB, []>;
}
let Uses = [CTR, RM] in {
let isPredicable = 1 in
def BCTRL_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
"bctrl", IIC_BrB, [(PPCbctrl_rm)]>,
Requires<[In32BitMode]>;
}
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
def TCRETURNdi :PPCEmitTimePseudo< (outs),
(ins calltarget:$dst, i32imm:$offset),
@ -1918,6 +1956,14 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
}
let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1,
Defs = [LR, R2, RM], Uses = [CTR, RM], RST = 2 in {
def BCTRL_LWZinto_toc_RM:
XLForm_2_ext_and_DForm_1<19, 528, 20, 0, 1, 32, (outs),
(ins memri:$src), "bctrl\n\tlwz 2, $src", IIC_BrB,
[(PPCbctrl_load_toc_rm iaddr:$src)]>, Requires<[In32BitMode]>;
}
let isCodeGenOnly = 1, hasSideEffects = 0 in {
@ -3435,6 +3481,12 @@ def : Pat<(PPCcall (i32 tglobaladdr:$dst)),
def : Pat<(PPCcall (i32 texternalsym:$dst)),
(BL texternalsym:$dst)>;
def : Pat<(PPCcall_rm (i32 tglobaladdr:$dst)),
(BL_RM tglobaladdr:$dst)>;
def : Pat<(PPCcall_rm (i32 texternalsym:$dst)),
(BL_RM texternalsym:$dst)>;
// Calls for AIX only
def : Pat<(PPCcall (i32 mcsym:$dst)),
(BL mcsym:$dst)>;
@ -3445,6 +3497,15 @@ def : Pat<(PPCcall_nop (i32 mcsym:$dst)),
def : Pat<(PPCcall_nop (i32 texternalsym:$dst)),
(BL_NOP texternalsym:$dst)>;
def : Pat<(PPCcall_rm (i32 mcsym:$dst)),
(BL_RM mcsym:$dst)>;
def : Pat<(PPCcall_nop_rm (i32 mcsym:$dst)),
(BL_NOP_RM mcsym:$dst)>;
def : Pat<(PPCcall_nop_rm (i32 texternalsym:$dst)),
(BL_NOP_RM texternalsym:$dst)>;
def : Pat<(PPCtc_return (i32 tglobaladdr:$dst), imm:$imm),
(TCRETURNdi tglobaladdr:$dst, imm:$imm)>;

View File

@ -906,16 +906,13 @@ let hasSideEffects = 0 in {
// Rounding Instructions respecting current rounding mode
def XSRDPIC : XX2Form<60, 107,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpic $XT, $XB", IIC_VecFP,
[(set f64:$XT, (fnearbyint f64:$XB))]>;
"xsrdpic $XT, $XB", IIC_VecFP, []>;
def XVRDPIC : XX2Form<60, 235,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpic $XT, $XB", IIC_VecFP,
[(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
"xvrdpic $XT, $XB", IIC_VecFP, []>;
def XVRSPIC : XX2Form<60, 171,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspic $XT, $XB", IIC_VecFP,
[(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
"xvrspic $XT, $XB", IIC_VecFP, []>;
// Max/Min Instructions
let isCommutable = 1 in {
def XSMAXDP : XX3Form<60, 160,
@ -2783,9 +2780,6 @@ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be ForceXForm:$src)), (LXVD2X ForceXForm:$s
def : Pat<(f32 (any_fround f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPI
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (fnearbyint f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIC
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f32 (any_ffloor f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIM
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
@ -2804,6 +2798,19 @@ def : Pat<(v4f32 (any_frint v4f32:$S)), (v4f32 (XVRSPIC $S))>;
def : Pat<(f64 (any_frint f64:$S)), (f64 (XSRDPIC $S))>;
def : Pat<(v2f64 (any_frint v2f64:$S)), (v2f64 (XVRDPIC $S))>;
// Rounding without exceptions (nearbyint). Due to strange tblgen behaviour,
// these need to be defined after the any_frint versions so ISEL will correctly
// add the chain to the strict versions.
def : Pat<(f32 (fnearbyint f32:$S)),
(f32 (COPY_TO_REGCLASS (XSRDPIC
(COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
def : Pat<(f64 (fnearbyint f64:$S)),
(f64 (XSRDPIC $S))>;
def : Pat<(v2f64 (fnearbyint v2f64:$S)),
(v2f64 (XVRDPIC $S))>;
def : Pat<(v4f32 (fnearbyint v4f32:$S)),
(v4f32 (XVRSPIC $S))>;
// Materialize a zero-vector of long long
def : Pat<(v2i64 immAllZerosV),
(v2i64 (XXLXORz))>;

View File

@ -0,0 +1,127 @@
; The non-strictfp version of test/CodeGen/PowerPC/respect-rounding-mode.ll
; Without strictfp, CSE should be free to eliminate the repeated multiply
; and conversion instructions.
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2
; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2
; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2
@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8
define dso_local signext i32 @func1() local_unnamed_addr #0 {
entry:
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%0 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %0, i32 0
%sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext3 = extractelement <2 x double> %1, i32 1
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
tail call void @exit(i32 signext 2) #0
unreachable
if.end: ; preds = %entry
ret i32 %conv
}
declare void @directCall(...) local_unnamed_addr
declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata)
declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata)
declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata)
declare void @exit(i32 signext) local_unnamed_addr
define dso_local signext i32 @func2() local_unnamed_addr #0 {
entry:
%call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0
%call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %mul, i32 0
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %cleanup, label %if.end
if.end: ; preds = %entry
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0
br label %cleanup
cleanup: ; preds = %entry, %if.end
%retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ]
ret i32 %retval.0
}
declare <2 x double> @getvector1(...) local_unnamed_addr
declare <2 x double> @getvector2(...) local_unnamed_addr
declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>)
define dso_local signext i32 @func3() local_unnamed_addr #0 {
entry:
%0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %0() #0
%1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %1, i32 0
%sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0
%2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %2() #0
%3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext4 = extractelement <2 x double> %3, i32 1
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
tail call void @exit(i32 signext 2) #0
unreachable
if.end: ; preds = %entry
ret i32 %conv
}
define dso_local signext i32 @func4() local_unnamed_addr #0 {
entry:
%call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0
%call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0
%0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %0() #0
%mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %mul, i32 0
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %cleanup, label %if.end
if.end: ; preds = %entry
%1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %1() #0
%mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0
br label %cleanup
cleanup: ; preds = %entry, %if.end
%retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ]
ret i32 %retval.0
}
declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)
attributes #0 = { nounwind }

View File

@ -0,0 +1,128 @@
; The strictfp version of test/CodeGen/PowerPC/cse-despit-rounding-mode.ll
; With strictfp, the MachineIR optimizations need to assume that a call
; can change the rounding mode and must not move/eliminate the repeated
; multiply/convert instructions in this test.
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4
; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4
; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4
@IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8
define dso_local signext i32 @func1() local_unnamed_addr #0 {
entry:
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%0 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %0, i32 0
%sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext3 = extractelement <2 x double> %1, i32 1
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext3, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
tail call void @exit(i32 signext 2) #0
unreachable
if.end: ; preds = %entry
ret i32 %conv
}
declare void @directCall(...) local_unnamed_addr
declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata)
declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata)
declare i1 @llvm.experimental.constrained.fcmp.f64(double, double, metadata, metadata)
declare void @exit(i32 signext) local_unnamed_addr
define dso_local signext i32 @func2() local_unnamed_addr #0 {
entry:
%call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0
%call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %mul, i32 0
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %cleanup, label %if.end
if.end: ; preds = %entry
tail call void bitcast (void (...)* @directCall to void ()*)() #0
%mul10 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%0 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul10) #0
br label %cleanup
cleanup: ; preds = %entry, %if.end
%retval.0 = phi i32 [ %0, %if.end ], [ 11, %entry ]
ret i32 %retval.0
}
declare <2 x double> @getvector1(...) local_unnamed_addr
declare <2 x double> @getvector2(...) local_unnamed_addr
declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)
declare i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32, <2 x double>, <2 x double>)
define dso_local signext i32 @func3() local_unnamed_addr #0 {
entry:
%0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %0() #0
%1 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %1, i32 0
%sub = tail call double @llvm.experimental.constrained.fsub.f64(double %vecext, double -9.900000e+01, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%conv = tail call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %sub, metadata !"fpexcept.ignore") #0
%2 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %2() #0
%3 = tail call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> <double -9.990000e+01, double 9.990000e+01>, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext4 = extractelement <2 x double> %3, i32 1
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext4, double 9.900000e+01, metadata !"une", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
tail call void @exit(i32 signext 2) #0
unreachable
if.end: ; preds = %entry
ret i32 %conv
}
define dso_local signext i32 @func4() local_unnamed_addr #0 {
entry:
%call = tail call <2 x double> bitcast (<2 x double> (...)* @getvector1 to <2 x double> ()*)() #0
%call1 = tail call <2 x double> bitcast (<2 x double> (...)* @getvector2 to <2 x double> ()*)() #0
%0 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %0() #0
%mul = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%vecext = extractelement <2 x double> %mul, i32 0
%cmp = tail call i1 @llvm.experimental.constrained.fcmp.f64(double %vecext, double 4.000000e+00, metadata !"oeq", metadata !"fpexcept.ignore") #0
br i1 %cmp, label %cleanup, label %if.end
if.end: ; preds = %entry
%1 = load void ()*, void ()** bitcast (void (...)** @IndirectCallPtr to void ()**), align 8
tail call void %1() #0
%mul11 = tail call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %call, <2 x double> %call1, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
%2 = tail call i32 @llvm.ppc.vsx.xvcmpeqdp.p(i32 2, <2 x double> %mul, <2 x double> %mul11) #0
br label %cleanup
cleanup: ; preds = %entry, %if.end
%retval.0 = phi i32 [ %2, %if.end ], [ 11, %entry ]
ret i32 %retval.0
}
declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)
attributes #0 = { nounwind strictfp }

View File

@ -4631,14 +4631,14 @@ entry:
define <4 x double> @constrained_vector_rint_v4f64(<4 x double> %x) #0 {
; PC64LE-LABEL: constrained_vector_rint_v4f64:
; PC64LE: # %bb.0: # %entry
; PC64LE-NEXT: xvrdpic 34, 34
; PC64LE-NEXT: xvrdpic 35, 35
; PC64LE-NEXT: xvrdpic 34, 34
; PC64LE-NEXT: blr
;
; PC64LE9-LABEL: constrained_vector_rint_v4f64:
; PC64LE9: # %bb.0: # %entry
; PC64LE9-NEXT: xvrdpic 34, 34
; PC64LE9-NEXT: xvrdpic 35, 35
; PC64LE9-NEXT: xvrdpic 34, 34
; PC64LE9-NEXT: blr
entry:
%rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64(