[PowerPC] Implement quadword atomic load/store

Add support for loading and storing i128 atomically.
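
On subtargets with quadword atomics (e.g. pwr8), IR along these lines (a
minimal sketch) is now selected to the lq/stq instructions:

define i128 @ld(i128* %src) {
  %v = load atomic i128, i128* %src acquire, align 16
  ret i128 %v
}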

Reviewed By: jsji

Differential Revision: https://reviews.llvm.org/D105612
Author: Kai Luo
Date:   2021-09-01 06:54:59 +00:00
Parent: ceccbb8145
Commit: 5eaebd5d64

7 changed files with 332 additions and 7 deletions

llvm/include/llvm/IR/IntrinsicsPowerPC.td

@@ -1741,4 +1741,11 @@ let TargetPrefix = "ppc" in {
llvm_i64_ty, llvm_i64_ty,
llvm_i64_ty, llvm_i64_ty],
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
def int_ppc_atomic_load_i128 :
Intrinsic<[llvm_i64_ty, llvm_i64_ty],
[llvm_ptr_ty],
[IntrArgMemOnly, IntrReadMem, NoCapture<ArgIndex<0>>]>;
def int_ppc_atomic_store_i128 :
Intrinsic<[], [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty],
[IntrArgMemOnly, IntrWriteMem, NoCapture<ArgIndex<2>>]>;
}
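
At the IR level the two intrinsics correspond to declarations roughly like
the following (a sketch assuming the usual llvm.ppc.* name mangling;
llvm_ptr_ty is i8* at this point):

declare { i64, i64 } @llvm.ppc.atomic.load.i128(i8*)
declare void @llvm.ppc.atomic.store.i128(i64, i64, i8*)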

llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp

@@ -102,6 +102,16 @@ bool PPCExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, MachineInstr &MI,
return expandAtomicRMW128(MBB, MI, NMBBI);
case PPC::ATOMIC_CMP_SWAP_I128:
return expandAtomicCmpSwap128(MBB, MI, NMBBI);
case PPC::BUILD_QUADWORD: {
Register Dst = MI.getOperand(0).getReg();
Register DstHi = TRI->getSubReg(Dst, PPC::sub_gp8_x0);
Register DstLo = TRI->getSubReg(Dst, PPC::sub_gp8_x1);
Register Lo = MI.getOperand(1).getReg();
Register Hi = MI.getOperand(2).getReg();
PairedCopy(TII, MBB, MI, MI.getDebugLoc(), DstHi, DstLo, Hi, Lo);
MI.eraseFromParent();
return true;
}
default:
return false;
}
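
For reference, the BUILD_QUADWORD expansion amounts to two register copies
into the halves of the destination pair (registers illustrative; PairedCopy
chooses the copy order so that neither copy clobbers a source it still
needs):

; BUILD_QUADWORD %r6_r7, lo=%r4, hi=%r3 expands to roughly:
;   mr r6, r3    ; sub_gp8_x0 (even half) <- hi
;   mr r7, r4    ; sub_gp8_x1 (odd half)  <- lo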

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

@@ -1286,8 +1286,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
}
if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics())
if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics()) {
setMaxAtomicSizeInBitsSupported(128);
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i128, Custom);
}
setBooleanContents(ZeroOrOneBooleanContent);
@@ -1518,6 +1522,7 @@ void PPCTargetLowering::initializeAddrModeMap() {
PPC::MOF_NotAddNorCst | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector256 | PPC::MOF_SubtargetP10,
};
// TODO: Add mapping for quadword load/store.
}
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
@@ -10452,11 +10457,18 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::ppc_cfence: {
assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
Op.getOperand(ArgStart + 1)),
Op.getOperand(0)),
0);
SDValue Val = Op.getOperand(ArgStart + 1);
EVT Ty = Val.getValueType();
if (Ty == MVT::i128) {
// FIXME: Is testing one of the two paired registers sufficient to
// guarantee ordering?
Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Val);
}
return SDValue(
DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Val),
Op.getOperand(0)),
0);
}
default:
break;
@@ -10519,6 +10531,59 @@ SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
}
SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
SelectionDAG &DAG) const {
AtomicSDNode *N = cast<AtomicSDNode>(Op.getNode());
EVT MemVT = N->getMemoryVT();
MVT VT = MemVT.getSimpleVT();
assert(VT == MVT::i128 && "Expect quadword atomic operations");
SDLoc dl(N);
unsigned Opc = N->getOpcode();
switch (Opc) {
case ISD::ATOMIC_LOAD: {
// Lower a quadword atomic load to int_ppc_atomic_load_i128, which the
// pattern-matching instruction selector then lowers to PPC instructions.
SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
SmallVector<SDValue, 4> Ops{
N->getOperand(0),
DAG.getConstant(Intrinsic::ppc_atomic_load_i128, dl, MVT::i32)};
for (int I = 1, E = N->getNumOperands(); I < E; ++I)
Ops.push_back(N->getOperand(I));
SDValue LoadedVal = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys,
Ops, MemVT, N->getMemOperand());
SDValue ValLo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal);
SDValue ValHi =
DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal.getValue(1));
ValHi = DAG.getNode(ISD::SHL, dl, MVT::i128, ValHi,
DAG.getConstant(64, dl, MVT::i32));
SDValue Val =
DAG.getNode(ISD::OR, dl, {MVT::i128, MVT::Other}, {ValLo, ValHi});
return DAG.getNode(ISD::MERGE_VALUES, dl, {MVT::i128, MVT::Other},
{Val, LoadedVal.getValue(2)});
}
case ISD::ATOMIC_STORE: {
// Lower a quadword atomic store to int_ppc_atomic_store_i128, which the
// pattern-matching instruction selector then lowers to PPC instructions.
SDVTList Tys = DAG.getVTList(MVT::Other);
SmallVector<SDValue, 4> Ops{
N->getOperand(0),
DAG.getConstant(Intrinsic::ppc_atomic_store_i128, dl, MVT::i32)};
SDValue Val = N->getOperand(2);
SDValue ValLo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, Val);
SDValue ValHi = DAG.getNode(ISD::SRL, dl, MVT::i128, Val,
DAG.getConstant(64, dl, MVT::i32));
ValHi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, ValHi);
Ops.push_back(ValLo);
Ops.push_back(ValHi);
Ops.push_back(N->getOperand(1));
return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, MemVT,
N->getMemOperand());
}
default:
llvm_unreachable("Unexpected atomic opcode");
}
}
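
Expressed back at the IR level, the two cases perform the following
split/recombine (a sketch using the intrinsic declarations assumed earlier;
the real code builds the equivalent SelectionDAG nodes, with result 0 of
the load intrinsic holding the low doubleword and result 1 the high one):

define void @store_sketch(i128 %val, i8* %p) {
  %lo = trunc i128 %val to i64
  %shifted = lshr i128 %val, 64
  %hi = trunc i128 %shifted to i64
  call void @llvm.ppc.atomic.store.i128(i64 %lo, i64 %hi, i8* %p)
  ret void
}

define i128 @load_sketch(i8* %p) {
  %pair = call { i64, i64 } @llvm.ppc.atomic.load.i128(i8* %p)
  %lo64 = extractvalue { i64, i64 } %pair, 0
  %hi64 = extractvalue { i64, i64 } %pair, 1
  %lo = zext i64 %lo64 to i128
  %hi = zext i64 %hi64 to i128
  %hi.shifted = shl i128 %hi, 64
  %v = or i128 %lo, %hi.shifted
  ret i128 %v
}

declare void @llvm.ppc.atomic.store.i128(i64, i64, i8*)
declare { i64, i64 } @llvm.ppc.atomic.load.i128(i8*)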
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -10910,6 +10975,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerBSWAP(Op, DAG);
case ISD::ATOMIC_CMP_SWAP:
return LowerATOMIC_CMP_SWAP(Op, DAG);
case ISD::ATOMIC_STORE:
return LowerATOMIC_LOAD_STORE(Op, DAG);
}
}
@@ -10920,6 +10987,12 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
case ISD::ATOMIC_LOAD: {
SDValue Res = LowerATOMIC_LOAD_STORE(SDValue(N, 0), DAG);
Results.push_back(Res);
Results.push_back(Res.getValue(1));
break;
}
case ISD::READCYCLECOUNTER: {
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
@@ -12656,6 +12729,24 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
.addDef(Hi)
.addUse(Src, 0, PPC::sub_gp8_x0);
} else if (MI.getOpcode() == PPC::LQX_PSEUDO ||
MI.getOpcode() == PPC::STQX_PSEUDO) {
DebugLoc DL = MI.getDebugLoc();
// Ptr is used as the ptr_rc_no_r0 part of LQ/STQ's memory operand and
// holds the result of adding RA and RB, so it has to be in
// g8rc_and_g8rc_nox0.
Register Ptr =
F->getRegInfo().createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
Register Val = MI.getOperand(0).getReg();
Register RA = MI.getOperand(1).getReg();
Register RB = MI.getOperand(2).getReg();
BuildMI(*BB, MI, DL, TII->get(PPC::ADD8), Ptr).addReg(RA).addReg(RB);
BuildMI(*BB, MI, DL,
MI.getOpcode() == PPC::LQX_PSEUDO ? TII->get(PPC::LQ)
: TII->get(PPC::STQ))
.addReg(Val, MI.getOpcode() == PPC::LQX_PSEUDO ? RegState::Define : 0)
.addImm(0)
.addReg(Ptr);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
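
The custom inserter thus rewrites the X-form pseudos into an explicit add
feeding the DQ/DS-form instruction, roughly (virtual registers
illustrative):

;  LQX_PSEUDO %vreg, %ra, %rb  =>  ADD8 %ptr, %ra, %rb   ; %ptr in g8rc_and_g8rc_nox0
;                                  LQ   %vreg, 0, %ptr
; STQX_PSEUDO %val,  %ra, %rb  =>  ADD8 %ptr, %ra, %rb
;                                  STQ  %val,  0, %ptr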
@@ -16091,6 +16182,22 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile;
return true;
case Intrinsic::ppc_atomic_load_i128:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = Align(16);
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::ppc_atomic_store_i128:
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
Info.align = Align(16);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
case Intrinsic::ppc_altivec_lvebx:
@@ -17280,7 +17387,8 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
EVT MemVT = MN->getMemoryVT();
unsigned Size = MemVT.getSizeInBits();
if (MemVT.isScalarInteger()) {
assert(Size <= 64 && "Not expecting scalar integers larger than 8 bytes!");
assert(Size <= 128 &&
"Not expecting scalar integers larger than 16 bytes!");
if (Size < 32)
FlagSet |= PPC::MOF_SubWordInt;
else if (Size == 32)

llvm/lib/Target/PowerPC/PPCISelLowering.h

@@ -1246,6 +1246,7 @@ namespace llvm {
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/PowerPC/PPCInstr64Bit.td

@@ -1342,12 +1342,25 @@ def LQ : DQForm_RTp5_RA17_MEM<56, 0,
[]>,
RegConstraint<"@earlyclobber $RTp">,
isPPC64;
// The ISA has no real LQX instruction; define a pseudo so we can handle
// the X-form during isel. Expanding it pre-RA may expose opportunities
// for optimizations (CSE, LICM, etc.) on the result of adding RA and RB.
def LQX_PSEUDO : PPCCustomInserterPseudo<(outs g8prc:$RTp),
(ins memrr:$src), "#LQX_PSEUDO", []>;
def RESTORE_QUADWORD : PPCEmitTimePseudo<(outs g8prc:$RTp), (ins memrix:$src),
"#RESTORE_QUADWORD", []>;
}
}
def : Pat<(int_ppc_atomic_load_i128 iaddrX16:$src),
(SPLIT_QUADWORD (LQ memrix16:$src))>;
def : Pat<(int_ppc_atomic_load_i128 ForceXForm:$src),
(SPLIT_QUADWORD (LQX_PSEUDO memrr:$src))>;
// Support for medium and large code model.
let hasSideEffects = 0 in {
let isReMaterializable = 1 in {
@@ -1536,12 +1549,28 @@ let mayStore = 1, hasNoSchedulingInfo = 1 in {
def STQ : DSForm_1<62, 2, (outs), (ins g8prc:$RSp, memrix:$dst),
"stq $RSp, $dst", IIC_LdStSTQ,
[]>, isPPC64;
def STQX_PSEUDO : PPCCustomInserterPseudo<(outs),
(ins g8prc:$RSp, memrr:$dst),
"#STQX_PSEUDO", []>;
def SPILL_QUADWORD : PPCEmitTimePseudo<(outs), (ins g8prc:$RSp, memrix:$dst),
"#SPILL_QUADWORD", []>;
}
}
def BUILD_QUADWORD : PPCPostRAExpPseudo<
(outs g8prc:$RTp),
(ins g8rc:$lo, g8rc:$hi),
"#BUILD_QUADWORD", []>;
def : Pat<(int_ppc_atomic_store_i128 i64:$lo, i64:$hi, DSForm:$dst),
(STQ (BUILD_QUADWORD g8rc:$lo, g8rc:$hi), memrix:$dst)>;
def : Pat<(int_ppc_atomic_store_i128 i64:$lo, i64:$hi, ForceXForm:$dst),
(STQX_PSEUDO (BUILD_QUADWORD g8rc:$lo, g8rc:$hi), memrr:$dst)>;
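
End to end, a quadword atomic store therefore selects to BUILD_QUADWORD
feeding stq (or STQX_PSEUDO for X-form addresses). A sketch mirroring the
stq_seqcst test below (registers per the big-endian ABI, where r3 holds the
high doubleword of the i128 argument):

; store atomic i128 %val, i128* %dst seq_cst, align 16
;   =>  sync
;       mr  r7, r4       ; lo -> odd half of the pair
;       mr  r6, r3       ; hi -> even half
;       stq r6, 0(r5)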
// Stores with Update (pre-inc).
let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {

llvm/lib/Target/PowerPC/PPCInstrInfo.cpp

@@ -3104,6 +3104,7 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
// FIXME: Maybe we can expand this in the 'PowerPC Expand Atomic' pass.
case PPC::CFENCE8: {
auto Val = MI.getOperand(0).getReg();
BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val);
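
For reference, the full CFENCE8 expansion continues past this hunk into the
usual acquire sequence, as visible in the lq_acquire test below (a sketch):

; CFENCE8 %val  =>  cmpd cr7, rVal, rVal
;                   bne- cr7, .+4
;                   isync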

llvm/test/CodeGen/PowerPC/atomics-i128-ldst.ll

@@ -0,0 +1,169 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \
; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-track-subreg-liveness \
; RUN: -ppc-quadword-atomics < %s | FileCheck --check-prefix=P8 %s
define dso_local i128 @lq_unordered(i128* %src) {
; P8-LABEL: lq_unordered:
; P8: # %bb.0: # %entry
; P8-NEXT: lq r4, 0(r3)
; P8-NEXT: mr r3, r4
; P8-NEXT: mr r4, r5
; P8-NEXT: blr
entry:
%0 = load atomic i128, i128* %src unordered, align 16
ret i128 %0
}
define dso_local i128 @lqx_unordered(i128* %src, i64 %idx) {
; P8-LABEL: lqx_unordered:
; P8: # %bb.0: # %entry
; P8-NEXT: sldi r4, r4, 4
; P8-NEXT: add r3, r3, r4
; P8-NEXT: lq r4, 0(r3)
; P8-NEXT: mr r3, r4
; P8-NEXT: mr r4, r5
; P8-NEXT: blr
entry:
%0 = getelementptr i128, i128* %src, i64 %idx
%1 = load atomic i128, i128* %0 unordered, align 16
ret i128 %1
}
define dso_local i128 @lq_big_offset_unordered(i128* %src) {
; P8-LABEL: lq_big_offset_unordered:
; P8: # %bb.0: # %entry
; P8-NEXT: lis r4, 32
; P8-NEXT: add r3, r3, r4
; P8-NEXT: lq r4, 0(r3)
; P8-NEXT: mr r3, r4
; P8-NEXT: mr r4, r5
; P8-NEXT: blr
entry:
%0 = getelementptr i128, i128* %src, i64 131072
%1 = load atomic i128, i128* %0 unordered, align 16
ret i128 %1
}
define dso_local i128 @lq_monotonic(i128* %src) {
; P8-LABEL: lq_monotonic:
; P8: # %bb.0: # %entry
; P8-NEXT: lq r4, 0(r3)
; P8-NEXT: mr r3, r4
; P8-NEXT: mr r4, r5
; P8-NEXT: blr
entry:
%0 = load atomic i128, i128* %src monotonic, align 16
ret i128 %0
}
define dso_local i128 @lq_acquire(i128* %src) {
; P8-LABEL: lq_acquire:
; P8: # %bb.0: # %entry
; P8-NEXT: lq r4, 0(r3)
; P8-NEXT: cmpd cr7, r5, r5
; P8-NEXT: mr r3, r4
; P8-NEXT: mr r4, r5
; P8-NEXT: bne- cr7, .+4
; P8-NEXT: isync
; P8-NEXT: blr
entry:
%0 = load atomic i128, i128* %src acquire, align 16
ret i128 %0
}
define dso_local i128 @lq_seqcst(i128* %src) {
; P8-LABEL: lq_seqcst:
; P8: # %bb.0: # %entry
; P8-NEXT: sync
; P8-NEXT: lq r4, 0(r3)
; P8-NEXT: cmpd cr7, r5, r5
; P8-NEXT: mr r3, r4
; P8-NEXT: mr r4, r5
; P8-NEXT: bne- cr7, .+4
; P8-NEXT: isync
; P8-NEXT: blr
entry:
%0 = load atomic i128, i128* %src seq_cst, align 16
ret i128 %0
}
define dso_local void @stq_unordered(i128 %val, i128* %dst) {
; P8-LABEL: stq_unordered:
; P8: # %bb.0: # %entry
; P8-NEXT: mr r7, r4
; P8-NEXT: mr r6, r3
; P8-NEXT: stq r6, 0(r5)
; P8-NEXT: blr
entry:
store atomic i128 %val, i128* %dst unordered, align 16
ret void
}
define dso_local void @stqx_unordered(i128 %val, i128* %dst, i64 %idx) {
; P8-LABEL: stqx_unordered:
; P8: # %bb.0: # %entry
; P8-NEXT: sldi r6, r6, 4
; P8-NEXT: mr r9, r4
; P8-NEXT: mr r8, r3
; P8-NEXT: add r3, r5, r6
; P8-NEXT: stq r8, 0(r3)
; P8-NEXT: blr
entry:
%0 = getelementptr i128, i128* %dst, i64 %idx
store atomic i128 %val, i128* %0 unordered, align 16
ret void
}
define dso_local void @stq_big_offset_unordered(i128 %val, i128* %dst) {
; P8-LABEL: stq_big_offset_unordered:
; P8: # %bb.0: # %entry
; P8-NEXT: lis r6, 32
; P8-NEXT: mr r9, r4
; P8-NEXT: mr r8, r3
; P8-NEXT: add r3, r5, r6
; P8-NEXT: stq r8, 0(r3)
; P8-NEXT: blr
entry:
%0 = getelementptr i128, i128* %dst, i64 131072
store atomic i128 %val, i128* %0 unordered, align 16
ret void
}
define dso_local void @stq_monotonic(i128 %val, i128* %dst) {
; P8-LABEL: stq_monotonic:
; P8: # %bb.0: # %entry
; P8-NEXT: mr r7, r4
; P8-NEXT: mr r6, r3
; P8-NEXT: stq r6, 0(r5)
; P8-NEXT: blr
entry:
store atomic i128 %val, i128* %dst monotonic, align 16
ret void
}
define dso_local void @stq_release(i128 %val, i128* %dst) {
; P8-LABEL: stq_release:
; P8: # %bb.0: # %entry
; P8-NEXT: lwsync
; P8-NEXT: mr r7, r4
; P8-NEXT: mr r6, r3
; P8-NEXT: stq r6, 0(r5)
; P8-NEXT: blr
entry:
store atomic i128 %val, i128* %dst release, align 16
ret void
}
define dso_local void @stq_seqcst(i128 %val, i128* %dst) {
; P8-LABEL: stq_seqcst:
; P8: # %bb.0: # %entry
; P8-NEXT: sync
; P8-NEXT: mr r7, r4
; P8-NEXT: mr r6, r3
; P8-NEXT: stq r6, 0(r5)
; P8-NEXT: blr
entry:
store atomic i128 %val, i128* %dst seq_cst, align 16
ret void
}