[AArch64] Fix i128 cmpxchg using ldxp/stxp.

Basically two parts to this fix:

1. Stop using AtomicExpand to expand cmpxchg i128.
2. Fix AArch64ExpandPseudoInsts to use a correct expansion.
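
For context, the operation being fixed is a 128-bit cmpxchg. A minimal IR example in the spirit of the tests updated below (the function and value names here are illustrative):

    define i128 @cas_example(i128* %p, i128 %oldval, i128 %newval) {
      %pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
      %val = extractvalue { i128, i1 } %pair, 0
      ret i128 %val
    }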

From the ARM Architecture Reference Manual:

To atomically load two 64-bit quantities, perform a Load-Exclusive
pair/Store-Exclusive pair sequence of reading and writing the same value
for which the Store-Exclusive pair succeeds, and use the read values
from the Load-Exclusive pair.
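
Concretely, the corrected pseudo-instruction expansion emits a loop of roughly the following shape (a sketch of the sequentially consistent case; register numbers and labels are illustrative, and the weaker orderings substitute ldxp/stxp for ldaxp/stlxp as appropriate):

    .Lloadcmp:
        ldaxp   x8, x9, [x0]        // load-exclusive the current 128-bit value
        cmp     x8, x2              // compare low halves with the expected value
        cset    w10, ne
        cmp     x9, x3              // compare high halves
        cinc    w10, w10, ne        // w10 != 0 if either half differs
        cbnz    w10, .Lfail
    .Lstore:                        // match: try to store the new value
        stlxp   w10, x4, x5, [x0]
        cbnz    w10, .Lloadcmp      // exclusive monitor lost, retry
        b       .Ldone
    .Lfail:                         // mismatch: store back the value just loaded so the
        stlxp   w10, x8, x9, [x0]   // ldxp/stxp pair certifies an atomic 128-bit read
        cbnz    w10, .Lloadcmp      // retry until the paired store-exclusive succeeds
    .Ldone: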

Fixes https://bugs.llvm.org/show_bug.cgi?id=51102

Differential Revision: https://reviews.llvm.org/D106039
Eli Friedman 2021-07-20 12:35:49 -07:00
parent d51f74acdf
commit 843c614058
8 changed files with 136 additions and 29 deletions


@@ -279,21 +279,46 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
Register NewLoReg = MI.getOperand(6).getReg();
Register NewHiReg = MI.getOperand(7).getReg();
unsigned LdxpOp, StxpOp;
switch (MI.getOpcode()) {
case AArch64::CMP_SWAP_128_MONOTONIC:
LdxpOp = AArch64::LDXPX;
StxpOp = AArch64::STXPX;
break;
case AArch64::CMP_SWAP_128_RELEASE:
LdxpOp = AArch64::LDXPX;
StxpOp = AArch64::STLXPX;
break;
case AArch64::CMP_SWAP_128_ACQUIRE:
LdxpOp = AArch64::LDAXPX;
StxpOp = AArch64::STXPX;
break;
case AArch64::CMP_SWAP_128:
LdxpOp = AArch64::LDAXPX;
StxpOp = AArch64::STLXPX;
break;
default:
llvm_unreachable("Unexpected opcode");
}
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto FailBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoadCmpBB);
MF->insert(++LoadCmpBB->getIterator(), StoreBB);
MF->insert(++StoreBB->getIterator(), DoneBB);
MF->insert(++StoreBB->getIterator(), FailBB);
MF->insert(++FailBB->getIterator(), DoneBB);
// .Lloadcmp:
// ldaxp xDestLo, xDestHi, [xAddr]
// cmp xDestLo, xDesiredLo
// sbcs xDestHi, xDesiredHi
// b.ne .Ldone
BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
BuildMI(LoadCmpBB, DL, TII->get(LdxpOp))
.addReg(DestLo.getReg(), RegState::Define)
.addReg(DestHi.getReg(), RegState::Define)
.addReg(AddrReg);
@@ -315,23 +340,37 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
.addUse(StatusReg, getKillRegState(StatusDead))
.addMBB(DoneBB);
LoadCmpBB->addSuccessor(DoneBB);
.addMBB(FailBB);
LoadCmpBB->addSuccessor(FailBB);
LoadCmpBB->addSuccessor(StoreBB);
// .Lstore:
// stlxp wStatus, xNewLo, xNewHi, [xAddr]
// cbnz wStatus, .Lloadcmp
BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
BuildMI(StoreBB, DL, TII->get(StxpOp), StatusReg)
.addReg(NewLoReg)
.addReg(NewHiReg)
.addReg(AddrReg);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
.addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
BuildMI(StoreBB, DL, TII->get(AArch64::B)).addMBB(DoneBB);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
// .Lfail:
// stlxp wStatus, xDestLo, xDestHi, [xAddr]
// cbnz wStatus, .Lloadcmp
BuildMI(FailBB, DL, TII->get(StxpOp), StatusReg)
.addReg(DestLo.getReg())
.addReg(DestHi.getReg())
.addReg(AddrReg);
BuildMI(FailBB, DL, TII->get(AArch64::CBNZW))
.addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
FailBB->addSuccessor(LoadCmpBB);
FailBB->addSuccessor(DoneBB);
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
@@ -343,9 +382,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
// Recompute liveness bottom up.
LivePhysRegs LiveRegs;
computeAndAddLiveIns(LiveRegs, *DoneBB);
computeAndAddLiveIns(LiveRegs, *FailBB);
computeAndAddLiveIns(LiveRegs, *StoreBB);
computeAndAddLiveIns(LiveRegs, *LoadCmpBB);
// Do an extra pass in the loop to get the loop carried dependencies right.
FailBB->clearLiveIns();
computeAndAddLiveIns(LiveRegs, *FailBB);
StoreBB->clearLiveIns();
computeAndAddLiveIns(LiveRegs, *StoreBB);
LoadCmpBB->clearLiveIns();
@@ -1093,6 +1136,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
AArch64::XZR, NextMBBI);
case AArch64::CMP_SWAP_128:
case AArch64::CMP_SWAP_128_RELEASE:
case AArch64::CMP_SWAP_128_ACQUIRE:
case AArch64::CMP_SWAP_128_MONOTONIC:
return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
case AArch64::AESMCrrTied:


@@ -17008,6 +17008,7 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
// LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
// so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
@@ -17018,8 +17019,6 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
N->getOperand(0), // Chain in
};
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
unsigned Opcode;
switch (MemOp->getMergedOrdering()) {
case AtomicOrdering::Monotonic:
@@ -17056,6 +17055,25 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
return;
}
unsigned Opcode;
switch (MemOp->getMergedOrdering()) {
case AtomicOrdering::Monotonic:
Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
break;
case AtomicOrdering::Acquire:
Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
break;
case AtomicOrdering::Release:
Opcode = AArch64::CMP_SWAP_128_RELEASE;
break;
case AtomicOrdering::AcquireRelease:
case AtomicOrdering::SequentiallyConsistent:
Opcode = AArch64::CMP_SWAP_128;
break;
default:
llvm_unreachable("Unexpected ordering!");
}
auto Desired = splitInt128(N->getOperand(2), DAG);
auto New = splitInt128(N->getOperand(3), DAG);
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
@@ -17063,8 +17081,6 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
SDNode *CmpSwap = DAG.getMachineNode(
AArch64::CMP_SWAP_128, SDLoc(N),
Opcode, SDLoc(N),
DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
@@ -17285,6 +17301,13 @@ AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return AtomicExpansionKind::None;
// 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
// it.
unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
if (Size > 64)
return AtomicExpansionKind::None;
return AtomicExpansionKind::LLSC;
}


@@ -429,11 +429,16 @@ def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch),
}
let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch",
mayLoad = 1, mayStore = 1 in
def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch),
mayLoad = 1, mayStore = 1 in {
class cmp_swap_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32common:$scratch),
(ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
GPR64:$newLo, GPR64:$newHi), []>,
Sched<[WriteAtomic]>;
def CMP_SWAP_128 : cmp_swap_128;
def CMP_SWAP_128_RELEASE : cmp_swap_128;
def CMP_SWAP_128_ACQUIRE : cmp_swap_128;
def CMP_SWAP_128_MONOTONIC : cmp_swap_128;
}
// v8.1 Atomic instructions:
let Predicates = [HasLSE] in {


@@ -1210,8 +1210,26 @@ bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128(
// The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP
// can take arbitrary registers so it just has the normal GPR64 operands the
// rest of AArch64 is expecting.
auto Ordering = (*MI.memoperands_begin())->getMergedOrdering();
unsigned Opcode;
switch (Ordering) {
case AtomicOrdering::Acquire:
Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
break;
case AtomicOrdering::Release:
Opcode = AArch64::CMP_SWAP_128_RELEASE;
break;
case AtomicOrdering::AcquireRelease:
case AtomicOrdering::SequentiallyConsistent:
Opcode = AArch64::CMP_SWAP_128;
break;
default:
Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
break;
}
auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
CAS = MIRBuilder.buildInstr(AArch64::CMP_SWAP_128, {DstLo, DstHi, Scratch},
CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch},
{Addr, DesiredI->getOperand(0),
DesiredI->getOperand(1), NewI->getOperand(0),
NewI->getOperand(1)});


@@ -24,7 +24,7 @@ define void @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK-LLSC-O0: cmp [[OLD_HI]], x3
; CHECK-LLSC-O0: cinc [[EQUAL:w[0-9]+]], [[EQUAL_TMP]], ne
; CHECK-LLSC-O0: cbnz [[EQUAL]], .LBB0_3
; CHECK-LLSC-O0: stlxp [[STATUS:w[0-9]+]], x4, x5, [x0]
; CHECK-LLSC-O0: stxp [[STATUS:w[0-9]+]], x4, x5, [x0]
; CHECK-LLSC-O0: cbnz [[STATUS]], .LBB0_1
; CHECK-LLSC-O0: .LBB0_3:
; CHECK-LLSC-O0: mov v[[OLD:[0-9]+]].d[0], [[OLD_LO]]


@@ -23,7 +23,7 @@ body: |
; CHECK: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64)
; CHECK: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64)
; CHECK: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64)
; CHECK: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32 = CMP_SWAP_128 [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire 16)
; CHECK: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire 16)
; CHECK: [[COPY9:%[0-9]+]]:gpr64 = COPY %16
; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64)
; CHECK: G_STORE [[MV]](s128), [[COPY]](p0) :: (store 16)
@@ -39,7 +39,7 @@ body: |
; CHECK-NOLSE: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64)
; CHECK-NOLSE: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64)
; CHECK-NOLSE: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64)
; CHECK-NOLSE: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32 = CMP_SWAP_128 [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128))
; CHECK-NOLSE: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128))
; CHECK-NOLSE: [[COPY9:%[0-9]+]]:gpr64 = COPY %16
; CHECK-NOLSE: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64)
; CHECK-NOLSE: G_STORE [[MV]](s128), [[COPY]](p0) :: (store (s128))


@@ -8,11 +8,16 @@ define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK-LABEL: val_compare_and_swap:
; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x[[ADDR:[0-9]+]]]
; CHECK-DAG: eor [[MISMATCH_LO:x[0-9]+]], [[RESULTLO]], x2
; CHECK-DAG: eor [[MISMATCH_HI:x[0-9]+]], [[RESULTHI]], x3
; CHECK: orr [[MISMATCH:x[0-9]+]], [[MISMATCH_LO]], [[MISMATCH_HI]]
; CHECK: cbnz [[MISMATCH]], [[DONE:.LBB[0-9]+_[0-9]+]]
; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]]
; CHECK: cmp
; CHECK: cset
; CHECK: cmp
; CHECK: cinc [[MISMATCH:w[0-9]+]]
; CHECK: cbz [[MISMATCH]], [[EQUAL:.LBB[0-9]+_[0-9]+]]
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[RESULTLO]], [[RESULTHI]], [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
; CHECK: b [[DONE:.LBB[0-9]+_[0-9]+]]
; CHECK: [[EQUAL]]:
; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]]
; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
; CHECK: [[DONE]]:
%pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire


@@ -234,7 +234,12 @@ define i128 @test_rmw_add_128(i128* %dst) {
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
; NOLSE-NEXT: cbnz w12, .LBB4_2
; NOLSE-NEXT: b .LBB4_5
; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
; NOLSE-NEXT: cbnz w12, .LBB4_2
; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1
; NOLSE-NEXT: eor x11, x9, x11
; NOLSE-NEXT: eor x8, x10, x8
@@ -244,8 +249,8 @@ define i128 @test_rmw_add_128(i128* %dst) {
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: cbnz x8, .LBB4_1
; NOLSE-NEXT: b .LBB4_5
; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end
; NOLSE-NEXT: b .LBB4_6
; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload
; NOLSE-NEXT: add sp, sp, #48 // =48
@@ -630,7 +635,12 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
; NOLSE-NEXT: cbnz w12, .LBB9_2
; NOLSE-NEXT: b .LBB9_5
; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
; NOLSE-NEXT: cbnz w12, .LBB9_2
; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1
; NOLSE-NEXT: eor x11, x9, x11
; NOLSE-NEXT: eor x8, x10, x8
@@ -640,8 +650,8 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
; NOLSE-NEXT: cbnz x8, .LBB9_1
; NOLSE-NEXT: b .LBB9_5
; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end
; NOLSE-NEXT: b .LBB9_6
; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
; NOLSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload
; NOLSE-NEXT: add sp, sp, #48 // =48