From 843c61405881acdac1727c3a1ca4f36156611753 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 20 Jul 2021 12:35:49 -0700 Subject: [PATCH] [AArch64] Fix i128 cmpxchg using ldxp/stxp. Basically two parts to this fix: 1. Stop using AtomicExpand to expand cmpxchg i128 2. Fix AArch64ExpandPseudoInsts to use a correct expansion. From ARM architecture reference: To atomically load two 64-bit quantities, perform a Load-Exclusive pair/Store-Exclusive pair sequence of reading and writing the same value for which the Store-Exclusive pair succeeds, and use the read values from the Load-Exclusive pair. Fixes https://bugs.llvm.org/show_bug.cgi?id=51102 Differential Revision: https://reviews.llvm.org/D106039 --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 56 +++++++++++++++++-- .../Target/AArch64/AArch64ISelLowering.cpp | 31 ++++++++-- .../lib/Target/AArch64/AArch64InstrAtomics.td | 15 +++-- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 20 ++++++- .../AArch64/GlobalISel/arm64-atomic-128.ll | 2 +- .../GlobalISel/legalize-cmpxchg-128.mir | 4 +- llvm/test/CodeGen/AArch64/arm64-atomic-128.ll | 19 ++++--- llvm/test/CodeGen/AArch64/atomicrmw-O0.ll | 18 ++++-- 8 files changed, 136 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 4d303876ace2..f8fa9ff3c068 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -279,21 +279,46 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( Register NewLoReg = MI.getOperand(6).getReg(); Register NewHiReg = MI.getOperand(7).getReg(); + unsigned LdxpOp, StxpOp; + + switch (MI.getOpcode()) { + case AArch64::CMP_SWAP_128_MONOTONIC: + LdxpOp = AArch64::LDXPX; + StxpOp = AArch64::STXPX; + break; + case AArch64::CMP_SWAP_128_RELEASE: + LdxpOp = AArch64::LDXPX; + StxpOp = AArch64::STLXPX; + break; + case AArch64::CMP_SWAP_128_ACQUIRE: + LdxpOp = AArch64::LDAXPX; + StxpOp = AArch64::STXPX; + break; + case AArch64::CMP_SWAP_128: + LdxpOp = AArch64::LDAXPX; + StxpOp = AArch64::STLXPX; + break; + default: + llvm_unreachable("Unexpected opcode"); + } + MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto FailBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); MF->insert(++MBB.getIterator(), LoadCmpBB); MF->insert(++LoadCmpBB->getIterator(), StoreBB); - MF->insert(++StoreBB->getIterator(), DoneBB); + MF->insert(++StoreBB->getIterator(), FailBB); + MF->insert(++FailBB->getIterator(), DoneBB); // .Lloadcmp: // ldaxp xDestLo, xDestHi, [xAddr] // cmp xDestLo, xDesiredLo // sbcs xDestHi, xDesiredHi // b.ne .Ldone - BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX)) + BuildMI(LoadCmpBB, DL, TII->get(LdxpOp)) .addReg(DestLo.getReg(), RegState::Define) .addReg(DestHi.getReg(), RegState::Define) .addReg(AddrReg); @@ -315,23 +340,37 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( .addImm(AArch64CC::EQ); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW)) .addUse(StatusReg, getKillRegState(StatusDead)) - .addMBB(DoneBB); - LoadCmpBB->addSuccessor(DoneBB); + .addMBB(FailBB); + LoadCmpBB->addSuccessor(FailBB); LoadCmpBB->addSuccessor(StoreBB); // .Lstore: // stlxp wStatus, xNewLo, xNewHi, [xAddr] // cbnz wStatus, .Lloadcmp - BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg) + BuildMI(StoreBB, DL, TII->get(StxpOp), StatusReg) .addReg(NewLoReg) .addReg(NewHiReg) .addReg(AddrReg); BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) .addReg(StatusReg, getKillRegState(StatusDead)) .addMBB(LoadCmpBB); + BuildMI(StoreBB, DL, TII->get(AArch64::B)).addMBB(DoneBB); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); + // .Lfail: + // stlxp wStatus, xDestLo, xDestHi, [xAddr] + // cbnz wStatus, .Lloadcmp + BuildMI(FailBB, DL, TII->get(StxpOp), StatusReg) + .addReg(DestLo.getReg()) + .addReg(DestHi.getReg()) + .addReg(AddrReg); + BuildMI(FailBB, DL, TII->get(AArch64::CBNZW)) + .addReg(StatusReg, getKillRegState(StatusDead)) + .addMBB(LoadCmpBB); + FailBB->addSuccessor(LoadCmpBB); + FailBB->addSuccessor(DoneBB); + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); @@ -343,9 +382,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( // Recompute liveness bottom up. LivePhysRegs LiveRegs; computeAndAddLiveIns(LiveRegs, *DoneBB); + computeAndAddLiveIns(LiveRegs, *FailBB); computeAndAddLiveIns(LiveRegs, *StoreBB); computeAndAddLiveIns(LiveRegs, *LoadCmpBB); + // Do an extra pass in the loop to get the loop carried dependencies right. + FailBB->clearLiveIns(); + computeAndAddLiveIns(LiveRegs, *FailBB); StoreBB->clearLiveIns(); computeAndAddLiveIns(LiveRegs, *StoreBB); LoadCmpBB->clearLiveIns(); @@ -1093,6 +1136,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, AArch64_AM::getShifterImm(AArch64_AM::LSL, 0), AArch64::XZR, NextMBBI); case AArch64::CMP_SWAP_128: + case AArch64::CMP_SWAP_128_RELEASE: + case AArch64::CMP_SWAP_128_ACQUIRE: + case AArch64::CMP_SWAP_128_MONOTONIC: return expandCMP_SWAP_128(MBB, MBBI, NextMBBI); case AArch64::AESMCrrTied: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 57138b1e44dc..267f8a5f9d22 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17008,6 +17008,7 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, assert(N->getValueType(0) == MVT::i128 && "AtomicCmpSwap on types less than 128 should be legal"); + MachineMemOperand *MemOp = cast(N)->getMemOperand(); if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) { // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. @@ -17018,8 +17019,6 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, N->getOperand(0), // Chain in }; - MachineMemOperand *MemOp = cast(N)->getMemOperand(); - unsigned Opcode; switch (MemOp->getMergedOrdering()) { case AtomicOrdering::Monotonic: @@ -17056,6 +17055,25 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, return; } + unsigned Opcode; + switch (MemOp->getMergedOrdering()) { + case AtomicOrdering::Monotonic: + Opcode = AArch64::CMP_SWAP_128_MONOTONIC; + break; + case AtomicOrdering::Acquire: + Opcode = AArch64::CMP_SWAP_128_ACQUIRE; + break; + case AtomicOrdering::Release: + Opcode = AArch64::CMP_SWAP_128_RELEASE; + break; + case AtomicOrdering::AcquireRelease: + case AtomicOrdering::SequentiallyConsistent: + Opcode = AArch64::CMP_SWAP_128; + break; + default: + llvm_unreachable("Unexpected ordering!"); + } + auto Desired = splitInt128(N->getOperand(2), DAG); auto New = splitInt128(N->getOperand(3), DAG); SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, @@ -17063,8 +17081,6 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N, SDNode *CmpSwap = DAG.getMachineNode( AArch64::CMP_SWAP_128, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); - - MachineMemOperand *MemOp = cast(N)->getMemOperand(); DAG.setNodeMemRefs(cast(CmpSwap), {MemOp}); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, @@ -17285,6 +17301,13 @@ AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return AtomicExpansionKind::None; + + // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand + // it. + unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits(); + if (Size > 64) + return AtomicExpansionKind::None; + return AtomicExpansionKind::LLSC; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index 27e1d8ee6b98..84573dac7e41 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -429,11 +429,16 @@ def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch), } let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch", - mayLoad = 1, mayStore = 1 in -def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch), - (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, - GPR64:$newLo, GPR64:$newHi), []>, - Sched<[WriteAtomic]>; + mayLoad = 1, mayStore = 1 in { +class cmp_swap_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32common:$scratch), + (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, + GPR64:$newLo, GPR64:$newHi), []>, + Sched<[WriteAtomic]>; +def CMP_SWAP_128 : cmp_swap_128; +def CMP_SWAP_128_RELEASE : cmp_swap_128; +def CMP_SWAP_128_ACQUIRE : cmp_swap_128; +def CMP_SWAP_128_MONOTONIC : cmp_swap_128; +} // v8.1 Atomic instructions: let Predicates = [HasLSE] in { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index d37be5ecf46f..04e2aa87db69 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1210,8 +1210,26 @@ bool AArch64LegalizerInfo::legalizeAtomicCmpxchg128( // The -O0 CMP_SWAP_128 is friendlier to generate code for because LDXP/STXP // can take arbitrary registers so it just has the normal GPR64 operands the // rest of AArch64 is expecting. + auto Ordering = (*MI.memoperands_begin())->getMergedOrdering(); + unsigned Opcode; + switch (Ordering) { + case AtomicOrdering::Acquire: + Opcode = AArch64::CMP_SWAP_128_ACQUIRE; + break; + case AtomicOrdering::Release: + Opcode = AArch64::CMP_SWAP_128_RELEASE; + break; + case AtomicOrdering::AcquireRelease: + case AtomicOrdering::SequentiallyConsistent: + Opcode = AArch64::CMP_SWAP_128; + break; + default: + Opcode = AArch64::CMP_SWAP_128_MONOTONIC; + break; + } + auto Scratch = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - CAS = MIRBuilder.buildInstr(AArch64::CMP_SWAP_128, {DstLo, DstHi, Scratch}, + CAS = MIRBuilder.buildInstr(Opcode, {DstLo, DstHi, Scratch}, {Addr, DesiredI->getOperand(0), DesiredI->getOperand(1), NewI->getOperand(0), NewI->getOperand(1)}); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll index 219d47349d1e..85072688e42b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll @@ -24,7 +24,7 @@ define void @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { ; CHECK-LLSC-O0: cmp [[OLD_HI]], x3 ; CHECK-LLSC-O0: cinc [[EQUAL:w[0-9]+]], [[EQUAL_TMP]], ne ; CHECK-LLSC-O0: cbnz [[EQUAL]], .LBB0_3 -; CHECK-LLSC-O0: stlxp [[STATUS:w[0-9]+]], x4, x5, [x0] +; CHECK-LLSC-O0: stxp [[STATUS:w[0-9]+]], x4, x5, [x0] ; CHECK-LLSC-O0: cbnz [[STATUS]], .LBB0_1 ; CHECK-LLSC-O0: .LBB0_3: ; CHECK-LLSC-O0: mov v[[OLD:[0-9]+]].d[0], [[OLD_LO]] diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir index 5c03edd472f4..f051fe7c604b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir @@ -23,7 +23,7 @@ body: | ; CHECK: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64) ; CHECK: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64) ; CHECK: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64) - ; CHECK: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32 = CMP_SWAP_128 [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire 16) + ; CHECK: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire 16) ; CHECK: [[COPY9:%[0-9]+]]:gpr64 = COPY %16 ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64) ; CHECK: G_STORE [[MV]](s128), [[COPY]](p0) :: (store 16) @@ -39,7 +39,7 @@ body: | ; CHECK-NOLSE: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64) ; CHECK-NOLSE: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64) ; CHECK-NOLSE: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64) - ; CHECK-NOLSE: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32 = CMP_SWAP_128 [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128)) + ; CHECK-NOLSE: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128)) ; CHECK-NOLSE: [[COPY9:%[0-9]+]]:gpr64 = COPY %16 ; CHECK-NOLSE: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64) ; CHECK-NOLSE: G_STORE [[MV]](s128), [[COPY]](p0) :: (store (s128)) diff --git a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll index 2409a3463df4..3495dceff399 100644 --- a/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll +++ b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -8,11 +8,16 @@ define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { ; CHECK-LABEL: val_compare_and_swap: ; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: ; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x[[ADDR:[0-9]+]]] -; CHECK-DAG: eor [[MISMATCH_LO:x[0-9]+]], [[RESULTLO]], x2 -; CHECK-DAG: eor [[MISMATCH_HI:x[0-9]+]], [[RESULTHI]], x3 -; CHECK: orr [[MISMATCH:x[0-9]+]], [[MISMATCH_LO]], [[MISMATCH_HI]] -; CHECK: cbnz [[MISMATCH]], [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]] +; CHECK: cmp +; CHECK: cset +; CHECK: cmp +; CHECK: cinc [[MISMATCH:w[0-9]+]] +; CHECK: cbz [[MISMATCH]], [[EQUAL:.LBB[0-9]+_[0-9]+]] +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[RESULTLO]], [[RESULTHI]], [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] +; CHECK: b [[DONE:.LBB[0-9]+_[0-9]+]] +; CHECK: [[EQUAL]]: +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]] ; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] ; CHECK: [[DONE]]: %pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire @@ -183,8 +188,8 @@ define i128 @atomic_load_relaxed(i64, i64, i128* %p) { ; CHECK-NEXT: stxp [[SUCCESS:w[0-9]+]], [[LO]], [[HI]], [x2] ; CHECK: cbnz [[SUCCESS]], [[LABEL]] ; CHECK-NOT: dmb - %r = load atomic i128, i128* %p monotonic, align 16 - ret i128 %r + %r = load atomic i128, i128* %p monotonic, align 16 + ret i128 %r } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll index d3a1f144b851..0f2be66dc46e 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll @@ -234,7 +234,12 @@ define i128 @test_rmw_add_128(i128* %dst) { ; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 ; NOLSE-NEXT: stlxp w12, x14, x15, [x13] ; NOLSE-NEXT: cbnz w12, .LBB4_2 +; NOLSE-NEXT: b .LBB4_5 ; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2 +; NOLSE-NEXT: stlxp w12, x10, x9, [x13] +; NOLSE-NEXT: cbnz w12, .LBB4_2 +; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1 ; NOLSE-NEXT: eor x11, x9, x11 ; NOLSE-NEXT: eor x8, x10, x8 @@ -244,8 +249,8 @@ define i128 @test_rmw_add_128(i128* %dst) { ; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill ; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill ; NOLSE-NEXT: cbnz x8, .LBB4_1 -; NOLSE-NEXT: b .LBB4_5 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: b .LBB4_6 +; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end ; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload ; NOLSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload ; NOLSE-NEXT: add sp, sp, #48 // =48 @@ -630,7 +635,12 @@ define i128 @test_rmw_nand_128(i128* %dst) { ; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 ; NOLSE-NEXT: stlxp w12, x14, x15, [x13] ; NOLSE-NEXT: cbnz w12, .LBB9_2 +; NOLSE-NEXT: b .LBB9_5 ; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start +; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2 +; NOLSE-NEXT: stlxp w12, x10, x9, [x13] +; NOLSE-NEXT: cbnz w12, .LBB9_2 +; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start ; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1 ; NOLSE-NEXT: eor x11, x9, x11 ; NOLSE-NEXT: eor x8, x10, x8 @@ -640,8 +650,8 @@ define i128 @test_rmw_nand_128(i128* %dst) { ; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill ; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill ; NOLSE-NEXT: cbnz x8, .LBB9_1 -; NOLSE-NEXT: b .LBB9_5 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: b .LBB9_6 +; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end ; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload ; NOLSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload ; NOLSE-NEXT: add sp, sp, #48 // =48