AArch64: use ldp/stp for 128-bit atomic load/store in v8.4 onwards

v8.4 says that normal loads/stores of 128-bit quantities are single-copy atomic if
they're properly aligned (which all LLVM atomics are), so we no longer need to
do a full RMW operation to guarantee we got a clean read.
Tim Northover 2021-09-15 12:20:03 +01:00
parent 798e4bfbed
commit 13aa102e07
10 changed files with 551 additions and 22 deletions
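As a rough illustration of the change described above (a sketch only, mirroring the tests added by this commit; the function names here are invented): with +lse2, which +v8.4a now implies, a properly aligned 128-bit atomic load or store is selected as a single ldp or stp rather than being expanded into an exclusive-load/store loop or compare-and-swap sequence.

define i128 @load_atomic_i128(i128* %p) {
  ; With +lse2 this lowers to a single ldp of two x registers; without it the
  ; i128 load is expanded (see the CHECK-LLSC/CHECK-CAS prefixes in the
  ; existing GlobalISel test further down).
  %v = load atomic i128, i128* %p monotonic, align 16
  ret i128 %v
}

define void @store_atomic_i128(i128* %p, i128 %v) {
  ; With +lse2 this lowers to a single "stp x2, x3, [x0]".
  store atomic i128 %v, i128* %p monotonic, align 16
  ret void
}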


@ -61,6 +61,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
"Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
"Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules">;
def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true",
"Enable out of line atomics to support LSE instructions">;
@ -459,7 +462,7 @@ def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
"Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
FeatureNV, FeatureMPAM, FeatureDIT,
FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
FeatureFlagM, FeatureRCPC_IMMO]>;
FeatureFlagM, FeatureRCPC_IMMO, FeatureLSE2]>;
def HasV8_5aOps : SubtargetFeature<
"v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",


@ -785,6 +785,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
// Aligned 128-bit loads and stores are single-copy atomic according to the
// v8.4a spec.
if (Subtarget->hasLSE2()) {
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
}
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
// custom lowering, as there are no un-paired non-temporal stores and
// legalization will break up 256 bit inputs.
@ -4681,18 +4688,7 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
return Result;
}
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
SDValue Lo =
DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
DAG.getConstant(0, Dl, MVT::i64));
SDValue Hi =
DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
DAG.getConstant(1, Dl, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
return LowerStore128(Op, DAG);
} else if (MemVT == MVT::i64x8) {
SDValue Value = StoreNode->getValue();
assert(Value->getValueType(0) == MVT::i64x8);
@ -4713,6 +4709,31 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
return SDValue();
}
/// Lower atomic or volatile 128-bit stores to a single STP instruction.
SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
SelectionDAG &DAG) const {
MemSDNode *StoreNode = cast<MemSDNode>(Op);
assert(StoreNode->getMemoryVT() == MVT::i128);
assert(StoreNode->isVolatile() || StoreNode->isAtomic());
assert(!StoreNode->isAtomic() ||
StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
SDValue Value = StoreNode->getOpcode() == ISD::STORE
? StoreNode->getOperand(1)
: StoreNode->getOperand(2);
SDLoc DL(Op);
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
DAG.getConstant(0, DL, MVT::i64));
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
DAG.getConstant(1, DL, MVT::i64));
SDValue Result = DAG.getMemIntrinsicNode(
AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
return Result;
}
SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@ -4950,6 +4971,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
/*OverrideNEON=*/true);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::ATOMIC_STORE:
if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
assert(Subtarget->hasLSE2());
return LowerStore128(Op, DAG);
}
return SDValue();
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::MSTORE:
@ -17502,12 +17529,14 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::ATOMIC_CMP_SWAP:
ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
return;
case ISD::ATOMIC_LOAD:
case ISD::LOAD: {
assert(SDValue(N, 0).getValueType() == MVT::i128 &&
"unexpected load's value type");
LoadSDNode *LoadNode = cast<LoadSDNode>(N);
if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
// Non-volatile loads are optimized later in AArch64's load/store
MemSDNode *LoadNode = cast<MemSDNode>(N);
if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
LoadNode->getMemoryVT() != MVT::i128) {
// Non-volatile or atomic loads are optimized later in AArch64's load/store
// optimizer.
return;
}
@ -17598,12 +17627,37 @@ AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
// provided the address is 16-byte aligned.
bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
if (!Subtarget->hasLSE2())
return false;
if (auto LI = dyn_cast<LoadInst>(I))
return LI->getType()->getPrimitiveSizeInBits() == 128 &&
LI->getAlignment() >= 16;
if (auto SI = dyn_cast<StoreInst>(I))
return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
SI->getAlignment() >= 16;
return false;
}
bool AArch64TargetLowering::shouldInsertFencesForAtomic(
const Instruction *I) const {
return isOpSuitableForLDPSTP(I);
}
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
return Size == 128;
if (Size != 128)
return false;
return !isOpSuitableForLDPSTP(SI);
}
// Loads and stores less than 128-bits are already atomic; ones above that
@ -17612,7 +17666,11 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
if (Size != 128 || isOpSuitableForLDPSTP(LI))
return AtomicExpansionKind::None;
return AtomicExpansionKind::LLSC;
}
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
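A hedged sketch of how the new hooks combine for other cases (function names invented; the behaviour mirrors the tests added below): isOpSuitableForLDPSTP also drives shouldInsertFencesForAtomic, so acquire/release/seq_cst 128-bit atomics become a plain ldp/stp bracketed by dmb ish fences instead of an exclusive loop, while i128 atomics with less than 16-byte alignment still end up in the __atomic_* libcalls.

define i128 @load_acquire_i128(i128* %p) {
  ; ldp followed by "dmb ish" (compare the acquire case in the new tests).
  %v = load atomic i128, i128* %p acquire, align 16
  ret i128 %v
}

define void @store_release_i128(i128* %p, i128 %v) {
  ; "dmb ish" followed by stp; a seq_cst store adds a trailing "dmb ish" too.
  store atomic i128 %v, i128* %p release, align 16
  ret void
}

define i128 @load_underaligned_i128(i128* %p) {
  ; Only 8-byte aligned, so isOpSuitableForLDPSTP rejects it; this still
  ; lowers to a call to __atomic_load (compare test_libcall_load below).
  %v = load atomic i128, i128* %p unordered, align 8
  ret i128 %v
}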


@ -660,6 +660,9 @@ public:
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
bool isOpSuitableForLDPSTP(const Instruction *I) const;
bool shouldInsertFencesForAtomic(const Instruction *I) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
@ -863,6 +866,7 @@ private:
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerStore128(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;


@ -99,6 +99,7 @@ protected:
bool HasDotProd = false;
bool HasCRC = false;
bool HasLSE = false;
bool HasLSE2 = false;
bool HasRAS = false;
bool HasRDM = false;
bool HasPerfMon = false;
@ -375,6 +376,7 @@ public:
bool hasDotProd() const { return HasDotProd; }
bool hasCRC() const { return HasCRC; }
bool hasLSE() const { return HasLSE; }
bool hasLSE2() const { return HasLSE2; }
bool hasRAS() const { return HasRAS; }
bool hasRDM() const { return HasRDM; }
bool hasSM4() const { return HasSM4; }


@ -824,6 +824,8 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
return isStore ? AArch64::STRSui : AArch64::LDRSui;
case 64:
return isStore ? AArch64::STRDui : AArch64::LDRDui;
case 128:
return isStore ? AArch64::STRQui : AArch64::LDRQui;
}
break;
}


@ -16,6 +16,7 @@
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineInstr.h"
@ -35,6 +36,7 @@ using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
: ST(&ST) {
@ -278,6 +280,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
};
getActionDefinitionsBuilder(G_LOAD)
.customIf([=](const LegalityQuery &Query) {
return Query.Types[0] == s128 &&
Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
})
.legalForTypesWithMemDesc({{s8, p0, s8, 8},
{s16, p0, s16, 8},
{s32, p0, s32, 8},
@ -316,6 +322,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.scalarizeIf(typeIs(0, v2s16), 0);
getActionDefinitionsBuilder(G_STORE)
.customIf([=](const LegalityQuery &Query) {
return Query.Types[0] == s128 &&
Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
})
.legalForTypesWithMemDesc({{s8, p0, s8, 8},
{s16, p0, s8, 8}, // truncstorei8 from s16
{s32, p0, s8, 8}, // truncstorei8 from s32
@ -992,6 +1002,20 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr(
return true;
}
static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
MachineRegisterInfo &MRI) {
Base = Root;
Offset = 0;
Register NewBase;
int64_t NewOffset;
if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
isShiftedInt<7, 3>(NewOffset)) {
Base = NewBase;
Offset = NewOffset;
}
}
// FIXME: This should be removed and replaced with the generic bitcast legalize
// action.
bool AArch64LegalizerInfo::legalizeLoadStore(
@ -1011,6 +1035,36 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
Register ValReg = MI.getOperand(0).getReg();
const LLT ValTy = MRI.getType(ValReg);
if (ValTy == LLT::scalar(128)) {
assert((*MI.memoperands_begin())->getSuccessOrdering() ==
AtomicOrdering::Monotonic ||
(*MI.memoperands_begin())->getSuccessOrdering() ==
AtomicOrdering::Unordered);
assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
LLT s64 = LLT::scalar(64);
MachineInstrBuilder NewI;
if (MI.getOpcode() == TargetOpcode::G_LOAD) {
NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
} else {
auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
NewI = MIRBuilder.buildInstr(
AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
}
Register Base;
int Offset;
matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
NewI.addUse(Base);
NewI.addImm(Offset / 8);
NewI.cloneMemRefs(MI);
constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
*MRI.getTargetRegisterInfo(),
*ST->getRegBankInfo());
MI.eraseFromParent();
return true;
}
if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
ValTy.getElementType().getAddressSpace() != 0) {
LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
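For the address-mode matching above: isShiftedInt<7, 3> accepts byte offsets that are multiples of 8 in [-512, 504], the signed scaled-immediate range of 64-bit ldp/stp, so only such offsets are folded into the instruction. A small sketch with invented function names, mirroring the folded/non-folded cases in the tests below:

define void @store_folded(i128* %p, i128 %v) {
  ; Offset 504 is in range, so it folds: "stp x2, x3, [x0, #504]".
  %p8 = bitcast i128* %p to i8*
  %q8 = getelementptr i8, i8* %p8, i32 504
  %q = bitcast i8* %q8 to i128*
  store atomic i128 %v, i128* %q monotonic, align 16
  ret void
}

define void @store_not_folded(i128* %p, i128 %v) {
  ; Offset 512 is out of range, so the address is materialised with an add
  ; first and the stp uses a zero offset.
  %p8 = bitcast i128* %p to i8*
  %q8 = getelementptr i8, i8* %p8, i32 512
  %q = bitcast i8* %q8 to i128*
  store atomic i128 %v, i128* %q monotonic, align 16
  ret void
}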


@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O1
; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=apple-a13 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1
; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1
; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O0
; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mcpu=apple-a13 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0
; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0
@var = global i128 0
define void @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
@ -411,7 +411,7 @@ define void @atomic_load_relaxed(i64, i64, i128* %p, i128* %p2) {
; CHECK-CAS-O0-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-CAS-O0-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
; CHECK-CAS-O0-NEXT: ldxp x9, x10, [x11]
; CHECK-CAS-O0-NEXT: mov x8, #0
; CHECK-CAS-O0-NEXT: mov x8, xzr
; CHECK-CAS-O0-NEXT: orr x9, x9, x8
; CHECK-CAS-O0-NEXT: orr x10, x8, x10
; CHECK-CAS-O0-NEXT: // implicit-def: $q0


@ -0,0 +1,212 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+v8.4a %s -o - -global-isel=1 -global-isel-abort=1 | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse2 %s -o - -global-isel=1 -global-isel-abort=1 | FileCheck %s
define void @test_atomic_load(i128* %addr) {
; CHECK-LABEL: test_atomic_load:
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%res.0 = load atomic i128, i128* %addr monotonic, align 16
store i128 %res.0, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%res.1 = load atomic i128, i128* %addr unordered, align 16
store i128 %res.1, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
; CHECK: dmb ish
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%res.2 = load atomic i128, i128* %addr acquire, align 16
store i128 %res.2, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
; CHECK: dmb ish
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%res.3 = load atomic i128, i128* %addr seq_cst, align 16
store i128 %res.3, i128* %addr
%addr8 = bitcast i128* %addr to i8*
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #8]
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%addr8.1 = getelementptr i8, i8* %addr8, i32 8
%addr128.1 = bitcast i8* %addr8.1 to i128*
%res.5 = load atomic i128, i128* %addr128.1 monotonic, align 16
store i128 %res.5, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504]
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%addr8.2 = getelementptr i8, i8* %addr8, i32 504
%addr128.2 = bitcast i8* %addr8.2 to i128*
%res.6 = load atomic i128, i128* %addr128.2 monotonic, align 16
store i128 %res.6, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512]
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%addr8.3 = getelementptr i8, i8* %addr8, i32 -512
%addr128.3 = bitcast i8* %addr8.3 to i128*
%res.7 = load atomic i128, i128* %addr128.3 monotonic, align 16
store i128 %res.7, i128* %addr
ret void
}
define void @test_libcall_load(i128* %addr) {
; CHECK-LABEL: test_libcall_load:
; CHECK: bl __atomic_load
%res.8 = load atomic i128, i128* %addr unordered, align 8
store i128 %res.8, i128* %addr
ret void
}
define void @test_nonfolded_load1(i128* %addr) {
; CHECK-LABEL: test_nonfolded_load1:
%addr8 = bitcast i128* %addr to i8*
; CHECK: add x[[ADDR:[0-9]+]], x0, #4
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%addr8.1 = getelementptr i8, i8* %addr8, i32 4
%addr128.1 = bitcast i8* %addr8.1 to i128*
%res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
store i128 %res.1, i128* %addr
ret void
}
define void @test_nonfolded_load2(i128* %addr) {
; CHECK-LABEL: test_nonfolded_load2:
%addr8 = bitcast i128* %addr to i8*
; CHECK: add x[[ADDR:[0-9]+]], x0, #512
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%addr8.1 = getelementptr i8, i8* %addr8, i32 512
%addr128.1 = bitcast i8* %addr8.1 to i128*
%res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
store i128 %res.1, i128* %addr
ret void
}
define void @test_nonfolded_load3(i128* %addr) {
; CHECK-LABEL: test_nonfolded_load3:
%addr8 = bitcast i128* %addr to i8*
; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
; CHECK: mov v[[Q]].d[1], [[HI]]
; CHECK: str q[[Q]], [x0]
%addr8.1 = getelementptr i8, i8* %addr8, i32 -520
%addr128.1 = bitcast i8* %addr8.1 to i128*
%res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
store i128 %res.1, i128* %addr
ret void
}
define void @test_atomic_store(i128* %addr, i128 %val) {
; CHECK-LABEL: test_atomic_store:
; CHECK: stp x2, x3, [x0]
store atomic i128 %val, i128* %addr monotonic, align 16
; CHECK: stp x2, x3, [x0]
store atomic i128 %val, i128* %addr unordered, align 16
; CHECK: dmb ish
; CHECK: stp x2, x3, [x0]
store atomic i128 %val, i128* %addr release, align 16
; CHECK: dmb ish
; CHECK: stp x2, x3, [x0]
; CHECK: dmb ish
store atomic i128 %val, i128* %addr seq_cst, align 16
%addr8 = bitcast i128* %addr to i8*
; CHECK: stp x2, x3, [x0, #8]
%addr8.1 = getelementptr i8, i8* %addr8, i32 8
%addr128.1 = bitcast i8* %addr8.1 to i128*
store atomic i128 %val, i128* %addr128.1 monotonic, align 16
; CHECK: stp x2, x3, [x0, #504]
%addr8.2 = getelementptr i8, i8* %addr8, i32 504
%addr128.2 = bitcast i8* %addr8.2 to i128*
store atomic i128 %val, i128* %addr128.2 monotonic, align 16
; CHECK: stp x2, x3, [x0, #-512]
%addr8.3 = getelementptr i8, i8* %addr8, i32 -512
%addr128.3 = bitcast i8* %addr8.3 to i128*
store atomic i128 %val, i128* %addr128.3 monotonic, align 16
ret void
}
define void @test_libcall_store(i128* %addr, i128 %val) {
; CHECK-LABEL: test_libcall_store:
; CHECK: bl __atomic_store
store atomic i128 %val, i128* %addr unordered, align 8
ret void
}
define void @test_nonfolded_store1(i128* %addr, i128 %val) {
; CHECK-LABEL: test_nonfolded_store1:
%addr8 = bitcast i128* %addr to i8*
; CHECK: add x[[ADDR:[0-9]+]], x0, #4
; CHECK: stp x2, x3, [x[[ADDR]]]
%addr8.1 = getelementptr i8, i8* %addr8, i32 4
%addr128.1 = bitcast i8* %addr8.1 to i128*
store atomic i128 %val, i128* %addr128.1 monotonic, align 16
ret void
}
define void @test_nonfolded_store2(i128* %addr, i128 %val) {
; CHECK-LABEL: test_nonfolded_store2:
%addr8 = bitcast i128* %addr to i8*
; CHECK: add x[[ADDR:[0-9]+]], x0, #512
; CHECK: stp x2, x3, [x[[ADDR]]]
%addr8.1 = getelementptr i8, i8* %addr8, i32 512
%addr128.1 = bitcast i8* %addr8.1 to i128*
store atomic i128 %val, i128* %addr128.1 monotonic, align 16
ret void
}
define void @test_nonfolded_store3(i128* %addr, i128 %val) {
; CHECK-LABEL: test_nonfolded_store3:
%addr8 = bitcast i128* %addr to i8*
; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
; CHECK: stp x2, x3, [x[[ADDR]]]
%addr8.1 = getelementptr i8, i8* %addr8, i32 -520
%addr128.1 = bitcast i8* %addr8.1 to i128*
store atomic i128 %val, i128* %addr128.1 monotonic, align 16
ret void
}


@ -3,7 +3,7 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s --check-prefix=CHECK-REG
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira -mattr=-lse2 < %s | FileCheck %s
; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
; (i.e. reusing a register for status & data in store exclusive).


@ -0,0 +1,194 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+v8.4a %s -o - | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse2 %s -o - | FileCheck %s
define void @test_atomic_load(i128* %addr) {
; CHECK-LABEL: test_atomic_load:
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
; CHECK: stp [[LO]], [[HI]], [x0]
%res.0 = load atomic i128, i128* %addr monotonic, align 16
store i128 %res.0, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
; CHECK: stp [[LO]], [[HI]], [x0]
%res.1 = load atomic i128, i128* %addr unordered, align 16
store i128 %res.1, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
; CHECK: dmb ish
; CHECK: stp [[LO]], [[HI]], [x0]
%res.2 = load atomic i128, i128* %addr acquire, align 16
store i128 %res.2, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
; CHECK: dmb ish
; CHECK: stp [[LO]], [[HI]], [x0]
%res.3 = load atomic i128, i128* %addr seq_cst, align 16
store i128 %res.3, i128* %addr
%addr8 = bitcast i128* %addr to i8*
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #32]
; CHECK-DAG: stp [[LO]], [[HI]], [x0]
%addr8.1 = getelementptr i8, i8* %addr8, i32 32
%addr128.1 = bitcast i8* %addr8.1 to i128*
%res.5 = load atomic i128, i128* %addr128.1 monotonic, align 16
store i128 %res.5, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504]
; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.2 = getelementptr i8, i8* %addr8, i32 504
%addr128.2 = bitcast i8* %addr8.2 to i128*
%res.6 = load atomic i128, i128* %addr128.2 monotonic, align 16
store i128 %res.6, i128* %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512]
; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.3 = getelementptr i8, i8* %addr8, i32 -512
%addr128.3 = bitcast i8* %addr8.3 to i128*
%res.7 = load atomic i128, i128* %addr128.3 monotonic, align 16
store i128 %res.7, i128* %addr
ret void
}
define void @test_libcall_load(i128* %addr) {
; CHECK-LABEL: test_libcall_load:
; CHECK: bl __atomic_load
%res.8 = load atomic i128, i128* %addr unordered, align 8
store i128 %res.8, i128* %addr
ret void
}
define void @test_nonfolded_load1(i128* %addr) {
; CHECK-LABEL: test_nonfolded_load1:
%addr8 = bitcast i128* %addr to i8*
; CHECK: add x[[ADDR:[0-9]+]], x0, #4
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.1 = getelementptr i8, i8* %addr8, i32 4
%addr128.1 = bitcast i8* %addr8.1 to i128*
%res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
store i128 %res.1, i128* %addr
ret void
}
define void @test_nonfolded_load2(i128* %addr) {
; CHECK-LABEL: test_nonfolded_load2:
%addr8 = bitcast i128* %addr to i8*
; CHECK: add x[[ADDR:[0-9]+]], x0, #512
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.1 = getelementptr i8, i8* %addr8, i32 512
%addr128.1 = bitcast i8* %addr8.1 to i128*
%res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
store i128 %res.1, i128* %addr
ret void
}
define void @test_nonfolded_load3(i128* %addr) {
; CHECK-LABEL: test_nonfolded_load3:
%addr8 = bitcast i128* %addr to i8*
; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.1 = getelementptr i8, i8* %addr8, i32 -520
%addr128.1 = bitcast i8* %addr8.1 to i128*
%res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
store i128 %res.1, i128* %addr
ret void
}
define void @test_atomic_store(i128* %addr, i128 %val) {
; CHECK-LABEL: test_atomic_store:
; CHECK: stp x2, x3, [x0]
store atomic i128 %val, i128* %addr monotonic, align 16
; CHECK: stp x2, x3, [x0]
store atomic i128 %val, i128* %addr unordered, align 16
; CHECK: dmb ish
; CHECK: stp x2, x3, [x0]
store atomic i128 %val, i128* %addr release, align 16
; CHECK: dmb ish
; CHECK: stp x2, x3, [x0]
; CHECK: dmb ish
store atomic i128 %val, i128* %addr seq_cst, align 16
%addr8 = bitcast i128* %addr to i8*
; CHECK: stp x2, x3, [x0, #8]
%addr8.1 = getelementptr i8, i8* %addr8, i32 8
%addr128.1 = bitcast i8* %addr8.1 to i128*
store atomic i128 %val, i128* %addr128.1 monotonic, align 16
; CHECK: stp x2, x3, [x0, #504]
%addr8.2 = getelementptr i8, i8* %addr8, i32 504
%addr128.2 = bitcast i8* %addr8.2 to i128*
store atomic i128 %val, i128* %addr128.2 monotonic, align 16
; CHECK: stp x2, x3, [x0, #-512]
%addr8.3 = getelementptr i8, i8* %addr8, i32 -512
%addr128.3 = bitcast i8* %addr8.3 to i128*
store atomic i128 %val, i128* %addr128.3 monotonic, align 16
ret void
}
define void @test_libcall_store(i128* %addr, i128 %val) {
; CHECK-LABEL: test_libcall_store:
; CHECK: bl __atomic_store
store atomic i128 %val, i128* %addr unordered, align 8
ret void
}
define void @test_nonfolded_store1(i128* %addr, i128 %val) {
; CHECK-LABEL: test_nonfolded_store1:
%addr8 = bitcast i128* %addr to i8*
; CHECK: add x[[ADDR:[0-9]+]], x0, #4
; CHECK: stp x2, x3, [x[[ADDR]]]
%addr8.1 = getelementptr i8, i8* %addr8, i32 4
%addr128.1 = bitcast i8* %addr8.1 to i128*
store atomic i128 %val, i128* %addr128.1 monotonic, align 16
ret void
}
define void @test_nonfolded_store2(i128* %addr, i128 %val) {
; CHECK-LABEL: test_nonfolded_store2:
%addr8 = bitcast i128* %addr to i8*
; CHECK: add x[[ADDR:[0-9]+]], x0, #512
; CHECK: stp x2, x3, [x[[ADDR]]]
%addr8.1 = getelementptr i8, i8* %addr8, i32 512
%addr128.1 = bitcast i8* %addr8.1 to i128*
store atomic i128 %val, i128* %addr128.1 monotonic, align 16
ret void
}
define void @test_nonfolded_store3(i128* %addr, i128 %val) {
; CHECK-LABEL: test_nonfolded_store3:
%addr8 = bitcast i128* %addr to i8*
; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
; CHECK: stp x2, x3, [x[[ADDR]]]
%addr8.1 = getelementptr i8, i8* %addr8, i32 -520
%addr128.1 = bitcast i8* %addr8.1 to i128*
store atomic i128 %val, i128* %addr128.1 monotonic, align 16
ret void
}