AArch64: use ldp/stp for 128-bit atomic load/store in v8.4 onwards
v8.4 says that normal loads/stores of 128-bit quantities are single-copy atomic if they're properly aligned (which all LLVM atomics are), so we no longer need to do a full RMW operation to guarantee we get a clean read.
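For orientation, here is a minimal IR sketch of the case this change targets, distilled from the tests added below (the function name and exact llc invocation are illustrative, not part of the commit):

; Assumed driver line, mirroring the new tests:
;   llc -mtriple=aarch64-linux-gnu -mattr=+lse2 example.ll -o -
define i128 @clean_read(i128* %p) {
  ; With +lse2 (implied by -mattr=+v8.4a) this aligned atomic load can be
  ; selected as a single ldp of both 64-bit halves instead of an
  ; ldxp/stxp read-modify-write loop.
  ; The ldp/stp immediate must be a multiple of 8 in [-512, 504]
  ; (isShiftedInt<7, 3>); other offsets are materialized with a separate
  ; add/sub first, as the test_nonfolded_* functions below check.
  %v = load atomic i128, i128* %p monotonic, align 16
  ret i128 %v
}

Only unordered and monotonic orderings map to a bare ldp/stp; acquire, release, and seq_cst additionally emit dmb ish fences, as the new tests verify.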
This commit is contained in:
parent 798e4bfbed
commit 13aa102e07
@@ -61,6 +61,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
 def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
   "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
 
+def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true",
+  "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules">;
+
 def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true",
   "Enable out of line atomics to support LSE instructions">;
 
@@ -459,7 +462,7 @@ def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true",
   "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd,
   FeatureNV, FeatureMPAM, FeatureDIT,
   FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI,
-  FeatureFlagM, FeatureRCPC_IMMO]>;
+  FeatureFlagM, FeatureRCPC_IMMO, FeatureLSE2]>;
 
 def HasV8_5aOps : SubtargetFeature<
   "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",
@@ -785,6 +785,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::i128, Custom);
   setOperationAction(ISD::STORE, MVT::i128, Custom);
 
+  // Aligned 128-bit loads and stores are single-copy atomic according to the
+  // v8.4a spec.
+  if (Subtarget->hasLSE2()) {
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+  }
+
   // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
   // custom lowering, as there are no un-paired non-temporal stores and
   // legalization will break up 256 bit inputs.
@@ -4681,18 +4688,7 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
       return Result;
     }
   } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
-    assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
-    SDValue Lo =
-        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
-                    DAG.getConstant(0, Dl, MVT::i64));
-    SDValue Hi =
-        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
-                    DAG.getConstant(1, Dl, MVT::i64));
-    SDValue Result = DAG.getMemIntrinsicNode(
-        AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
-        {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
-        StoreNode->getMemoryVT(), StoreNode->getMemOperand());
-    return Result;
+    return LowerStore128(Op, DAG);
   } else if (MemVT == MVT::i64x8) {
     SDValue Value = StoreNode->getValue();
     assert(Value->getValueType(0) == MVT::i64x8);
@@ -4713,6 +4709,31 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
   return SDValue();
 }
 
+/// Lower atomic or volatile 128-bit stores to a single STP instruction.
+SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MemSDNode *StoreNode = cast<MemSDNode>(Op);
+  assert(StoreNode->getMemoryVT() == MVT::i128);
+  assert(StoreNode->isVolatile() || StoreNode->isAtomic());
+  assert(!StoreNode->isAtomic() ||
+         StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
+         StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
+
+  SDValue Value = StoreNode->getOpcode() == ISD::STORE
+                      ? StoreNode->getOperand(1)
+                      : StoreNode->getOperand(2);
+  SDLoc DL(Op);
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
+                           DAG.getConstant(0, DL, MVT::i64));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
+                           DAG.getConstant(1, DL, MVT::i64));
+  SDValue Result = DAG.getMemIntrinsicNode(
+      AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
+      {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+      StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+  return Result;
+}
+
 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
                                          SelectionDAG &DAG) const {
   SDLoc DL(Op);
@@ -4950,6 +4971,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                            /*OverrideNEON=*/true);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::ATOMIC_STORE:
+    if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
+      assert(Subtarget->hasLSE2());
+      return LowerStore128(Op, DAG);
+    }
+    return SDValue();
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
   case ISD::MSTORE:
@@ -17502,12 +17529,14 @@ void AArch64TargetLowering::ReplaceNodeResults(
   case ISD::ATOMIC_CMP_SWAP:
     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
     return;
+  case ISD::ATOMIC_LOAD:
   case ISD::LOAD: {
     assert(SDValue(N, 0).getValueType() == MVT::i128 &&
            "unexpected load's value type");
-    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
-    if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
-      // Non-volatile loads are optimized later in AArch64's load/store
+    MemSDNode *LoadNode = cast<MemSDNode>(N);
+    if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
+        LoadNode->getMemoryVT() != MVT::i128) {
+      // Non-volatile or atomic loads are optimized later in AArch64's load/store
       // optimizer.
       return;
     }
@@ -17598,12 +17627,37 @@ AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
   return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 
+// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
+// provided the address is 16-byte aligned.
+bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
+  if (!Subtarget->hasLSE2())
+    return false;
+
+  if (auto LI = dyn_cast<LoadInst>(I))
+    return LI->getType()->getPrimitiveSizeInBits() == 128 &&
+           LI->getAlignment() >= 16;
+
+  if (auto SI = dyn_cast<StoreInst>(I))
+    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
+           SI->getAlignment() >= 16;
+
+  return false;
+}
+
+bool AArch64TargetLowering::shouldInsertFencesForAtomic(
+    const Instruction *I) const {
+  return isOpSuitableForLDPSTP(I);
+}
+
 // Loads and stores less than 128-bits are already atomic; ones above that
 // are doomed anyway, so defer to the default libcall and blame the OS when
 // things go wrong.
 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
-  return Size == 128;
+  if (Size != 128)
+    return false;
+
+  return !isOpSuitableForLDPSTP(SI);
 }
 
 // Loads and stores less than 128-bits are already atomic; ones above that
@@ -17612,7 +17666,11 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   unsigned Size = LI->getType()->getPrimitiveSizeInBits();
-  return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
+
+  if (Size != 128 || isOpSuitableForLDPSTP(LI))
+    return AtomicExpansionKind::None;
+
+  return AtomicExpansionKind::LLSC;
 }
 
 // For the real atomic operations, we have ldxr/stxr up to 128 bits,
@@ -660,6 +660,9 @@ public:
 
   void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
 
+  bool isOpSuitableForLDPSTP(const Instruction *I) const;
+  bool shouldInsertFencesForAtomic(const Instruction *I) const override;
+
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
   bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
@@ -863,6 +866,7 @@ private:
 
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerStore128(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
@@ -99,6 +99,7 @@ protected:
   bool HasDotProd = false;
   bool HasCRC = false;
   bool HasLSE = false;
+  bool HasLSE2 = false;
   bool HasRAS = false;
   bool HasRDM = false;
   bool HasPerfMon = false;
@@ -375,6 +376,7 @@ public:
   bool hasDotProd() const { return HasDotProd; }
   bool hasCRC() const { return HasCRC; }
   bool hasLSE() const { return HasLSE; }
+  bool hasLSE2() const { return HasLSE2; }
   bool hasRAS() const { return HasRAS; }
   bool hasRDM() const { return HasRDM; }
   bool hasSM4() const { return HasSM4; }
@@ -824,6 +824,8 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
       return isStore ? AArch64::STRSui : AArch64::LDRSui;
     case 64:
       return isStore ? AArch64::STRDui : AArch64::LDRDui;
+    case 128:
+      return isStore ? AArch64::STRQui : AArch64::LDRQui;
     }
     break;
   }
@@ -16,6 +16,7 @@
 #include "AArch64Subtarget.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -35,6 +36,7 @@ using namespace llvm;
 using namespace LegalizeActions;
 using namespace LegalizeMutations;
 using namespace LegalityPredicates;
+using namespace MIPatternMatch;
 
 AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
     : ST(&ST) {
@@ -278,6 +280,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   };
 
   getActionDefinitionsBuilder(G_LOAD)
+      .customIf([=](const LegalityQuery &Query) {
+        return Query.Types[0] == s128 &&
+               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
+      })
       .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                  {s16, p0, s16, 8},
                                  {s32, p0, s32, 8},
@@ -316,6 +322,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .scalarizeIf(typeIs(0, v2s16), 0);
 
   getActionDefinitionsBuilder(G_STORE)
+      .customIf([=](const LegalityQuery &Query) {
+        return Query.Types[0] == s128 &&
+               Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic;
+      })
      .legalForTypesWithMemDesc({{s8, p0, s8, 8},
                                 {s16, p0, s8, 8}, // truncstorei8 from s16
                                 {s32, p0, s8, 8}, // truncstorei8 from s32
@@ -992,6 +1002,20 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr(
   return true;
 }
 
+static void matchLDPSTPAddrMode(Register Root, Register &Base, int &Offset,
+                                MachineRegisterInfo &MRI) {
+  Base = Root;
+  Offset = 0;
+
+  Register NewBase;
+  int64_t NewOffset;
+  if (mi_match(Root, MRI, m_GPtrAdd(m_Reg(NewBase), m_ICst(NewOffset))) &&
+      isShiftedInt<7, 3>(NewOffset)) {
+    Base = NewBase;
+    Offset = NewOffset;
+  }
+}
+
 // FIXME: This should be removed and replaced with the generic bitcast legalize
 // action.
 bool AArch64LegalizerInfo::legalizeLoadStore(
@@ -1011,6 +1035,36 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
   Register ValReg = MI.getOperand(0).getReg();
   const LLT ValTy = MRI.getType(ValReg);
 
+  if (ValTy == LLT::scalar(128)) {
+    assert((*MI.memoperands_begin())->getSuccessOrdering() ==
+               AtomicOrdering::Monotonic ||
+           (*MI.memoperands_begin())->getSuccessOrdering() ==
+               AtomicOrdering::Unordered);
+    assert(ST->hasLSE2() && "ldp/stp not single copy atomic without +lse2");
+    LLT s64 = LLT::scalar(64);
+    MachineInstrBuilder NewI;
+    if (MI.getOpcode() == TargetOpcode::G_LOAD) {
+      NewI = MIRBuilder.buildInstr(AArch64::LDPXi, {s64, s64}, {});
+      MIRBuilder.buildMerge(ValReg, {NewI->getOperand(0), NewI->getOperand(1)});
+    } else {
+      auto Split = MIRBuilder.buildUnmerge(s64, MI.getOperand(0));
+      NewI = MIRBuilder.buildInstr(
+          AArch64::STPXi, {}, {Split->getOperand(0), Split->getOperand(1)});
+    }
+    Register Base;
+    int Offset;
+    matchLDPSTPAddrMode(MI.getOperand(1).getReg(), Base, Offset, MRI);
+    NewI.addUse(Base);
+    NewI.addImm(Offset / 8);
+
+    NewI.cloneMemRefs(MI);
+    constrainSelectedInstRegOperands(*NewI, *ST->getInstrInfo(),
+                                     *MRI.getTargetRegisterInfo(),
+                                     *ST->getRegBankInfo());
+    MI.eraseFromParent();
+    return true;
+  }
+
   if (!ValTy.isVector() || !ValTy.getElementType().isPointer() ||
       ValTy.getElementType().getAddressSpace() != 0) {
     LLVM_DEBUG(dbgs() << "Tried to do custom legalization on wrong load/store");
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O1
-; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=apple-a13 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1
+; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O1
 ; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-LLSC-O0
-; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mcpu=apple-a13 -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0
+; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-machineinstrs -O0 -mattr=+lse -global-isel -global-isel-abort=1 | FileCheck %s --check-prefix=CHECK-CAS-O0
 @var = global i128 0
 
 define void @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
@@ -411,7 +411,7 @@ define void @atomic_load_relaxed(i64, i64, i128* %p, i128* %p2) {
 ; CHECK-CAS-O0-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-CAS-O0-NEXT:    ldr x11, [sp, #32] // 8-byte Folded Reload
 ; CHECK-CAS-O0-NEXT:    ldxp x9, x10, [x11]
-; CHECK-CAS-O0-NEXT:    mov x8, #0
+; CHECK-CAS-O0-NEXT:    mov x8, xzr
 ; CHECK-CAS-O0-NEXT:    orr x9, x9, x8
 ; CHECK-CAS-O0-NEXT:    orr x10, x8, x10
 ; CHECK-CAS-O0-NEXT:    // implicit-def: $q0
@@ -0,0 +1,212 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+v8.4a %s -o - -global-isel=1 -global-isel-abort=1 | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse2 %s -o - -global-isel=1 -global-isel-abort=1 | FileCheck %s
+
+define void @test_atomic_load(i128* %addr) {
+; CHECK-LABEL: test_atomic_load:
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %res.0 = load atomic i128, i128* %addr monotonic, align 16
+  store i128 %res.0, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %res.1 = load atomic i128, i128* %addr unordered, align 16
+  store i128 %res.1, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: dmb ish
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %res.2 = load atomic i128, i128* %addr acquire, align 16
+  store i128 %res.2, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: dmb ish
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %res.3 = load atomic i128, i128* %addr seq_cst, align 16
+  store i128 %res.3, i128* %addr
+
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #8]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 8
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  %res.5 = load atomic i128, i128* %addr128.1 monotonic, align 16
+  store i128 %res.5, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %addr8.2 = getelementptr i8, i8* %addr8, i32 504
+  %addr128.2 = bitcast i8* %addr8.2 to i128*
+  %res.6 = load atomic i128, i128* %addr128.2 monotonic, align 16
+  store i128 %res.6, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %addr8.3 = getelementptr i8, i8* %addr8, i32 -512
+  %addr128.3 = bitcast i8* %addr8.3 to i128*
+  %res.7 = load atomic i128, i128* %addr128.3 monotonic, align 16
+  store i128 %res.7, i128* %addr
+
+  ret void
+}
+
+define void @test_libcall_load(i128* %addr) {
+; CHECK-LABEL: test_libcall_load:
+; CHECK: bl __atomic_load
+  %res.8 = load atomic i128, i128* %addr unordered, align 8
+  store i128 %res.8, i128* %addr
+
+  ret void
+}
+
+define void @test_nonfolded_load1(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load1:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #4
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 4
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+  store i128 %res.1, i128* %addr
+
+  ret void
+}
+
+define void @test_nonfolded_load2(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load2:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #512
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 512
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+  store i128 %res.1, i128* %addr
+
+  ret void
+}
+
+define void @test_nonfolded_load3(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load3:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
+; CHECK: mov v[[Q]].d[1], [[HI]]
+; CHECK: str q[[Q]], [x0]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 -520
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+  store i128 %res.1, i128* %addr
+
+  ret void
+}
+
+define void @test_atomic_store(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_atomic_store:
+
+; CHECK: stp x2, x3, [x0]
+  store atomic i128 %val, i128* %addr monotonic, align 16
+
+; CHECK: stp x2, x3, [x0]
+  store atomic i128 %val, i128* %addr unordered, align 16
+
+; CHECK: dmb ish
+; CHECK: stp x2, x3, [x0]
+  store atomic i128 %val, i128* %addr release, align 16
+
+; CHECK: dmb ish
+; CHECK: stp x2, x3, [x0]
+; CHECK: dmb ish
+  store atomic i128 %val, i128* %addr seq_cst, align 16
+
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: stp x2, x3, [x0, #8]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 8
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+; CHECK: stp x2, x3, [x0, #504]
+  %addr8.2 = getelementptr i8, i8* %addr8, i32 504
+  %addr128.2 = bitcast i8* %addr8.2 to i128*
+  store atomic i128 %val, i128* %addr128.2 monotonic, align 16
+
+; CHECK: stp x2, x3, [x0, #-512]
+  %addr8.3 = getelementptr i8, i8* %addr8, i32 -512
+  %addr128.3 = bitcast i8* %addr8.3 to i128*
+  store atomic i128 %val, i128* %addr128.3 monotonic, align 16
+
+  ret void
+}
+
+define void @test_libcall_store(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_libcall_store:
+; CHECK: bl __atomic_store
+  store atomic i128 %val, i128* %addr unordered, align 8
+
+  ret void
+}
+
+define void @test_nonfolded_store1(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store1:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #4
+; CHECK: stp x2, x3, [x[[ADDR]]]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 4
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+  ret void
+}
+
+define void @test_nonfolded_store2(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store2:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #512
+; CHECK: stp x2, x3, [x[[ADDR]]]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 512
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+  ret void
+}
+
+define void @test_nonfolded_store3(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store3:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
+; CHECK: stp x2, x3, [x[[ADDR]]]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 -520
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+  ret void
+}
@@ -3,7 +3,7 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+outline-atomics < %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS
 ; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mattr=+lse < %s | FileCheck %s --check-prefix=CHECK-REG
-; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs -mcpu=saphira -mattr=-lse2 < %s | FileCheck %s
 
 ; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
 ; (i.e. reusing a register for status & data in store exclusive).
@@ -0,0 +1,194 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+v8.4a %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+lse2 %s -o - | FileCheck %s
+
+define void @test_atomic_load(i128* %addr) {
+; CHECK-LABEL: test_atomic_load:
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
+  %res.0 = load atomic i128, i128* %addr monotonic, align 16
+  store i128 %res.0, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
+  %res.1 = load atomic i128, i128* %addr unordered, align 16
+  store i128 %res.1, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: dmb ish
+; CHECK: stp [[LO]], [[HI]], [x0]
+  %res.2 = load atomic i128, i128* %addr acquire, align 16
+  store i128 %res.2, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: dmb ish
+; CHECK: stp [[LO]], [[HI]], [x0]
+  %res.3 = load atomic i128, i128* %addr seq_cst, align 16
+  store i128 %res.3, i128* %addr
+
+
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #32]
+; CHECK-DAG: stp [[LO]], [[HI]], [x0]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 32
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  %res.5 = load atomic i128, i128* %addr128.1 monotonic, align 16
+  store i128 %res.5, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504]
+; CHECK: stp [[LO]], [[HI]], [x0]
+  %addr8.2 = getelementptr i8, i8* %addr8, i32 504
+  %addr128.2 = bitcast i8* %addr8.2 to i128*
+  %res.6 = load atomic i128, i128* %addr128.2 monotonic, align 16
+  store i128 %res.6, i128* %addr
+
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512]
+; CHECK: stp [[LO]], [[HI]], [x0]
+  %addr8.3 = getelementptr i8, i8* %addr8, i32 -512
+  %addr128.3 = bitcast i8* %addr8.3 to i128*
+  %res.7 = load atomic i128, i128* %addr128.3 monotonic, align 16
+  store i128 %res.7, i128* %addr
+
+  ret void
+}
+
+define void @test_libcall_load(i128* %addr) {
+; CHECK-LABEL: test_libcall_load:
+; CHECK: bl __atomic_load
+  %res.8 = load atomic i128, i128* %addr unordered, align 8
+  store i128 %res.8, i128* %addr
+
+  ret void
+}
+
+define void @test_nonfolded_load1(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load1:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #4
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: stp [[LO]], [[HI]], [x0]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 4
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+  store i128 %res.1, i128* %addr
+
+  ret void
+}
+
+define void @test_nonfolded_load2(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load2:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #512
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: stp [[LO]], [[HI]], [x0]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 512
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+  store i128 %res.1, i128* %addr
+
+  ret void
+}
+
+define void @test_nonfolded_load3(i128* %addr) {
+; CHECK-LABEL: test_nonfolded_load3:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
+; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
+; CHECK: stp [[LO]], [[HI]], [x0]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 -520
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  %res.1 = load atomic i128, i128* %addr128.1 monotonic, align 16
+  store i128 %res.1, i128* %addr
+
+  ret void
+}
+
+define void @test_atomic_store(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_atomic_store:
+
+; CHECK: stp x2, x3, [x0]
+  store atomic i128 %val, i128* %addr monotonic, align 16
+
+; CHECK: stp x2, x3, [x0]
+  store atomic i128 %val, i128* %addr unordered, align 16
+
+; CHECK: dmb ish
+; CHECK: stp x2, x3, [x0]
+  store atomic i128 %val, i128* %addr release, align 16
+
+; CHECK: dmb ish
+; CHECK: stp x2, x3, [x0]
+; CHECK: dmb ish
+  store atomic i128 %val, i128* %addr seq_cst, align 16
+
+
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: stp x2, x3, [x0, #8]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 8
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+; CHECK: stp x2, x3, [x0, #504]
+  %addr8.2 = getelementptr i8, i8* %addr8, i32 504
+  %addr128.2 = bitcast i8* %addr8.2 to i128*
+  store atomic i128 %val, i128* %addr128.2 monotonic, align 16
+
+; CHECK: stp x2, x3, [x0, #-512]
+  %addr8.3 = getelementptr i8, i8* %addr8, i32 -512
+  %addr128.3 = bitcast i8* %addr8.3 to i128*
+  store atomic i128 %val, i128* %addr128.3 monotonic, align 16
+
+  ret void
+}
+
+define void @test_libcall_store(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_libcall_store:
+; CHECK: bl __atomic_store
+  store atomic i128 %val, i128* %addr unordered, align 8
+
+  ret void
+}
+
+define void @test_nonfolded_store1(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store1:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #4
+; CHECK: stp x2, x3, [x[[ADDR]]]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 4
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+  ret void
+}
+
+define void @test_nonfolded_store2(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store2:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: add x[[ADDR:[0-9]+]], x0, #512
+; CHECK: stp x2, x3, [x[[ADDR]]]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 512
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+  ret void
+}
+
+define void @test_nonfolded_store3(i128* %addr, i128 %val) {
+; CHECK-LABEL: test_nonfolded_store3:
+  %addr8 = bitcast i128* %addr to i8*
+
+; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
+; CHECK: stp x2, x3, [x[[ADDR]]]
+  %addr8.1 = getelementptr i8, i8* %addr8, i32 -520
+  %addr128.1 = bitcast i8* %addr8.1 to i128*
+  store atomic i128 %val, i128* %addr128.1 monotonic, align 16
+
+  ret void
+}