[AArch64] Improve codegen of volatile load/store of i128
Summary:
Instead of generating two i64 instructions for each load or store of a
volatile i128 value (two LDRs or STRs), now emit a single LDP or STP.

Reviewers: labrinea, t.p.northover, efriedma

Reviewed By: efriedma

Subscribers: kristof.beyls, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D69559
parent 97ca7c2cc9
commit 364b8f5fbe
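As an illustration of the change (editor's example, mirroring test1 in the new test file at the end of this diff): for a volatile i128 copy such as

  %tmp = load volatile i128, i128* @x
  store volatile i128 %tmp, i128* @y

the backend previously emitted each access as two 64-bit halves (two LDRs and two STRs); it now emits a single paired access for each, e.g. ldp x8, x9, [x8] followed by stp x8, x9, [x10].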
@@ -516,6 +516,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
 
+  // 128-bit loads and stores can be done without expanding
+  setOperationAction(ISD::LOAD, MVT::i128, Custom);
+  setOperationAction(ISD::STORE, MVT::i128, Custom);
+
   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
   // This requires the Performance Monitors extension.
   if (Subtarget->hasPerfMon())
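(Commentary, not part of the diff: marking ISD::LOAD and ISD::STORE as Custom for MVT::i128 is what routes volatile i128 stores into LowerSTORE below, and i128 load results into ReplaceNodeResults, where the new LDP/STP nodes are built.)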
@@ -1364,6 +1368,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
   case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
   case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
+  case AArch64ISD::LDP: return "AArch64ISD::LDP";
+  case AArch64ISD::STP: return "AArch64ISD::STP";
   }
   return nullptr;
 }
@@ -2988,7 +2994,7 @@ static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
 
 // Custom lowering for any store, vector or scalar and/or default or with
 // a truncate operations. Currently only custom lower truncate operation
-// from vector v4i16 to v4i8.
+// from vector v4i16 to v4i8 or volatile stores of i128.
 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
                                           SelectionDAG &DAG) const {
   SDLoc Dl(Op);
@@ -3000,18 +3006,32 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
   EVT VT = Value.getValueType();
   EVT MemVT = StoreNode->getMemoryVT();
 
-  assert (VT.isVector() && "Can only custom lower vector store types");
-
-  unsigned AS = StoreNode->getAddressSpace();
-  unsigned Align = StoreNode->getAlignment();
-  if (Align < MemVT.getStoreSize() &&
-      !allowsMisalignedMemoryAccesses(MemVT, AS, Align,
-                                      StoreNode->getMemOperand()->getFlags(),
-                                      nullptr)) {
-    return scalarizeVectorStore(StoreNode, DAG);
-  }
-
-  if (StoreNode->isTruncatingStore()) {
-    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  if (VT.isVector()) {
+    unsigned AS = StoreNode->getAddressSpace();
+    unsigned Align = StoreNode->getAlignment();
+    if (Align < MemVT.getStoreSize() &&
+        !allowsMisalignedMemoryAccesses(
+            MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
+      return scalarizeVectorStore(StoreNode, DAG);
+    }
+
+    if (StoreNode->isTruncatingStore()) {
+      return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+    }
+  } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
+    assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
+    SDValue Lo =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
+                    DAG.getConstant(0, Dl, MVT::i64));
+    SDValue Hi =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
+                    DAG.getConstant(1, Dl, MVT::i64));
+    SDValue Result = DAG.getMemIntrinsicNode(
+        AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
+        {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+        StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+    return Result;
   }
 
   return SDValue();
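Commentary on the store path above: ISD::EXTRACT_ELEMENT with constant index 0 or 1 selects the low or high i64 half of the i128 operand, so the STP node is fed (lo, hi) in that order. A minimal standalone sketch of the same split, assuming a compiler with the GCC/Clang unsigned __int128 extension:

#include <cstdint>
#include <cstdio>

int main() {
  // 128-bit value to be stored: high half 0x1122...., low half 0x99aa....
  unsigned __int128 v =
      ((unsigned __int128)0x1122334455667788ULL << 64) | 0x99aabbccddeeff00ULL;
  uint64_t lo = (uint64_t)v;         // EXTRACT_ELEMENT, index 0
  uint64_t hi = (uint64_t)(v >> 64); // EXTRACT_ELEMENT, index 1
  // An STP of (lo, hi) writes lo to [addr] and hi to [addr, #8].
  std::printf("lo=%016llx hi=%016llx\n",
              (unsigned long long)lo, (unsigned long long)hi);
  return 0;
}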
@@ -12689,6 +12709,27 @@ void AArch64TargetLowering::ReplaceNodeResults(
   case ISD::ATOMIC_CMP_SWAP:
     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
     return;
+  case ISD::LOAD: {
+    assert(SDValue(N, 0).getValueType() == MVT::i128 &&
+           "unexpected load's value type");
+    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
+    if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
+      // Non-volatile loads are optimized later in AArch64's load/store
+      // optimizer.
+      return;
+    }
+
+    SDValue Result = DAG.getMemIntrinsicNode(
+        AArch64ISD::LDP, SDLoc(N),
+        DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
+        {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
+        LoadNode->getMemOperand());
+
+    SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
+                               Result.getValue(0), Result.getValue(1));
+    Results.append({Pair, Result.getValue(2) /* Chain */});
+    return;
+  }
   }
 }
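Commentary on the load path: the LDP node produces two i64 results plus a chain, and ISD::BUILD_PAIR reassembles them into the original i128 value (operand 0 is the low half). The scalar equivalent, under the same __int128 assumption as the sketch above:

#include <cstdint>

// Hypothetical helper mirroring BUILD_PAIR(lo, hi) -> i128.
static inline unsigned __int128 build_pair(uint64_t lo, uint64_t hi) {
  return ((unsigned __int128)hi << 64) | lo;
}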
@@ -260,8 +260,10 @@ enum NodeType : unsigned {
   STG,
   STZG,
   ST2G,
-  STZ2G
+  STZ2G,
+
+  LDP,
+  STP
 };
 
 } // end namespace AArch64ISD
@@ -243,6 +243,9 @@ def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
 def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
                                                    SDTCisPtrTy<1>]>;
 
+def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+
 // Generates the general dynamic sequences, i.e.
 //  adrp  x0, :tlsdesc:var
 //  ldr   x1, [x0, #:tlsdesc_lo12:var]
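(Commentary: SDTypeProfile<2, 1, ...> declares the ldp node with two results and one operand, i.e. two i64 data results and a pointer; SDTypeProfile<0, 3, ...> declares stp with no results and three operands, the two i64 data values plus the address. The chain is not counted here; it is added by the SDNPHasChain flag on the SDNode definitions below.)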
@@ -535,6 +538,9 @@ def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>;
 def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
 def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
 
+def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
@@ -1987,6 +1993,9 @@ defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32Op, simm7s4, "ldnp">;
 defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">;
 defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;
 
+def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+          (LDPXi GPR64sp:$Rn, simm7s8:$offset)>;
+
 //---
 // (register offset)
 //---
@@ -2680,6 +2689,9 @@ defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">;
 defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
 defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
 
+def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+          (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
+
 //---
 // (Register offset)
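Commentary before the test updates: am_indexed7s64/simm7s8 encode the LDP/STP immediate as a signed 7-bit field scaled by 8, so only 8-byte-aligned offsets in [-512, 504] fold into the instruction; anything else (512, -520, or the unaligned 503 in the new tests below) forces the address to be materialized with a separate add/sub first. A hypothetical predicate capturing that constraint:

// Hypothetical check mirroring the simm7s8 immediate constraint:
// a signed 7-bit field scaled by 8, i.e. multiples of 8 in [-512, 504].
bool isLegalLdpStpImm(long long Offset) {
  return Offset % 8 == 0 && Offset >= -512 && Offset <= 504;
}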
@@ -87,10 +87,8 @@ define { i128, i1 } @test_cmpxchg_128(i128* %addr, i128 %desired, i128 %new) nou
 define {i128, i1} @test_cmpxchg_128_unsplit(i128* %addr) {
 ; CHECK-LABEL: test_cmpxchg_128_unsplit:
 ; CHECK: add x[[VAR128:[0-9]+]], {{x[0-9]+}}, :lo12:var128
-; CHECK: ldr [[DESIRED_HI:x[0-9]+]], [x[[VAR128]], #8]
-; CHECK: ldr [[DESIRED_LO:x[0-9]+]], [x[[VAR128]]]
-; CHECK: ldr [[NEW_HI:x[0-9]+]], [x[[VAR128]], #8]
-; CHECK: ldr [[NEW_LO:x[0-9]+]], [x[[VAR128]]]
+; CHECK: ldp [[DESIRED_LO:x[0-9]+]], [[DESIRED_HI:x[0-9]+]], [x[[VAR128]]]
+; CHECK: ldp [[NEW_LO:x[0-9]+]], [[NEW_HI:x[0-9]+]], [x[[VAR128]]]
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
 ; CHECK: ldaxp [[OLD_LO:x[0-9]+]], [[OLD_HI:x[0-9]+]], [x0]
 ; CHECK: cmp [[OLD_LO]], [[DESIRED_LO]]
@@ -0,0 +1,117 @@
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s
+
+@x = common dso_local global i128 0
+@y = common dso_local global i128 0
+
+define void @test1() {
+; CHECK-LABEL: test1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, x
+; CHECK-NEXT:    add x8, x8, :lo12:x
+; CHECK-NEXT:    ldp x8, x9, [x8]
+; CHECK-NEXT:    adrp x10, y
+; CHECK-NEXT:    add x10, x10, :lo12:y
+; CHECK-NEXT:    stp x8, x9, [x10]
+; CHECK-NEXT:    ret
+  %tmp = load volatile i128, i128* @x
+  store volatile i128 %tmp, i128* @y
+  ret void
+}
+
+define void @test2() {
+; CHECK-LABEL: test2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, x
+; CHECK-NEXT:    add x8, x8, :lo12:x
+; CHECK-NEXT:    ldp x8, x9, [x8, #504]
+; CHECK-NEXT:    adrp x10, y
+; CHECK-NEXT:    add x10, x10, :lo12:y
+; CHECK-NEXT:    stp x8, x9, [x10, #504]
+; CHECK-NEXT:    ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 504) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 504) to i128*)
+  ret void
+}
+
+define void @test3() {
+; CHECK-LABEL: test3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, x
+; CHECK-NEXT:    add x8, x8, :lo12:x
+; CHECK-NEXT:    add x8, x8, #512 // =512
+; CHECK-NEXT:    ldp x8, x9, [x8]
+; CHECK-NEXT:    adrp x10, y
+; CHECK-NEXT:    add x10, x10, :lo12:y
+; CHECK-NEXT:    add x10, x10, #512 // =512
+; CHECK-NEXT:    stp x8, x9, [x10]
+; CHECK-NEXT:    ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 512) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 512) to i128*)
+  ret void
+}
+
+define void @test4() {
+; CHECK-LABEL: test4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, x
+; CHECK-NEXT:    add x8, x8, :lo12:x
+; CHECK-NEXT:    ldp x8, x9, [x8, #-512]
+; CHECK-NEXT:    adrp x10, y
+; CHECK-NEXT:    add x10, x10, :lo12:y
+; CHECK-NEXT:    stp x8, x9, [x10, #-512]
+; CHECK-NEXT:    ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -512) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 -512) to i128*)
+  ret void
+}
+
+define void @test5() {
+; CHECK-LABEL: test5:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, x
+; CHECK-NEXT:    add x8, x8, :lo12:x
+; CHECK-NEXT:    sub x8, x8, #520 // =520
+; CHECK-NEXT:    ldp x8, x9, [x8]
+; CHECK-NEXT:    adrp x10, y
+; CHECK-NEXT:    add x10, x10, :lo12:y
+; CHECK-NEXT:    sub x10, x10, #520 // =520
+; CHECK-NEXT:    stp x8, x9, [x10]
+; CHECK-NEXT:    ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -520) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 -520) to i128*)
+  ret void
+}
+
+define void @test6() {
+; CHECK-LABEL: test6:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, x
+; CHECK-NEXT:    add x8, x8, :lo12:x
+; CHECK-NEXT:    sub x8, x8, #520 // =520
+; CHECK-NEXT:    ldp x8, x9, [x8]
+; CHECK-NEXT:    adrp x10, y
+; CHECK-NEXT:    add x10, x10, :lo12:y
+; CHECK-NEXT:    sub x10, x10, #520 // =520
+; CHECK-NEXT:    stp x8, x9, [x10]
+; CHECK-NEXT:    ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 -520) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 -520) to i128*)
+  ret void
+}
+
+define void @test7() {
+; CHECK-LABEL: test7:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, x
+; CHECK-NEXT:    add x8, x8, :lo12:x
+; CHECK-NEXT:    add x8, x8, #503 // =503
+; CHECK-NEXT:    ldp x8, x9, [x8]
+; CHECK-NEXT:    adrp x10, y
+; CHECK-NEXT:    add x10, x10, :lo12:y
+; CHECK-NEXT:    add x10, x10, #503 // =503
+; CHECK-NEXT:    stp x8, x9, [x10]
+; CHECK-NEXT:    ret
+  %tmp = load volatile i128, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @x to i8*), i64 503) to i128*)
+  store volatile i128 %tmp, i128* bitcast (i8* getelementptr (i8, i8* bitcast (i128* @y to i8*), i64 503) to i128*)
+  ret void
+}