forked from OSchip/llvm-project
[AArch64] Add support for 256-bit non-temporal loads
Currently all non-temporal loads are mapped to `LDP` or `LDR`. This patch maps all 256-bit non-temporal loads to `LDNP`. Future patches should address other non-temporal loads. Reviewed By: fhahn, dmgreen Differential Revision: https://reviews.llvm.org/D131773
This commit is contained in:
parent
672311bd77
commit
7155ed4289
|
@ -792,6 +792,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
|
|||
setOperationAction(ISD::STORE, MVT::v4f64, Custom);
|
||||
setOperationAction(ISD::STORE, MVT::v4i64, Custom);
|
||||
|
||||
// 256 bit non-temporal loads can be lowered to LDNP. This is done using
|
||||
// custom lowering, as there are no un-paired non-temporal loads and legalization
|
||||
// will break up 256 bit inputs.
|
||||
setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
|
||||
setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
|
||||
setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
|
||||
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
|
||||
setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
|
||||
setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
|
||||
setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
|
||||
|
||||
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
|
||||
// This requires the Performance Monitors extension.
|
||||
if (Subtarget->hasPerfMon())
|
||||
|
@ -2314,6 +2325,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
MAKE_CASE(AArch64ISD::SSTNT1_PRED)
|
||||
MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
|
||||
MAKE_CASE(AArch64ISD::LDP)
|
||||
MAKE_CASE(AArch64ISD::LDNP)
|
||||
MAKE_CASE(AArch64ISD::STP)
|
||||
MAKE_CASE(AArch64ISD::STNP)
|
||||
MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
|
||||
|
@ -20406,9 +20418,29 @@ void AArch64TargetLowering::ReplaceNodeResults(
|
|||
return;
|
||||
case ISD::ATOMIC_LOAD:
|
||||
case ISD::LOAD: {
|
||||
assert(SDValue(N, 0).getValueType() == MVT::i128 &&
|
||||
"unexpected load's value type");
|
||||
MemSDNode *LoadNode = cast<MemSDNode>(N);
|
||||
EVT MemVT = LoadNode->getMemoryVT();
|
||||
// Handle lowering 256 bit non temporal loads into LDNP.
|
||||
if (LoadNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
|
||||
(MemVT.getScalarSizeInBits() == 8u ||
|
||||
MemVT.getScalarSizeInBits() == 16u ||
|
||||
MemVT.getScalarSizeInBits() == 32u ||
|
||||
MemVT.getScalarSizeInBits() == 64u)) {
|
||||
|
||||
SDValue Result = DAG.getMemIntrinsicNode(
|
||||
AArch64ISD::LDNP, SDLoc(N),
|
||||
DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
|
||||
MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
|
||||
MVT::Other}),
|
||||
{LoadNode->getChain(), LoadNode->getBasePtr()},
|
||||
LoadNode->getMemoryVT(), LoadNode->getMemOperand());
|
||||
|
||||
SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
|
||||
Result.getValue(0), Result.getValue(1));
|
||||
Results.append({Pair, Result.getValue(2) /* Chain */});
|
||||
return;
|
||||
}
|
||||
|
||||
if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
|
||||
LoadNode->getMemoryVT() != MVT::i128) {
|
||||
// Non-volatile or atomic loads are optimized later in AArch64's load/store
|
||||
|
@ -20416,15 +20448,17 @@ void AArch64TargetLowering::ReplaceNodeResults(
|
|||
return;
|
||||
}
|
||||
|
||||
SDValue Result = DAG.getMemIntrinsicNode(
|
||||
AArch64ISD::LDP, SDLoc(N),
|
||||
DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
|
||||
{LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
|
||||
LoadNode->getMemOperand());
|
||||
if (SDValue(N, 0).getValueType() == MVT::i128) {
|
||||
SDValue Result = DAG.getMemIntrinsicNode(
|
||||
AArch64ISD::LDP, SDLoc(N),
|
||||
DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
|
||||
{LoadNode->getChain(), LoadNode->getBasePtr()},
|
||||
LoadNode->getMemoryVT(), LoadNode->getMemOperand());
|
||||
|
||||
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
|
||||
Result.getValue(0), Result.getValue(1));
|
||||
Results.append({Pair, Result.getValue(2) /* Chain */});
|
||||
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
|
||||
Result.getValue(0), Result.getValue(1));
|
||||
Results.append({Pair, Result.getValue(2) /* Chain */});
|
||||
}
|
||||
return;
|
||||
}
|
||||
case ISD::EXTRACT_SUBVECTOR:
|
||||
|
|
|
@ -450,6 +450,7 @@ enum NodeType : unsigned {
|
|||
STZ2G,
|
||||
|
||||
LDP,
|
||||
LDNP,
|
||||
STP,
|
||||
STNP,
|
||||
|
||||
|
|
|
@ -318,6 +318,7 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
|
|||
def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
|
||||
|
||||
def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
|
||||
def SDT_AArch64ldnp : SDTypeProfile<2, 1, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
|
||||
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
|
||||
def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
|
||||
|
||||
|
@ -728,6 +729,7 @@ def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
|
|||
def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
|
||||
|
||||
def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
|
||||
def AArch64ldnp : SDNode<"AArch64ISD::LDNP", SDT_AArch64ldnp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
|
||||
def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
|
||||
def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
|
||||
|
||||
|
@ -2581,6 +2583,8 @@ defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;
|
|||
def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
|
||||
(LDPXi GPR64sp:$Rn, simm7s8:$offset)>;
|
||||
|
||||
def : Pat<(AArch64ldnp (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
|
||||
(LDNPQi GPR64sp:$Rn, simm7s16:$offset)>;
|
||||
//---
|
||||
// (register offset)
|
||||
//---
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
define <4 x double> @test_ldnp_v4f64(<4 x double>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v4f64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <4 x double>, <4 x double>* %A, align 8, !nontemporal !0
|
||||
ret <4 x double> %lv
|
||||
|
@ -13,7 +13,7 @@ define <4 x double> @test_ldnp_v4f64(<4 x double>* %A) {
|
|||
define <4 x i64> @test_ldnp_v4i64(<4 x i64>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v4i64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <4 x i64>, <4 x i64>* %A, align 8, !nontemporal !0
|
||||
ret <4 x i64> %lv
|
||||
|
@ -22,7 +22,7 @@ define <4 x i64> @test_ldnp_v4i64(<4 x i64>* %A) {
|
|||
define <8 x i32> @test_ldnp_v8i32(<8 x i32>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v8i32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <8 x i32>, <8 x i32>* %A, align 8, !nontemporal !0
|
||||
ret <8 x i32> %lv
|
||||
|
@ -31,7 +31,7 @@ define <8 x i32> @test_ldnp_v8i32(<8 x i32>* %A) {
|
|||
define <8 x float> @test_ldnp_v8f32(<8 x float>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v8f32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <8 x float>, <8 x float>* %A, align 8, !nontemporal !0
|
||||
ret <8 x float> %lv
|
||||
|
@ -40,7 +40,7 @@ define <8 x float> @test_ldnp_v8f32(<8 x float>* %A) {
|
|||
define <16 x i16> @test_ldnp_v16i16(<16 x i16>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v16i16:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <16 x i16>, <16 x i16>* %A, align 8, !nontemporal !0
|
||||
ret <16 x i16> %lv
|
||||
|
@ -49,7 +49,7 @@ define <16 x i16> @test_ldnp_v16i16(<16 x i16>* %A) {
|
|||
define <16 x half> @test_ldnp_v16f16(<16 x half>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v16f16:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <16 x half>, <16 x half>* %A, align 8, !nontemporal !0
|
||||
ret <16 x half> %lv
|
||||
|
@ -58,7 +58,7 @@ define <16 x half> @test_ldnp_v16f16(<16 x half>* %A) {
|
|||
define <32 x i8> @test_ldnp_v32i8(<32 x i8>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v32i8:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <32 x i8>, <32 x i8>* %A, align 8, !nontemporal !0
|
||||
ret <32 x i8> %lv
|
||||
|
@ -165,8 +165,8 @@ define <1 x i64> @test_ldnp_v1i64(<1 x i64>* %A) {
|
|||
define <32 x i16> @test_ldnp_v32i16(<32 x i16>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v32i16:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <32 x i16>, <32 x i16>* %A, align 8, !nontemporal !0
|
||||
ret <32 x i16> %lv
|
||||
|
@ -175,8 +175,8 @@ define <32 x i16> @test_ldnp_v32i16(<32 x i16>* %A) {
|
|||
define <32 x half> @test_ldnp_v32f16(<32 x half>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v32f16:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <32 x half>, <32 x half>* %A, align 8, !nontemporal !0
|
||||
ret <32 x half> %lv
|
||||
|
@ -185,8 +185,8 @@ define <32 x half> @test_ldnp_v32f16(<32 x half>* %A) {
|
|||
define <16 x i32> @test_ldnp_v16i32(<16 x i32>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v16i32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <16 x i32>, <16 x i32>* %A, align 8, !nontemporal !0
|
||||
ret <16 x i32> %lv
|
||||
|
@ -195,8 +195,8 @@ define <16 x i32> @test_ldnp_v16i32(<16 x i32>* %A) {
|
|||
define <16 x float> @test_ldnp_v16f32(<16 x float>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v16f32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <16 x float>, <16 x float>* %A, align 8, !nontemporal !0
|
||||
ret <16 x float> %lv
|
||||
|
@ -312,10 +312,10 @@ define <5 x double> @test_ldnp_v5f64(<5 x double>* %A) {
|
|||
define <16 x i64> @test_ldnp_v16i64(<16 x i64>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v16i64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ldp q4, q5, [x0, #64]
|
||||
; CHECK-NEXT: ldp q6, q7, [x0, #96]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ldnp q4, q5, [x0, #64]
|
||||
; CHECK-NEXT: ldnp q6, q7, [x0, #96]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <16 x i64>, <16 x i64>* %A, align 8, !nontemporal !0
|
||||
ret <16 x i64> %lv
|
||||
|
@ -324,10 +324,10 @@ define <16 x i64> @test_ldnp_v16i64(<16 x i64>* %A) {
|
|||
define <16 x double> @test_ldnp_v16f64(<16 x double>* %A) {
|
||||
; CHECK-LABEL: test_ldnp_v16f64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: ldp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ldp q4, q5, [x0, #64]
|
||||
; CHECK-NEXT: ldp q6, q7, [x0, #96]
|
||||
; CHECK-NEXT: ldnp q0, q1, [x0]
|
||||
; CHECK-NEXT: ldnp q2, q3, [x0, #32]
|
||||
; CHECK-NEXT: ldnp q4, q5, [x0, #64]
|
||||
; CHECK-NEXT: ldnp q6, q7, [x0, #96]
|
||||
; CHECK-NEXT: ret
|
||||
%lv = load <16 x double>, <16 x double>* %A, align 8, !nontemporal !0
|
||||
ret <16 x double> %lv
|
||||
|
|
Loading…
Reference in New Issue