[AArch64][SVE] Implement structured load intrinsics
Summary:
This patch adds initial support for the following intrinsics:

  * llvm.aarch64.sve.ld2
  * llvm.aarch64.sve.ld3
  * llvm.aarch64.sve.ld4

for loading two, three and four vectors' worth of data. Basic codegen is
implemented here; the reg+reg and reg+imm addressing modes will be addressed
in a later patch.

The types returned by these intrinsics have a number of elements that is a
multiple of the elements in a 128-bit vector for a given type and N, where N
is the number of vectors being loaded, i.e. 2, 3 or 4. Thus, for 32-bit
elements the types are:

  LD2 : <vscale x 8 x i32>
  LD3 : <vscale x 12 x i32>
  LD4 : <vscale x 16 x i32>

This is implemented with target-specific intrinsics for each variant that take
the same operands as the IR intrinsic but return N values, where the type of
each value is a full vector, i.e. <vscale x 4 x i32> in the above example.
These values are then concatenated using the standard concat_vectors operation
to maintain type legality with the IR.

These intrinsics are intended for use in the Arm C Language Extension (ACLE).

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D75751
This commit is contained in:
  parent e9eafb7be9
  commit b82be5db71
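The summary notes that these intrinsics are intended for use from the ACLE. As a hedged illustration (not part of this patch), the C-level code they are ultimately meant to support looks roughly like the following; arm_sve.h, svbool_t, svint32x2_t and svld2_s32 come from the ACLE specification rather than from this commit:

// ACLE-level sketch: load two consecutive vectors of i32 under a predicate.
// With the codegen added in this patch, the underlying llvm.aarch64.sve.ld2
// call is expected to select to a single ld2w, as the tests below check.
#include <arm_sve.h>
#include <stdint.h>

svint32x2_t load_two_vectors(svbool_t pg, const int32_t *base) {
  return svld2_s32(pg, base);
}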
@@ -814,6 +814,10 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
      : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
                  [IntrReadMem, IntrArgMemOnly, ImmArg<ArgIndex<1>>]>;

  class AdvSIMD_ManyVec_PredLoad_Intrinsic
      : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyptr_ty],
                  [IntrReadMem, IntrArgMemOnly]>;

  class AdvSIMD_1Vec_PredLoad_Intrinsic
      : Intrinsic<[llvm_anyvector_ty],
                  [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1346,6 +1350,10 @@ def int_aarch64_sve_tuple_set : AdvSIMD_SVE_Set_Vector_Tuple;

def int_aarch64_sve_ld1 : AdvSIMD_1Vec_PredLoad_Intrinsic;

def int_aarch64_sve_ld2 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
def int_aarch64_sve_ld3 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
def int_aarch64_sve_ld4 : AdvSIMD_ManyVec_PredLoad_Intrinsic;

def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
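For illustration only, here is a minimal sketch of how a caller inside LLVM might materialize one of the newly defined intrinsics with IRBuilder. The helper name is hypothetical, and it assumes a tree recent enough to provide ScalableVectorType; the overload order (result type, predicate type, pointer type) follows the AdvSIMD_ManyVec_PredLoad_Intrinsic class above:

// Hypothetical helper, not part of this patch: emit a call to
// @llvm.aarch64.sve.ld2 returning the wide <vscale x 8 x i32> tuple type
// described in the summary (two <vscale x 4 x i32> vectors concatenated).
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"

using namespace llvm;

static Value *emitSveLd2I32(IRBuilder<> &B, Module &M, Value *Pred, Value *Addr) {
  Type *EltTy = Type::getInt32Ty(M.getContext());
  auto *TupleTy = ScalableVectorType::get(EltTy, /*MinNumElts=*/8); // 2 x nxv4i32
  // The intrinsic is overloaded on the result, predicate and pointer types.
  Function *Ld2 = Intrinsic::getDeclaration(
      &M, Intrinsic::aarch64_sve_ld2,
      {TupleTy, Pred->getType(), Addr->getType()});
  return B.CreateCall(Ld2, {Pred, Addr});
}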
@@ -245,6 +245,7 @@ public:
                    unsigned SubRegIdx);
  void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
  void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
  void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc);

  bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
  /// SVE Reg+Imm addressing mode.
@@ -1441,6 +1442,30 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
  return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
}

void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
                                               const unsigned Opc) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Chain = N->getOperand(0);

  SDValue Ops[] = {N->getOperand(1), // Predicate
                   N->getOperand(2), // Memory operand
                   CurDAG->getTargetConstant(0, DL, MVT::i64), Chain};

  const EVT ResTys[] = {MVT::Untyped, MVT::Other};

  SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
  SDValue SuperReg = SDValue(Load, 0);
  for (unsigned i = 0; i < NumVecs; ++i)
    ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
                                   AArch64::zsub0 + i, DL, VT, SuperReg));

  // Copy chain
  unsigned ChainIdx = NumVecs;
  ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
  CurDAG->RemoveDeadNode(N);
}

void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
                                      unsigned Opc) {
  SDLoc dl(N);
@@ -4603,6 +4628,54 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
    }
    break;
  }
  case AArch64ISD::SVE_LD2: {
    if (VT == MVT::nxv16i8) {
      SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
      return;
    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
      SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
      return;
    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
      SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM);
      return;
    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
      SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM);
      return;
    }
    break;
  }
  case AArch64ISD::SVE_LD3: {
    if (VT == MVT::nxv16i8) {
      SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
      return;
    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
      SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
      return;
    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
      SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM);
      return;
    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
      SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM);
      return;
    }
    break;
  }
  case AArch64ISD::SVE_LD4: {
    if (VT == MVT::nxv16i8) {
      SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
      return;
    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
      SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
      return;
    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
      SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM);
      return;
    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
      SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM);
      return;
    }
    break;
  }
  }

  // Select the default instruction
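The three cases above repeat the same element-type dispatch. As a reading aid only (a sketch, not code from this patch), that dispatch can be summarized as a small table keyed on the overloaded vector type and the number of vectors; it assumes the same backend headers that AArch64DAGToDAGISel::Select already uses:

// Sketch: compact view of the VT/NumVecs -> opcode mapping in the SVE_LD2/3/4
// cases above. NumVecs is expected to be 2, 3 or 4; returns 0 for unhandled VTs.
static unsigned getSVEStructLoadOpcode(EVT VT, unsigned NumVecs) {
  // Columns: B, H, W, D element sizes; rows: LD2, LD3, LD4.
  static const unsigned Opcodes[3][4] = {
      {AArch64::LD2B_IMM, AArch64::LD2H_IMM, AArch64::LD2W_IMM, AArch64::LD2D_IMM},
      {AArch64::LD3B_IMM, AArch64::LD3H_IMM, AArch64::LD3W_IMM, AArch64::LD3D_IMM},
      {AArch64::LD4B_IMM, AArch64::LD4H_IMM, AArch64::LD4W_IMM, AArch64::LD4D_IMM}};
  unsigned SizeIdx;
  if (VT == MVT::nxv16i8)
    SizeIdx = 0;
  else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16)
    SizeIdx = 1;
  else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32)
    SizeIdx = 2;
  else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64)
    SizeIdx = 3;
  else
    return 0;
  return Opcodes[NumVecs - 2][SizeIdx];
}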
@@ -1467,6 +1467,9 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case AArch64ISD::LDFF1S: return "AArch64ISD::LDFF1S";
  case AArch64ISD::LD1RQ: return "AArch64ISD::LD1RQ";
  case AArch64ISD::LD1RO: return "AArch64ISD::LD1RO";
  case AArch64ISD::SVE_LD2: return "AArch64ISD::SVE_LD2";
  case AArch64ISD::SVE_LD3: return "AArch64ISD::SVE_LD3";
  case AArch64ISD::SVE_LD4: return "AArch64ISD::SVE_LD4";
  case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
  case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
  case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
@@ -9796,6 +9799,56 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
  return true;
}

// Lower an SVE structured load intrinsic returning a tuple type to target
// specific intrinsic taking the same input but returning a multi-result value
// of the split tuple type.
//
// E.g. Lowering an LD3:
//
//  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
//                                                    <vscale x 4 x i1> %pred,
//                                                    <vscale x 4 x i32>* %addr)
//
//  Output DAG:
//
//    t0: ch = EntryToken
//        t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
//        t4: i64,ch = CopyFromReg t0, Register:i64 %1
//    t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
//    t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
//
// This is called pre-legalization to avoid widening/splitting issues with
// non-power-of-2 tuple types used for LD3, such as nxv12i32.
SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
                                                  ArrayRef<SDValue> LoadOps,
                                                  EVT VT, SelectionDAG &DAG,
                                                  const SDLoc &DL) const {
  assert(VT.isScalableVector() && "Can only lower scalable vectors");

  unsigned N, Opcode;
  static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
      {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2}},
      {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3}},
      {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4}}};

  std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
  assert(VT.getVectorElementCount().Min % N == 0 &&
         "invalid tuple vector type!");

  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                                 VT.getVectorElementCount() / N);
  assert(isTypeLegal(SplitVT));

  SmallVector<EVT, 5> VTs(N, SplitVT);
  VTs.push_back(MVT::Other); // Chain
  SDVTList NodeTys = DAG.getVTList(VTs);

  SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
  SmallVector<SDValue, 4> PseudoLoadOps;
  for (unsigned I = 0; I < N; ++I)
    PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
}

EVT AArch64TargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
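To make the type bookkeeping in LowerSVEStructLoad concrete: the tuple type's minimum element count is N times that of each per-result split type. A tiny standalone sketch of that arithmetic (illustrative only, not code from the patch):

#include <cassert>

// For an SVE structured load of N vectors, the IR tuple type has N times the
// minimum element count of each split result, e.g. LD3 of i32:
//   tuple <vscale x 12 x i32>  ->  3 results of <vscale x 4 x i32>.
static unsigned splitMinElementCount(unsigned TupleMinElts, unsigned NumVecs) {
  assert(TupleMinElts % NumVecs == 0 && "invalid tuple vector type!");
  return TupleMinElts / NumVecs; // 12 / 3 == 4
}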
@@ -13728,6 +13781,20 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
      SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
      return DAG.getMergeValues({Concat, Chain}, DL);
    }
    case Intrinsic::aarch64_sve_ld2:
    case Intrinsic::aarch64_sve_ld3:
    case Intrinsic::aarch64_sve_ld4: {
      SDLoc DL(N);
      SDValue Chain = N->getOperand(0);
      SDValue Mask = N->getOperand(2);
      SDValue BasePtr = N->getOperand(3);
      SDValue LoadOps[] = {Chain, Mask, BasePtr};
      unsigned IntrinsicID =
          cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
      SDValue Result =
          LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
      return DAG.getMergeValues({Result, Chain}, DL);
    }
    default:
      break;
    }
@@ -255,6 +255,11 @@ enum NodeType : unsigned {
  LD1RQ,
  LD1RO,

  // Structured loads.
  SVE_LD2,
  SVE_LD3,
  SVE_LD4,

  // Unsigned gather loads.
  GLD1,
  GLD1_SCALED,
@@ -835,6 +840,8 @@ private:
  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
                                         SDValue &Size,
                                         SelectionDAG &DAG) const;
  SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
                             EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;

  SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                        SmallVectorImpl<SDNode *> &Created) const override;
@@ -1,4 +1,4 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s

;
; LD1RQB
@@ -252,6 +252,244 @@ define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, double* %addr)
  ret <vscale x 2 x double> %res
}

;
; LD2B
;

define <vscale x 32 x i8> @ld2b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
; CHECK-LABEL: ld2b_i8:
; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr)
  ret <vscale x 32 x i8> %res
}

;
; LD2H
;

define <vscale x 16 x i16> @ld2h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
; CHECK-LABEL: ld2h_i16:
; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr)
  ret <vscale x 16 x i16> %res
}

define <vscale x 16 x half> @ld2h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
; CHECK-LABEL: ld2h_f16:
; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr)
  ret <vscale x 16 x half> %res
}

;
; LD2W
;

define <vscale x 8 x i32> @ld2w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
; CHECK-LABEL: ld2w_i32:
; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr)
  ret <vscale x 8 x i32> %res
}

define <vscale x 8 x float> @ld2w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
; CHECK-LABEL: ld2w_f32:
; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr)
  ret <vscale x 8 x float> %res
}

;
; LD2D
;

define <vscale x 4 x i64> @ld2d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
; CHECK-LABEL: ld2d_i64:
; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr)
  ret <vscale x 4 x i64> %res
}

define <vscale x 4 x double> @ld2d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
; CHECK-LABEL: ld2d_f64:
; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr)
  ret <vscale x 4 x double> %res
}

;
; LD3B
;

define <vscale x 48 x i8> @ld3b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
; CHECK-LABEL: ld3b_i8:
; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr)
  ret <vscale x 48 x i8> %res
}

;
; LD3H
;

define <vscale x 24 x i16> @ld3h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
; CHECK-LABEL: ld3h_i16:
; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr)
  ret <vscale x 24 x i16> %res
}

define <vscale x 24 x half> @ld3h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
; CHECK-LABEL: ld3h_f16:
; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr)
  ret <vscale x 24 x half> %res
}

;
; LD3W
;

define <vscale x 12 x i32> @ld3w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
; CHECK-LABEL: ld3w_i32:
; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr)
  ret <vscale x 12 x i32> %res
}

define <vscale x 12 x float> @ld3w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
; CHECK-LABEL: ld3w_f32:
; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr)
  ret <vscale x 12 x float> %res
}

;
; LD3D
;

define <vscale x 6 x i64> @ld3d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
; CHECK-LABEL: ld3d_i64:
; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr)
  ret <vscale x 6 x i64> %res
}

define <vscale x 6 x double> @ld3d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
; CHECK-LABEL: ld3d_f64:
; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr)
  ret <vscale x 6 x double> %res
}

;
; LD4B
;

define <vscale x 64 x i8> @ld4b_i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr) {
; CHECK-LABEL: ld4b_i8:
; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1> %pred, <vscale x 16 x i8>* %addr)
  ret <vscale x 64 x i8> %res
}

;
; LD4H
;

define <vscale x 32 x i16> @ld4h_i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr) {
; CHECK-LABEL: ld4h_i16:
; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1> %pred, <vscale x 8 x i16>* %addr)
  ret <vscale x 32 x i16> %res
}

define <vscale x 32 x half> @ld4h_f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr) {
; CHECK-LABEL: ld4h_f16:
; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1> %pred, <vscale x 8 x half>* %addr)
  ret <vscale x 32 x half> %res
}

;
; LD4W
;

define <vscale x 16 x i32> @ld4w_i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr) {
; CHECK-LABEL: ld4w_i32:
; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1> %pred, <vscale x 4 x i32>* %addr)
  ret <vscale x 16 x i32> %res
}

define <vscale x 16 x float> @ld4w_f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr) {
; CHECK-LABEL: ld4w_f32:
; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1> %pred, <vscale x 4 x float>* %addr)
  ret <vscale x 16 x float> %res
}

;
; LD4D
;

define <vscale x 8 x i64> @ld4d_i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr) {
; CHECK-LABEL: ld4d_i64:
; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1> %pred, <vscale x 2 x i64>* %addr)
  ret <vscale x 8 x i64> %res
}

define <vscale x 8 x double> @ld4d_f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr) {
; CHECK-LABEL: ld4d_f64:
; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1> %pred, <vscale x 2 x double>* %addr)
  ret <vscale x 8 x double> %res
}


declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
@@ -267,3 +505,27 @@ declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)

declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)

declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)

declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)