[AArch64][SVE] Add intrinsics for non-temporal scatters/gathers
Summary:

This patch adds the following intrinsics for non-temporal gather loads and scatter stores:
  * aarch64_sve_ldnt1_gather_index
  * aarch64_sve_stnt1_scatter_index

These intrinsics implement the "scalar + vector of indices" addressing mode. As opposed to regular and first-faulting gathers/scatters, there is no non-temporal instruction that takes indices and scales them itself. Instead, the indices for non-temporal gathers/scatters are scaled before the intrinsics are lowered to `ldnt1`/`stnt1` instructions.

The new ISD nodes, GLDNT1_INDEX and SSTNT1_INDEX, are only used as placeholders so that the cases implemented in this patch are easy to identify in performGatherLoadCombine and performScatterStoreCombine. Once encountered, they are replaced with:
  * GLDNT1_INDEX -> SPLAT_VECTOR + SHL + GLDNT1
  * SSTNT1_INDEX -> SPLAT_VECTOR + SHL + SSTNT1

The patterns for lowering ISD::SHL for scalable vectors (required by this patch) were missing, so they are added too.

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D75601
parent a66dc755db
commit 46b9f14d71
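For context, a minimal usage sketch (not part of the commit) mirroring the new tests below. The function name @example_gather is illustrative; the intrinsic, its signature, and the expected lowering are taken from the test files added in this patch. With i32 elements the vector of indices is scaled by 4 (lsl #2) before a plain ldnt1w gather is emitted; the scatter intrinsic is used analogously.

define <vscale x 2 x i64> @example_gather(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %indices) {
  ; Expected lowering (see the new gather test file):
  ;   lsl    z0.d, z0.d, #2
  ;   ldnt1w { z0.d }, p0/z, [z0.d, x0]
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %indices)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)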
@@ -1782,6 +1782,9 @@ def int_aarch64_sve_ldff1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic
 // 64 bit unscaled offsets
 def int_aarch64_sve_ldnt1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
 
+// 64 bit indices
+def int_aarch64_sve_ldnt1_gather_index : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
+
 // 32 bit unscaled offsets, zero (zxtw) extended to 64 bits
 def int_aarch64_sve_ldnt1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
 
@@ -1829,6 +1832,10 @@ def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsi
 // 64 bit unscaled offsets
 def int_aarch64_sve_stnt1_scatter : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
 
+// 64 bit indices
+def int_aarch64_sve_stnt1_scatter_index
+    : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
+
 // 32 bit unscaled offsets, zero (zxtw) extended to 64 bits
 def int_aarch64_sve_stnt1_scatter_uxtw : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;
 
@@ -5262,7 +5262,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     // amounts. This catches things like trying to shift an i1024 value by an
     // i8, which is easy to fall into in generic code that uses
     // TLI.getShiftAmount().
-    assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) &&
+    assert(N2.getValueType().getScalarSizeInBits().getFixedSize() >=
+               Log2_32_Ceil(VT.getScalarSizeInBits().getFixedSize()) &&
            "Invalid use of small shift amount with oversized value!");
 
     // Always fold shifts of i1 values so the code generator doesn't need to
@@ -190,6 +190,11 @@ public:
     return SelectSVELogicalImm(N, VT, Imm);
   }
 
+  template <unsigned Low, unsigned High>
+  bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) {
+    return SelectSVEShiftImm64(N, Low, High, Imm);
+  }
+
   // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
   template<signed Min, signed Max, signed Scale, bool Shift>
   bool SelectCntImm(SDValue N, SDValue &Imm) {
@@ -307,6 +312,8 @@ private:
   bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
 
   bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
+  bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High,
+                           SDValue &Imm);
 
   bool SelectSVEArithImm(SDValue N, SDValue &Imm);
   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
@@ -3072,6 +3079,24 @@ bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
   return false;
 }
 
+// This method is only needed to "cast" i64s into i32s when the value
+// is a valid shift which has been splatted into a vector with i64 elements.
+// Every other type is fine in tablegen.
+bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low,
+                                              uint64_t High, SDValue &Imm) {
+  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
+    uint64_t ImmVal = CN->getZExtValue();
+    SDLoc DL(N);
+
+    if (ImmVal >= Low && ImmVal <= High) {
+      Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
 bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
   // tagp(FrameIndex, IRGstack, tag_offset):
   // since the offset between FrameIndex and IRGstack is a compile-time
@@ -1440,6 +1440,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::GLDFF1S_IMM: return "AArch64ISD::GLDFF1S_IMM";
 
   case AArch64ISD::GLDNT1: return "AArch64ISD::GLDNT1";
+  case AArch64ISD::GLDNT1_INDEX: return "AArch64ISD::GLDNT1_INDEX";
   case AArch64ISD::GLDNT1S: return "AArch64ISD::GLDNT1S";
 
   case AArch64ISD::SST1: return "AArch64ISD::SST1";
@@ -1451,6 +1452,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
 
   case AArch64ISD::SSTNT1: return "AArch64ISD::SSTNT1";
+  case AArch64ISD::SSTNT1_INDEX: return "AArch64ISD::SSTNT1_INDEX";
 
   case AArch64ISD::LDP: return "AArch64ISD::LDP";
   case AArch64ISD::STP: return "AArch64ISD::STP";
@@ -12628,6 +12630,19 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
                      DAG.getConstant(MinOffset, DL, MVT::i64));
 }
 
+// Turns the vector of indices into a vector of byte offsets by scaling Offset
+// by (BitWidth / 8).
+static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
+                                          SDLoc DL, unsigned BitWidth) {
+  assert(Offset.getValueType().isScalableVector() &&
+         "This method is only for scalable vectors of offsets");
+
+  SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
+  SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
+
+  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
+}
+
 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
                                           unsigned Opcode,
                                           bool OnlyPackedOffsets = true) {
@@ -12655,6 +12670,15 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
   // vector of offsets (that fits into one register)
   SDValue Offset = N->getOperand(5);
 
+  // For "scalar + vector of indices", just scale the indices. This only
+  // applies to non-temporal scatters because there's no instruction that takes
+  // indices.
+  if (Opcode == AArch64ISD::SSTNT1_INDEX) {
+    Offset =
+        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
+    Opcode = AArch64ISD::SSTNT1;
+  }
+
   // In the case of non-temporal gather loads there's only one SVE instruction
   // per data-size: "scalar + vector", i.e.
   // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
@@ -12749,6 +12773,15 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
   // vector of offsets (that fits into one register)
   SDValue Offset = N->getOperand(4);
 
+  // For "scalar + vector of indices", just scale the indices. This only
+  // applies to non-temporal gathers because there's no instruction that takes
+  // indices.
+  if (Opcode == AArch64ISD::GLDNT1_INDEX) {
+    Offset =
+        getScaledOffsetForBitWidth(DAG, Offset, DL, RetElVT.getSizeInBits());
+    Opcode = AArch64ISD::GLDNT1;
+  }
+
   // In the case of non-temporal gather loads there's only one SVE instruction
   // per data-size: "scalar + vector", i.e.
   // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
@@ -13006,6 +13039,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
     case Intrinsic::aarch64_sve_ldnt1_gather:
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
+    case Intrinsic::aarch64_sve_ldnt1_gather_index:
+      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_INDEX);
     case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
     case Intrinsic::aarch64_sve_ldnf1:
@@ -13020,6 +13055,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
     case Intrinsic::aarch64_sve_stnt1_scatter:
       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
+    case Intrinsic::aarch64_sve_stnt1_scatter_index:
+      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX);
     case Intrinsic::aarch64_sve_ld1_gather:
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1);
     case Intrinsic::aarch64_sve_ld1_gather_index:
@@ -263,6 +263,7 @@ enum NodeType : unsigned {
 
   // Non-temporal gather loads
   GLDNT1,
+  GLDNT1_INDEX,
   GLDNT1S,
 
   // Scatter store
@@ -276,6 +277,7 @@ enum NodeType : unsigned {
 
   // Non-temporal scatter store
   SSTNT1,
+  SSTNT1_INDEX,
 
   // Strict (exception-raising) floating point comparison
   STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>;
+def SVELShiftImm64 : ComplexPattern<i32, 1, "SelectSVEShiftImm64<0, 64>", []>;
 
 // Non-faulting loads - node definitions
 //
@@ -139,7 +140,6 @@ def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>;
 def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;
 
-
 let Predicates = [HasSVE] in {
   defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
   def  RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
   defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>;
@@ -1108,9 +1108,23 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
   defm INDEX_II : sve_int_index_ii<"index", index_vector>;
 
   // Unpredicated shifts
-  defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">;
-  defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">;
-  defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">;
+  defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", sra>;
+  defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", srl>;
+  defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", shl>;
+
+  // Patterns for unpredicated left shift by immediate
+  def : Pat<(nxv16i8 (shl (nxv16i8 ZPR:$Zs1),
+                          (nxv16i8 (AArch64dup (vecshiftL8:$imm))))),
+            (LSL_ZZI_B ZPR:$Zs1, vecshiftL8:$imm)>;
+  def : Pat<(nxv8i16 (shl (nxv8i16 ZPR:$Zs1),
+                          (nxv8i16 (AArch64dup (vecshiftL16:$imm))))),
+            (LSL_ZZI_H ZPR:$Zs1, vecshiftL16:$imm)>;
+  def : Pat<(nxv4i32 (shl (nxv4i32 ZPR:$Zs1),
+                          (nxv4i32 (AArch64dup (vecshiftL32:$imm))))),
+            (LSL_ZZI_S ZPR:$Zs1, vecshiftL32:$imm)>;
+  def : Pat<(nxv2i64 (shl (nxv2i64 ZPR:$Zs1),
+                          (nxv2i64 (AArch64dup (i64 (SVELShiftImm64 i32:$imm)))))),
+            (LSL_ZZI_D ZPR:$Zs1, vecshiftL64:$imm)>;
 
   defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
   defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
@@ -4828,10 +4828,12 @@ multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
 }
 
 class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
-                                 ZPRRegOp zprty, Operand immtype>
+                                 ZPRRegOp zprty, Operand immtype, ValueType vt,
+                                 SDPatternOperator op>
 : I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
   asm, "\t$Zd, $Zn, $imm",
-  "", []>, Sched<[]> {
+  "",
+  [(set (vt zprty:$Zd), (op (vt zprty:$Zn), immtype:$imm))]>, Sched<[]> {
   bits<5> Zd;
   bits<5> Zn;
   bits<6> imm;
@@ -4846,29 +4848,31 @@ class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
   let Inst{4-0} = Zd;
 }
 
-multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
-  def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
-  def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm,
+                                           SDPatternOperator op> {
+  def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8, nxv16i8, op>;
+  def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16, nxv8i16, op> {
     let Inst{19} = imm{3};
   }
-  def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+  def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32, nxv4i32, op> {
     let Inst{20-19} = imm{4-3};
   }
-  def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+  def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64, nxv2i64, op> {
     let Inst{22} = imm{5};
     let Inst{20-19} = imm{4-3};
   }
 }
 
-multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
-  def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
-  def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm,
+                                            SDPatternOperator op> {
+  def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8, nxv16i8, op>;
+  def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16, nxv8i16, op> {
     let Inst{19} = imm{3};
   }
-  def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+  def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32, nxv4i32, op> {
    let Inst{20-19} = imm{4-3};
  }
-  def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+  def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64, nxv2i64, op> {
    let Inst{22} = imm{5};
    let Inst{20-19} = imm{4-3};
  }
@@ -0,0 +1,90 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; LDNT1H, LDNT1W, LDNT1D: base + 64-bit index
+; e.g.
+;   lsl z0.d, z0.d, #1
+;   ldnt1h z0.d, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1h_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1h_index
+; CHECK:       lsl z0.d, z0.d, #1
+; CHECK-NEXT:  ldnt1h { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                               i16* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1w_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1w_index
+; CHECK:       lsl z0.d, z0.d, #2
+; CHECK-NEXT:  ldnt1w { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                               i32* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1d_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_index
+; CHECK:       lsl z0.d, z0.d, #3
+; CHECK-NEXT:  ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                               i64* %base,
+                                                                               <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldnt1d_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_index_double
+; CHECK:       lsl z0.d, z0.d, #3
+; CHECK-NEXT:  ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                                  double* %base,
+                                                                                  <vscale x 2 x i64> %b)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LDNT1SH, LDNT1SW: base + 64-bit index
+; e.g.
+;   lsl z0.d, z0.d, #1
+;   ldnt1sh z0.d, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1sh_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sh_index
+; CHECK:       lsl z0.d, z0.d, #1
+; CHECK-NEXT:  ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                               i16* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sw_index
+; CHECK:       lsl z0.d, z0.d, #2
+; CHECK-NEXT:  ldnt1sw { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                               i32* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1H, STNT1W, STNT1D: base + 64-bit index
+; e.g.
+;   lsl z1.d, z1.d, #1
+;   stnt1h { z0.d }, p0, [z0.d, x0]
+;
+
+define void @sstnt1h_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1h_index
+; CHECK:       lsl z1.d, z1.d, #1
+; CHECK-NEXT:  stnt1h { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:  ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i16(<vscale x 2 x i16> %data_trunc,
+                                                          <vscale x 2 x i1> %pg,
+                                                          i16* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+define void @sstnt1w_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1w_index
+; CHECK:       lsl z1.d, z1.d, #2
+; CHECK-NEXT:  stnt1w { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:  ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+                                                          <vscale x 2 x i1> %pg,
+                                                          i32* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+define void @sstnt1d_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1d_index
+; CHECK:       lsl z1.d, z1.d, #3
+; CHECK-NEXT:  stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:  ret
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i64(<vscale x 2 x i64> %data,
+                                                          <vscale x 2 x i1> %pg,
+                                                          i64* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+define void @sstnt1d_index_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1d_index_double
+; CHECK:       lsl z1.d, z1.d, #3
+; CHECK-NEXT:  stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:  ret
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2f64(<vscale x 2 x double> %data,
+                                                          <vscale x 2 x i1> %pg,
+                                                          double* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i64>)