[SVE][CodeGen] Lower scalable masked scatters

Lowers the llvm.masked.scatter intrinsics (scalar-plus-vector addressing mode only).

Changes included in this patch:
 - Custom lowering for MSCATTER, which chooses the appropriate scatter store opcode to use
   (a minimal example of the kind of scatter this lowers is sketched below).
   Floating-point scatters are cast to integer, with patterns added to match FP reinterpret_casts.
 - Added the getCanonicalIndexType function, which canonicalizes away redundant addressing
   modes (e.g. scaling is redundant when accessing bytes).
 - Added tests with 32 & 64-bit scaled & unscaled offsets.
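
As a minimal illustration (adapted from the new tests; the function name and the alignment
operand are arbitrary choices), consider a scatter of float data through a scalar base plus
sign-extended 32-bit vector offsets:

  define void @example_scatter_f32(<vscale x 2 x float> %data, float* %base,
                                   <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
    ; Scalar base in %base, per-element offsets in %offsets, lanes enabled by %mask.
    %ext  = sext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
    %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %ext
    call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data,
                                           <vscale x 2 x float*> %ptrs,
                                           i32 4, <vscale x 2 x i1> %mask)
    ret void
  }
  declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)

With this patch the floating-point data is reinterpreted as an integer vector (matched by the
new reinterpret_cast patterns) and the whole sequence selects to a single scaled,
sign-extending scatter store:

  st1w { z0.d }, p0, [x0, z1.d, sxtw #2]

For byte-sized accesses the scaling carries no information, so getCanonicalIndexType
canonicalizes such scaled scatters to the equivalent unscaled addressing mode.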

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D90941
Kerry McLaughlin 2020-11-11 11:15:32 +00:00
parent 98aa067109
commit 170947a5de
13 changed files with 1311 additions and 5 deletions


@@ -2391,6 +2391,9 @@ public:
ISD::MemIndexType getIndexType() const {
return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
}
void setIndexType(ISD::MemIndexType IndexType) {
LSBaseSDNodeBits.AddressingMode = IndexType;
}
bool isIndexScaled() const {
return (getIndexType() == ISD::SIGNED_SCALED) ||
(getIndexType() == ISD::UNSIGNED_SCALED);


@@ -4510,6 +4510,10 @@ public:
// combiner can fold the new nodes.
SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const;
/// Give targets the chance to reduce the number of distinct addressing modes.
ISD::MemIndexType getCanonicalIndexType(ISD::MemIndexType IndexType,
EVT MemVT, SDValue Offsets) const;
private:
SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
const SDLoc &DL, DAGCombinerInfo &DCI) const;


@@ -1865,6 +1865,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
else
NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo));
N->setIndexType(TLI.getCanonicalIndexType(N->getIndexType(),
N->getMemoryVT(), NewOps[OpNo]));
} else {
NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
TruncateStore = true;


@@ -7356,15 +7356,21 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
return SDValue(E, 0);
}
IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]);
auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(),
VTs, VT, MMO, IndexType, IsTrunc);
createOperands(N, Ops);
assert(N->getMask().getValueType().getVectorNumElements() ==
N->getValue().getValueType().getVectorNumElements() &&
assert(N->getMask().getValueType().getVectorElementCount() ==
N->getValue().getValueType().getVectorElementCount() &&
"Vector width mismatch between mask and data");
assert(N->getIndex().getValueType().getVectorNumElements() >=
N->getValue().getValueType().getVectorNumElements() &&
assert(
N->getIndex().getValueType().getVectorElementCount().isScalable() ==
N->getValue().getValueType().getVectorElementCount().isScalable() &&
"Scalable flags of index and data do not match");
assert(ElementCount::isKnownGE(
N->getIndex().getValueType().getVectorElementCount(),
N->getValue().getValueType().getVectorElementCount()) &&
"Vector width mismatch between index and data");
assert(isa<ConstantSDNode>(N->getScale()) &&
cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&


@@ -4297,7 +4297,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
IndexType = ISD::SIGNED_SCALED;
IndexType = ISD::SIGNED_UNSCALED;
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale };


@@ -7439,6 +7439,25 @@ SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
return SDValue();
}
// Convert redundant addressing modes (e.g. scaling is redundant
// when accessing bytes).
ISD::MemIndexType
TargetLowering::getCanonicalIndexType(ISD::MemIndexType IndexType, EVT MemVT,
SDValue Offsets) const {
bool IsScaledIndex =
(IndexType == ISD::SIGNED_SCALED) || (IndexType == ISD::UNSIGNED_SCALED);
bool IsSignedIndex =
(IndexType == ISD::SIGNED_SCALED) || (IndexType == ISD::SIGNED_UNSCALED);
// Scaling is unimportant for bytes, canonicalize to unscaled.
if (IsScaledIndex && MemVT.getScalarType() == MVT::i8) {
IsScaledIndex = false;
IndexType = IsSignedIndex ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED;
}
return IndexType;
}
SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
unsigned Opcode = Node->getOpcode();
SDValue LHS = Node->getOperand(0);


@@ -1001,6 +1001,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -1052,6 +1053,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MVT::nxv4f32, MVT::nxv2f64}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
@@ -1073,6 +1075,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, VT, Custom);
}
for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16})
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
@@ -3705,6 +3710,100 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();
}
unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
{std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
AArch64ISD::SST1_PRED},
{std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
AArch64ISD::SST1_UXTW_PRED},
{std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
AArch64ISD::SST1_PRED},
{std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
AArch64ISD::SST1_SXTW_PRED},
{std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
AArch64ISD::SST1_SCALED_PRED},
{std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
AArch64ISD::SST1_UXTW_SCALED_PRED},
{std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
AArch64ISD::SST1_SCALED_PRED},
{std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
AArch64ISD::SST1_SXTW_SCALED_PRED},
};
auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
return AddrModes.find(Key)->second;
}
bool getScatterIndexIsExtended(SDValue Index) {
unsigned Opcode = Index.getOpcode();
if (Opcode == ISD::SIGN_EXTEND_INREG)
return true;
if (Opcode == ISD::AND) {
SDValue Splat = Index.getOperand(1);
if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
return false;
ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
return false;
return true;
}
return false;
}
SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
assert(MSC && "Can only custom lower scatter store nodes");
SDValue Index = MSC->getIndex();
SDValue Chain = MSC->getChain();
SDValue StoreVal = MSC->getValue();
SDValue Mask = MSC->getMask();
SDValue BasePtr = MSC->getBasePtr();
ISD::MemIndexType IndexType = MSC->getIndexType();
bool IsScaled =
IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
bool IsSigned =
IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
bool NeedsExtend =
getScatterIndexIsExtended(Index) ||
Index.getSimpleValueType().getVectorElementType() == MVT::i32;
EVT VT = StoreVal.getSimpleValueType();
SDVTList VTs = DAG.getVTList(MVT::Other);
EVT MemVT = MSC->getMemoryVT();
SDValue InputVT = DAG.getValueType(MemVT);
if (VT.getVectorElementType() == MVT::bf16 &&
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
// Handle FP data
if (VT.isFloatingPoint()) {
VT = VT.changeVectorElementTypeToInteger();
ElementCount EC = VT.getVectorElementCount();
auto ScalarIntVT =
MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
StoreVal = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL,
MVT::getVectorVT(ScalarIntVT, EC), StoreVal);
InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
}
if (getScatterIndexIsExtended(Index)) {
if (Index.getOpcode() == ISD::AND)
IsSigned = false;
Index = Index.getOperand(0);
}
SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
return DAG.getNode(getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend), DL,
VTs, Ops);
}
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
EVT VT, EVT MemVT,
@@ -3982,6 +4081,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
case ISD::MSCATTER:
return LowerMSCATTER(Op, DAG);
case ISD::VECREDUCE_SEQ_FADD:
return LowerVECREDUCE_SEQ_FADD(Op, DAG);
case ISD::VECREDUCE_ADD:


@@ -807,6 +807,8 @@ private:
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(


@@ -1191,6 +1191,13 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))),
(UUNPKHI_ZZ_D ZPR:$Zs)>;
let Predicates = [HasSVE, HasBF16] in {
def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 0))),
(UUNPKLO_ZZ_D ZPR:$Zs)>;
def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 2))),
(UUNPKHI_ZZ_D ZPR:$Zs)>;
}
def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))),
(UUNPKLO_ZZ_S ZPR:$Zs)>;
def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))),
@@ -1769,6 +1776,16 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i64 (reinterpret_cast (nxv2f64 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
def : Pat<(nxv2i64 (reinterpret_cast (nxv2f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
def : Pat<(nxv2i64 (reinterpret_cast (nxv2f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
def : Pat<(nxv4i32 (reinterpret_cast (nxv4f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
def : Pat<(nxv4i32 (reinterpret_cast (nxv4f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
let Predicates = [HasSVE, HasBF16] in {
def : Pat<(nxv2i64 (reinterpret_cast (nxv2bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
def : Pat<(nxv4i32 (reinterpret_cast (nxv4bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
}
def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)),


@@ -0,0 +1,370 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; scaled unpacked 32-bit offsets
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
define void @masked_scatter_nxv2i16_sext(<vscale x 2 x i16> %data, i16* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1]
; CHECK-NEXT: ret
%ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i32_sext(<vscale x 2 x i32> %data, i32* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i32_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw #2]
; CHECK-NEXT: ret
%ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i64_sext(<vscale x 2 x i64> %data, i64* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i64_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw #3]
; CHECK-NEXT: ret
%ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f16_sext(<vscale x 2 x half> %data, half* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1]
; CHECK-NEXT: ret
%ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2bf16_sext(<vscale x 2 x bfloat> %data, bfloat* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2bf16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1]
; CHECK-NEXT: ret
%ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr bfloat, bfloat* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f32_sext(<vscale x 2 x float> %data, float* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f32_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw #2]
; CHECK-NEXT: ret
%ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f64_sext(<vscale x 2 x double> %data, double* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f64_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw #3]
; CHECK-NEXT: ret
%ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i16_zext(<vscale x 2 x i16> %data, i16* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1]
; CHECK-NEXT: ret
%ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i32_zext(<vscale x 2 x i32> %data, i32* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw #2]
; CHECK-NEXT: ret
%ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i64_zext(<vscale x 2 x i64> %data, i64* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i64_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw #3]
; CHECK-NEXT: ret
%ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f16_zext(<vscale x 2 x half> %data, half* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1]
; CHECK-NEXT: ret
%ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2bf16_zext(<vscale x 2 x bfloat> %data, bfloat* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2bf16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1]
; CHECK-NEXT: ret
%ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr bfloat, bfloat* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f32_zext(<vscale x 2 x float> %data, float* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw #2]
; CHECK-NEXT: ret
%ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f64_zext(<vscale x 2 x double> %data, double* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f64_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw #3]
; CHECK-NEXT: ret
%ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
%ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %ext
call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; scaled packed 32-bit offset
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
define void @masked_scatter_nxv4i16_sext(<vscale x 4 x i16> %data, i16* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4i32_sext(<vscale x 4 x i32> %data, i32* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i32_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, sxtw #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, sxtw #2]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4f16_sext(<vscale x 4 x half> %data, half* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4f16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4bf16_sext(<vscale x 4 x bfloat> %data, bfloat* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4bf16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr bfloat, bfloat* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat> %data, <vscale x 4 x bfloat*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4f32_sext(<vscale x 4 x float> %data, float* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4f32_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, sxtw #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, sxtw #2]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4i16_zext(<vscale x 4 x i16> %data, i16* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4i32_zext(<vscale x 4 x i32> %data, i32* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, uxtw #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, uxtw #2]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4f16_zext(<vscale x 4 x half> %data, half* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4f16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4bf16_zext(<vscale x 4 x bfloat> %data, bfloat* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4bf16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr bfloat, bfloat* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat> %data, <vscale x 4 x bfloat*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4f32_zext(<vscale x 4 x float> %data, float* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4f32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, uxtw #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, uxtw #2]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %ext
call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8*>, i32, <vscale x 4 x i1>)
attributes #0 = { "target-features"="+sve,+bf16" }


@@ -0,0 +1,577 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
define void @masked_scatter_nxv2i8_sext_offsets(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i8_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i8*>
call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i16_sext_offsets(<vscale x 2 x i16> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i32_sext_offsets(<vscale x 2 x i32> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i32_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i64_sext_offsets(<vscale x 2 x i64> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i64_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f16_sext_offsets(<vscale x 2 x half> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2bf16_sext_offsets(<vscale x 2 x bfloat> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2bf16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f32_sext_offsets(<vscale x 2 x float> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f32_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f64_sext_offsets(<vscale x 2 x double> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f64_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i8_zext_offsets(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i8_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i8*>
call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i16_zext_offsets(<vscale x 2 x i16> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i32_zext_offsets(<vscale x 2 x i32> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i32_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i64_zext_offsets(<vscale x 2 x i64> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i64_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f16_zext_offsets(<vscale x 2 x half> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2bf16_zext_offsets(<vscale x 2 x bfloat> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2bf16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f32_zext_offsets(<vscale x 2 x float> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f32_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f64_zext_offsets(<vscale x 2 x double> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f64_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled packed 32-bit offsets
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
define void @masked_scatter_nxv4i8_sext_offsets(<vscale x 4 x i8> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i8_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1b { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i8*>
call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4i16_sext_offsets(<vscale x 4 x i16> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4i32_sext_offsets(<vscale x 4 x i32> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i32_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4f16_sext_offsets(<vscale x 4 x half> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4f16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4bf16_sext_offsets(<vscale x 4 x bfloat> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4bf16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x bfloat*>
call void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat> %data, <vscale x 4 x bfloat*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4f32_sext_offsets(<vscale x 4 x float> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4f32_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4i8_zext_offsets(<vscale x 4 x i8> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i8_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1b { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i8*>
call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4i16_zext_offsets(<vscale x 4 x i16> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4i32_zext_offsets(<vscale x 4 x i32> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i32_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4f16_zext_offsets(<vscale x 4 x half> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4f16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4bf16_zext_offsets(<vscale x 4 x bfloat> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4bf16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x bfloat*>
call void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat> %data, <vscale x 4 x bfloat*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
define void @masked_scatter_nxv4f32_zext_offsets(<vscale x 4 x float> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4f32_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
ret void
}
declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8*>, i32, <vscale x 4 x i1>)
attributes #0 = { "target-features"="+sve,+bf16" }


@@ -0,0 +1,73 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; scaled 64-bit offsets
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
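; For scaled offsets the GEP element type matches the stored element type, so
; the scatter can use the scaled addressing form [Xn, Zm.d, lsl #shift], where
; the shift is log2 of the element size in bytes (#1 for 16-bit, #2 for 32-bit,
; #3 for 64-bit elements).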
define void @masked_scatter_nxv2i16(<vscale x 2 x i16> %data, i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_scatter_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, lsl #1]
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
ret void
}
define void @masked_scatter_nxv2i32(<vscale x 2 x i32> %data, i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_scatter_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, lsl #2]
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask)
ret void
}
define void @masked_scatter_nxv2i64(<vscale x 2 x i64> %data, i64* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_scatter_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, lsl #3]
; CHECK-NEXT: ret
%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets
call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask)
ret void
}
define void @masked_scatter_nxv2f16(<vscale x 2 x half> %data, half* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_scatter_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, lsl #1]
; CHECK-NEXT: ret
%ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets
call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
ret void
}
define void @masked_scatter_nxv2f32(<vscale x 2 x float> %data, float* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_scatter_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, lsl #2]
; CHECK-NEXT: ret
%ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets
call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask)
ret void
}
define void @masked_scatter_nxv2f64(<vscale x 2 x double> %data, double* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_scatter_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, lsl #3]
; CHECK-NEXT: ret
%ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets
call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask)
ret void
}
declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)


@ -0,0 +1,132 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled 64-bit offsets
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
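; These tests index with i8 GEPs, so the byte offsets cannot be folded into a
; scaled addressing mode. The base pointer is splatted into a Z register and
; added to the offsets, and the scatter is emitted with a zero scalar base,
; e.g. st1b { z0.d }, p0, [x8, z1.d] with x8 = xzr.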
define void @masked_scatter_nxv2i8_unscaled_64bit_offsets(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i8_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i8*>
call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i16_unscaled_64bit_offsets(<vscale x 2 x i16> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i16_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i32_unscaled_64bit_offsets(<vscale x 2 x i32> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i32_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2i64_unscaled_64bit_offsets(<vscale x 2 x i64> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i64_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f16_unscaled_64bit_offsets(<vscale x 2 x half> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f16_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2bf16_unscaled_64bit_offsets(<vscale x 2 x bfloat> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2bf16_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f32_unscaled_64bit_offsets(<vscale x 2 x float> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2f32_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
define void @masked_scatter_nxv2f64_unscaled_64bit_offsets(<vscale x 2 x double> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2f64_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
ret void
}
declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>, i32, <vscale x 2 x i1>)
declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)
declare void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8*>, i32, <vscale x 4 x i1>)
attributes #0 = { "target-features"="+sve,+bf16" }