[SVE][CodeGen] Add DAG combines for s/zext_masked_gather

This patch adds the following DAGCombines, which apply if isVectorLoadExtDesirable() returns true:
 - fold (and (masked_gather x)) -> (zext_masked_gather x)
 - fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)

LowerMGATHER has also been updated to fetch the LoadExtType associated with the
gather and to use this value to select the correct masked gather opcode.

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D92230
This commit is contained in:
Kerry McLaughlin 2020-12-09 11:21:51 +00:00
parent d568cff696
commit 05edfc5475
9 changed files with 167 additions and 90 deletions

View File

@ -932,6 +932,33 @@ bool DAGCombiner::isOneUseSetCC(SDValue N) const {
return false;
}
// Returns true if N is a constant splat vector whose splatted value equals the
// all-ones mask covering exactly the bits of ScalarTy (0xFF for i8, 0xFFFF for
// i16, 0xFFFFFFFF for i32). Any other scalar type, including non-simple types,
// yields false.
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) {
// Only simple value types can be mapped to one of the supported masks.
if (!ScalarTy.isSimple())
return false;
// Select the low-bits mask that corresponds to the scalar type.
const auto Ty = ScalarTy.getSimpleVT().SimpleTy;
uint64_t ExpectedMask = 0ULL;
if (Ty == MVT::i8)
ExpectedMask = 0xFFULL;
else if (Ty == MVT::i16)
ExpectedMask = 0xFFFFULL;
else if (Ty == MVT::i32)
ExpectedMask = 0xFFFFFFFFULL;
else
return false;
// N matches only if it is a constant splat of exactly that mask value.
APInt SplatVal;
return ISD::isConstantSplatVector(N, SplatVal) &&
SplatVal.getLimitedValue() == ExpectedMask;
}
// Returns the SDNode if it is a constant float BuildVector
// or constant float.
static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
@ -5622,6 +5649,28 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
}
}
// fold (and (masked_gather x)) -> (zext_masked_gather x)
// The fold is attempted only when all of the following hold:
//  - the gather's value result (result 0) has exactly one use,
//  - N1 is a constant splat equal to the all-ones mask for the scalar type of
//    the gather's memory VT (see isConstantSplatVectorMaskForType),
//  - the target reports the extending vector load as desirable.
if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
EVT MemVT = GN0->getMemoryVT();
EVT ScalarVT = MemVT.getScalarType();
if (SDValue(GN0, 0).hasOneUse() &&
isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) &&
TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
// Reuse every operand of the original gather unchanged.
SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
// Build a new gather with the same memory VT, memory operand and index
// type, but marked as a zero-extending load producing VT.
SDValue ZExtLoad = DAG.getMaskedGather(
DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
// The AND is replaced by the zero-extending gather's value result.
// NOTE(review): unlike the sext_inreg fold, the chain result of GN0 is not
// replaced with the new gather's chain here - confirm this is intended.
CombineTo(N, ZExtLoad);
AddToWorklist(ZExtLoad.getNode());
// Avoid recheck of N.
return SDValue(N, 0);
}
}
// fold (and (load x), 255) -> (zextload x, i8)
// fold (and (extload x, i16), 255) -> (zextload x, i8)
// fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
@ -11597,6 +11646,25 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
}
}
// fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
// The fold is attempted only when the gather's value result (result 0) has
// exactly one use, the sext_inreg's ExtVT equals the gather's memory VT, and
// the target reports the extending vector load as desirable.
if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
if (SDValue(GN0, 0).hasOneUse() &&
ExtVT == GN0->getMemoryVT() &&
TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
// Reuse every operand of the original gather unchanged.
SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
// Build a new gather with the same memory operand and index type, but
// marked as a sign-extending load from ExtVT producing VT.
SDValue ExtLoad = DAG.getMaskedGather(
DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops,
GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD);
// Replace the sext_inreg with the new gather's value, and replace both
// results (value and chain) of the old gather with those of the new one.
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
AddToWorklist(ExtLoad.getNode());
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
// Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),

View File

@ -3836,6 +3836,26 @@ unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
return AddrModes.find(Key)->second;
}
// Returns the GLD1S_* counterpart of the given GLD1_* gather opcode, keeping
// the same addressing variant (plain, UXTW, SXTW, and their SCALED forms).
// Any opcode outside that set is treated as unreachable.
unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
switch (Opcode) {
// Unscaled addressing variants.
case AArch64ISD::GLD1_MERGE_ZERO:
return AArch64ISD::GLD1S_MERGE_ZERO;
case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
// Scaled addressing variants.
case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
default:
// No GLD1S mapping is defined for any other opcode.
llvm_unreachable("unimplemented opcode");
}
}
bool getGatherScatterIndexIsExtended(SDValue Index) {
unsigned Opcode = Index.getOpcode();
if (Opcode == ISD::SIGN_EXTEND_INREG)
@ -3865,6 +3885,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
SDValue PassThru = MGT->getPassThru();
SDValue Mask = MGT->getMask();
SDValue BasePtr = MGT->getBasePtr();
ISD::LoadExtType ExtTy = MGT->getExtensionType();
ISD::MemIndexType IndexType = MGT->getIndexType();
bool IsScaled =
@ -3874,6 +3895,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
bool IdxNeedsExtend =
getGatherScatterIndexIsExtended(Index) ||
Index.getSimpleValueType().getVectorElementType() == MVT::i32;
bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
EVT VT = PassThru.getSimpleValueType();
EVT MemVT = MGT->getMemoryVT();
@ -3900,9 +3922,12 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
if (getGatherScatterIndexIsExtended(Index))
Index = Index.getOperand(0);
unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
if (ResNeedsSignExtend)
Opcode = getSignExtendedGatherOpcode(Opcode);
SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
VTs, Ops);
return DAG.getNode(Opcode, DL, VTs, Ops);
}
SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@ -21,7 +21,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
@ -72,9 +71,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@ -85,9 +82,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
@ -103,7 +98,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32>
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
@ -144,9 +138,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@ -21,7 +21,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@ -34,7 +33,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@ -90,9 +88,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@ -103,9 +99,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@ -117,9 +111,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@ -136,7 +128,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
; CHECK-LABEL: masked_gather_nxv4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: and z0.s, z0.s, #0xff
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
@ -148,7 +139,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
@ -193,9 +183,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32>
define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
@ -206,9 +194,7 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
@ -22,7 +22,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
@ -78,9 +77,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
@ -92,9 +89,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
@ -111,7 +106,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32>
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
@ -156,9 +150,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unscaled unpacked 32-bit offsets
@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@ -22,7 +22,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@ -36,7 +35,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@ -97,9 +95,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@ -111,9 +107,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@ -126,9 +120,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@ -146,7 +138,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
; CHECK-LABEL: masked_gather_nxv4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: and z0.s, z0.s, #0xff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@ -159,7 +150,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
; CHECK-LABEL: masked_gather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: and z0.s, z0.s, #0xffff
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@ -208,9 +198,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32>
define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@ -222,9 +210,7 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %
define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT: ret
%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext

View File

@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@ -17,7 +17,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i64>
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
@ -68,9 +67,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, lsl #1]
; CHECK-NEXT: ret
%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@ -81,9 +78,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, lsl #2]
; CHECK-NEXT: ret
%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)

View File

@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@ -17,7 +17,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: and z0.d, z0.d, #0xffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@ -30,7 +29,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@ -86,9 +84,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i64
define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@ -99,9 +95,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %
define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxth z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@ -113,9 +107,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64>
define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sgather_nxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>

View File

@ -1,5 +1,46 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
; Test for multiple uses of the mgather where the s/zext should not be combined
; %data has two users here: the sext (%data.sext) and the add (%add), so the
; sign-extension of %data is expected to remain a separate sxtb instruction.
define <vscale x 2 x i64> @masked_sgather_sext(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) {
; CHECK-LABEL: masked_sgather_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sxtb z2.d, p0/m, z0.d
; CHECK-NEXT: add z0.d, z0.d, z1.d
; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
%data.sext = sext <vscale x 2 x i8> %data to <vscale x 2 x i64>
%add = add <vscale x 2 x i8> %data, %vals
%add.sext = sext <vscale x 2 x i8> %add to <vscale x 2 x i64>
%mul = mul <vscale x 2 x i64> %data.sext, %add.sext
ret <vscale x 2 x i64> %mul
}
; Zero-extending variant of the multiple-use test: %data feeds both the zext
; (%data.zext) and the add (%add), so the zero-extension of %data is expected
; to remain a separate 'and' with #0xff after the gather.
define <vscale x 2 x i64> @masked_sgather_zext(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) {
; CHECK-LABEL: masked_sgather_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: add z1.d, z0.d, z1.d
; CHECK-NEXT: and z0.d, z0.d, #0xff
; CHECK-NEXT: and z1.d, z1.d, #0xff
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
%data.zext = zext <vscale x 2 x i8> %data to <vscale x 2 x i64>
%add = add <vscale x 2 x i8> %data, %vals
%add.zext = zext <vscale x 2 x i8> %add to <vscale x 2 x i64>
%mul = mul <vscale x 2 x i64> %data.zext, %add.zext
ret <vscale x 2 x i64> %mul
}
; Tests that exercise various type legalisation scenarios for ISD::MGATHER.
@ -7,7 +48,7 @@
define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK-DAG: mov x8, xzr
; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d]
; CHECK-DAG: ld1sw { z0.d }, p0/z, [x8, z0.d]
; CHECK: ret
%data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
ret <vscale x 2 x i32> %data
@ -41,8 +82,8 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vsca
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
; CHECK-NEXT: ld1b { z1.d }, p2/z, [x8, z1.d]
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ld1sb { z1.d }, p2/z, [x8, z1.d]
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x8, z0.d]
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT: sxtb z0.s, p0/m, z0.s