[AArch64][SVE] Gather loads: pass 32 bit unpacked offsets as nxv2i32

Summary: Currently 32 bit unpacked offsets are passed as nxv2i64. However, as pointed out in https://reviews.llvm.org/D71074, using nxv2i32 instead would improve consistency with: * how other arguments are treated * how scatter stores are implemented This patch makes sure that 32 bit unpacked offsets are passes as nxv2i32 instead of nxv2i64. Reviewers: sdesmalen, efriedma Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D71724
2019-12-19 20:37:57 +00:00 · 2019-12-19 20:37:57 +00:00 · 404da13e1e
parent 535b3c6b2f
commit 404da13e1e
4 changed files with 227 additions and 219 deletions
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@ -1114,7 +1114,8 @@ class AdvSIMD_GatherLoad_32bitOffset_Intrinsic
    : Intrinsic<[llvm_anyvector_ty],
                [
                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
-                  LLVMPointerToElt<0>, llvm_anyvector_ty
+                  LLVMPointerToElt<0>,
+                  LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>
                ],
                [IntrReadMem, IntrArgMemOnly]>;

--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -12231,18 +12231,14 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
 }

 static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
-                                       unsigned Opcode) {
+                                       unsigned Opcode,
+                                       bool OnlyPackedOffsets = true) {
  EVT RetVT = N->getValueType(0);
  assert(RetVT.isScalableVector() &&
         "Gather loads are only possible for SVE vectors");
-
  SDLoc DL(N);
-  MVT RetElVT = RetVT.getVectorElementType().getSimpleVT();
-  unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits();

-  EVT MaxVT = llvm::MVT::getScalableVectorVT(RetElVT, NumElements);
-  if (RetVT.getSizeInBits().getKnownMinSize() >
-      MaxVT.getSizeInBits().getKnownMinSize())
+  if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
    return SDValue();

  // Depending on the addressing mode, this is either a pointer or a vector of
@ -12250,12 +12246,19 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
  const SDValue Base = N->getOperand(3);
  // Depending on the addressing mode, this is either a single offset or a
  // vector of offsets  (that fits into one register)
-  const SDValue Offset = N->getOperand(4);
+  SDValue Offset = N->getOperand(4);

-  if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) ||
-      !DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType()))
+  auto &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(Base.getValueType()))
    return SDValue();

+  // Some gather load variants allow unpacked offsets, but only as nxv2i32
+  // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
+  // nxv2i64. Legalize accordingly.
+  if (!OnlyPackedOffsets &&
+      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
+    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
+
  // Return value type that is representable in hardware
  EVT HwRetVt = getSVEContainerType(RetVT);

@ -12439,13 +12442,17 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
    case Intrinsic::aarch64_sve_ld1_gather_index:
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
    case Intrinsic::aarch64_sve_ld1_gather_sxtw:
-      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW);
+      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW,
+                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_uxtw:
-      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW);
+      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW,
+                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
-      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED);
+      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED,
+                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
-      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED);
+      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED,
+                                      /*OnlyPackedOffsets=*/false);
    case Intrinsic::aarch64_sve_ld1_gather_imm:
      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
    case Intrinsic::aarch64_sve_st1_scatter:
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll
@ -11,9 +11,9 @@ define <vscale x 4 x i32> @gld1h_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base,
 ; CHECK-LABEL: gld1h_s_uxtw_index:
 ; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }
@ -22,31 +22,31 @@ define <vscale x 4 x i32> @gld1h_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base,
 ; CHECK-LABEL: gld1h_s_sxtw_index:
 ; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }

-define <vscale x 2 x i64> @gld1h_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1h_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1h_d_uxtw_index:
 ; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1h_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1h_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1h_d_sxtw_index:
 ; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }
@ -56,9 +56,9 @@ define <vscale x 4 x i32> @gld1w_s_uxtw_index(<vscale x 4 x i1> %pg, i32* %base,
 ; CHECK-LABEL: gld1w_s_uxtw_index:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
 }

@ -66,30 +66,30 @@ define <vscale x 4 x i32> @gld1w_s_sxtw_index(<vscale x 4 x i1> %pg, i32* %base,
 ; CHECK-LABEL: gld1w_s_sxtw_index:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
 }

-define <vscale x 2 x i64> @gld1w_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1w_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1w_d_uxtw_index:
 ; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1w_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1w_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1w_d_sxtw_index:
 ; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }
@ -98,9 +98,9 @@ define <vscale x 4 x float> @gld1w_s_uxtw_index_float(<vscale x 4 x i1> %pg, flo
 ; CHECK-LABEL: gld1w_s_uxtw_index_float:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                            float* %base,
-                                                                                            <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                                    float* %base,
+                                                                                    <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
 }

@ -108,50 +108,50 @@ define <vscale x 4 x float> @gld1w_s_sxtw_index_float(<vscale x 4 x i1> %pg, flo
 ; CHECK-LABEL: gld1w_s_sxtw_index_float:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                            float* %base,
-                                                                                            <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                                    float* %base,
+                                                                                    <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
 }

 ; LD1D
-define <vscale x 2 x i64> @gld1d_s_uxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1d_s_uxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_s_uxtw_index:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i64* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                                  i64* %base,
+                                                                                  <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
 }

-define <vscale x 2 x i64> @gld1d_sxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1d_sxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_sxtw_index:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i64* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                                  i64* %base,
+                                                                                  <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
 }

-define <vscale x 2 x double> @gld1d_uxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x double> @gld1d_uxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_uxtw_index_double:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                             double* %base,
-                                                                                             <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                                     double* %base,
+                                                                                     <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
 }

-define <vscale x 2 x double> @gld1d_sxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x double> @gld1d_sxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_sxtw_index_double:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                             double* %base,
-                                                                                             <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                                     double* %base,
+                                                                                     <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
 }

@ -166,9 +166,9 @@ define <vscale x 4 x i32> @gld1sh_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base
 ; CHECK-LABEL: gld1sh_s_uxtw_index:
 ; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }
@ -177,79 +177,79 @@ define <vscale x 4 x i32> @gld1sh_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base
 ; CHECK-LABEL: gld1sh_s_sxtw_index:
 ; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }

-define <vscale x 2 x i64> @gld1sh_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sh_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sh_d_uxtw_index:
 ; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1sh_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sh_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sh_d_sxtw_index:
 ; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

 ; LD1SW
-define <vscale x 2 x i64> @gld1sw_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sw_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sw_d_uxtw_index:
 ; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1sw_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sw_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sw_d_sxtw_index:
 ; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }


 ; LD1H/LD1SH
-declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
-declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)

-declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
-declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)

 ; LD1W/LD1SW
-declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
-declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)

-declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
-declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)

-declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)

 ; LD1D
-declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)

-declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll
@ -11,9 +11,9 @@ define <vscale x 4 x i32> @gld1b_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscal
 ; CHECK-LABEL: gld1b_s_uxtw:
 ; CHECK: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }
@ -22,31 +22,31 @@ define <vscale x 4 x i32> @gld1b_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vscal
 ; CHECK-LABEL: gld1b_s_sxtw:
 ; CHECK: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }

-define <vscale x 2 x i64> @gld1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1b_d_uxtw:
 ; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1b_d_sxtw:
 ; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }
@ -56,9 +56,9 @@ define <vscale x 4 x i32> @gld1h_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vsca
 ; CHECK-LABEL: gld1h_s_uxtw:
 ; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }
@ -67,31 +67,31 @@ define <vscale x 4 x i32> @gld1h_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vsca
 ; CHECK-LABEL: gld1h_s_sxtw:
 ; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 4 x i32> %b)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }

-define <vscale x 2 x i64> @gld1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1h_d_uxtw:
 ; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1h_d_sxtw:
 ; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }
@ -101,9 +101,9 @@ define <vscale x 4 x i32> @gld1w_s_uxtw(<vscale x 4 x i1> %pg, i32* %base, <vsca
 ; CHECK-LABEL: gld1w_s_uxtw:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
 }

@ -111,30 +111,30 @@ define <vscale x 4 x i32> @gld1w_s_sxtw(<vscale x 4 x i1> %pg, i32* %base, <vsca
 ; CHECK-LABEL: gld1w_s_sxtw:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 4 x i32> %b)
  ret <vscale x 4 x i32> %load
 }

-define <vscale x 2 x i64> @gld1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1w_d_uxtw:
 ; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1w_d_sxtw:
 ; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 2 x i32> %b)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }
@ -143,9 +143,9 @@ define <vscale x 4 x float> @gld1w_s_uxtw_float(<vscale x 4 x i1> %pg, float* %b
 ; CHECK-LABEL: gld1w_s_uxtw_float:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                      float* %base,
-                                                                                      <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                              float* %base,
+                                                                              <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
 }

@ -153,50 +153,50 @@ define <vscale x 4 x float> @gld1w_s_sxtw_float(<vscale x 4 x i1> %pg, float* %b
 ; CHECK-LABEL: gld1w_s_sxtw_float:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                      float* %base,
-                                                                                      <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                              float* %base,
+                                                                              <vscale x 4 x i32> %b)
  ret <vscale x 4 x float> %load
 }

 ; LD1D
-define <vscale x 2 x i64> @gld1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_d_uxtw:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i64* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                            i64* %base,
+                                                                            <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
 }

-define <vscale x 2 x i64> @gld1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_d_sxtw:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i64* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                            i64* %base,
+                                                                            <vscale x 2 x i32> %b)
  ret <vscale x 2 x i64> %load
 }

-define <vscale x 2 x double> @gld1d_d_uxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x double> @gld1d_d_uxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_d_uxtw_double:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                       double* %base,
-                                                                                       <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                               double* %base,
+                                                                               <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
 }

-define <vscale x 2 x double> @gld1d_d_sxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x double> @gld1d_d_sxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_d_sxtw_double:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                       double* %base,
-                                                                                       <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                               double* %base,
+                                                                               <vscale x 2 x i32> %b)
  ret <vscale x 2 x double> %load
 }

@ -211,9 +211,9 @@ define <vscale x 4 x i32> @gld1sb_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vsca
 ; CHECK-LABEL: gld1sb_s_uxtw:
 ; CHECK: ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }
@ -222,31 +222,31 @@ define <vscale x 4 x i32> @gld1sb_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vsca
 ; CHECK-LABEL: gld1sb_s_sxtw:
 ; CHECK: ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }

-define <vscale x 2 x i64> @gld1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sb_d_uxtw:
 ; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sb_d_sxtw:
 ; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }
@ -256,9 +256,9 @@ define <vscale x 4 x i32> @gld1sh_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vsc
 ; CHECK-LABEL: gld1sh_s_uxtw:
 ; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }
@ -267,82 +267,82 @@ define <vscale x 4 x i32> @gld1sh_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vsc
 ; CHECK-LABEL: gld1sh_s_sxtw:
 ; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 4 x i32> %b)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
 }

-define <vscale x 2 x i64> @gld1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sh_d_uxtw:
 ; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sh_d_sxtw:
 ; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

 ; LD1SW
-define <vscale x 2 x i64> @gld1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sw_d_uxtw:
 ; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

-define <vscale x 2 x i64> @gld1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sw_d_sxtw:
 ; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 2 x i32> %b)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
 }

 ; LD1B/LD1SB
-declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8.nxv4i32(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
-declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8.nxv2i64(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
-declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8.nxv4i32(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
-declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8.nxv2i64(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)

 ; LD1H/LD1SH
-declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
-declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
-declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
-declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)

 ; LD1W/LD1SW
-declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
-declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
-declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
-declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)

-declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)

 ; LD1D
-declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)

-declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)