[SelectionDAGBuilder][CGP][X86] Move some of SDB's gather/scatter uniform base handling to CGP.
I've always found "findValue" a little odd and inconsistent with other things in SDB.

This simplifies the code in SDB to just handle a splat constant address or a 2-operand GEP in the same BB. This removes the need for "findValue", since the operands to the GEP are guaranteed to be available. The splat constant handling is new, but was needed to avoid regressions due to constant folding combining GEPs created in CGP.

CGP is now responsible for canonicalizing gathers/scatters into this form. The pattern I'm using for scalarizing, a scalar GEP followed by a GEP with an all-zeroes index, seems to be subject to constant folding that the insertelement+shufflevector pattern was not.

Differential Revision: https://reviews.llvm.org/D76947
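
To illustrate the canonical form (a sketch distilled from the scalar_index test added below; the value names are illustrative):

  ; Before CGP: a splat base pointer with a scalar index.
  %splatinsert = insertelement <4 x i32*> undef, i32* %base, i32 0
  %splat = shufflevector <4 x i32*> %splatinsert, <4 x i32*> undef, <4 x i32> zeroinitializer
  %gep = getelementptr i32, <4 x i32*> %splat, i64 %index
  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)

  ; After CGP: a scalar GEP folds the base and index together, and a second
  ; GEP with an all-zeroes vector index gives SDB a uniform base plus a zero
  ; vector index.
  %scalar.gep = getelementptr i32, i32* %base, i64 %index
  %vector.gep = getelementptr i32, i32* %scalar.gep, <4 x i64> zeroinitializer
  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %vector.gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)

  ; A fully-constant address (e.g. the global_struct_splat test) instead
  ; constant-folds to a splat constant vector, which SDB now handles directly.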
Parent: 741d3c2016
Commit: 944cc5e0ab

--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -368,6 +368,7 @@ class TypePromotionTransaction;
     bool optimizeInst(Instruction *I, bool &ModifiedDT);
     bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                             Type *AccessTy, unsigned AddrSpace);
+    bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
     bool optimizeInlineAsmInst(CallInst *CS);
     bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
     bool optimizeExt(Instruction *&I);
@@ -2041,7 +2042,12 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
         II->eraseFromParent();
         return true;
       }
       break;
     }
+    case Intrinsic::masked_gather:
+      return optimizeGatherScatterInst(II, II->getArgOperand(0));
+    case Intrinsic::masked_scatter:
+      return optimizeGatherScatterInst(II, II->getArgOperand(1));
     }
+
     SmallVector<Value *, 2> PtrOps;
@@ -5182,6 +5188,119 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   return true;
 }
 
+/// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
+/// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
+/// only handle a 2 operand GEP in the same basic block or a splat constant
+/// vector. The 2 operands to the GEP must have a scalar pointer and a vector
+/// index.
+///
+/// If the existing GEP has a vector base pointer that is splat, we can look
+/// through the splat to find the scalar pointer. If we can't find a scalar
+/// pointer there's nothing we can do.
+///
+/// If we have a GEP with more than 2 indices where the middle indices are all
+/// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
+///
+/// If the final index isn't a vector or is a splat, we can emit a scalar GEP
+/// followed by a GEP with an all zeroes vector index. This will enable
+/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
+/// zero index.
+bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
+                                               Value *Ptr) {
+  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!GEP || !GEP->hasIndices())
+    return false;
+
+  // If the GEP and the gather/scatter aren't in the same BB, don't optimize.
+  // FIXME: We should support this by sinking the GEP.
+  if (MemoryInst->getParent() != GEP->getParent())
+    return false;
+
+  SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end());
+
+  bool RewriteGEP = false;
+
+  if (Ops[0]->getType()->isVectorTy()) {
+    Ops[0] = const_cast<Value *>(getSplatValue(Ops[0]));
+    if (!Ops[0])
+      return false;
+    RewriteGEP = true;
+  }
+
+  unsigned FinalIndex = Ops.size() - 1;
+
+  // Ensure all but the last index is 0.
+  // FIXME: This isn't strictly required. All that's required is that they are
+  // all scalars or splats.
+  for (unsigned i = 1; i < FinalIndex; ++i) {
+    auto *C = dyn_cast<Constant>(Ops[i]);
+    if (!C)
+      return false;
+    if (isa<VectorType>(C->getType()))
+      C = C->getSplatValue();
+    auto *CI = dyn_cast_or_null<ConstantInt>(C);
+    if (!CI || !CI->isZero())
+      return false;
+    // Scalarize the index if needed.
+    Ops[i] = CI;
+  }
+
+  // Try to scalarize the final index.
+  if (Ops[FinalIndex]->getType()->isVectorTy()) {
+    if (Value *V = const_cast<Value *>(getSplatValue(Ops[FinalIndex]))) {
+      auto *C = dyn_cast<ConstantInt>(V);
+      // Don't scalarize all zeros vector.
+      if (!C || !C->isZero()) {
+        Ops[FinalIndex] = V;
+        RewriteGEP = true;
+      }
+    }
+  }
+
+  // If we made any changes or we have extra operands, we need to generate
+  // new instructions.
+  if (!RewriteGEP && Ops.size() == 2)
+    return false;
+
+  unsigned NumElts = Ptr->getType()->getVectorNumElements();
+
+  IRBuilder<> Builder(MemoryInst);
+
+  Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
+
+  Value *NewAddr;
+
+  // If the final index isn't a vector, emit a scalar GEP containing all ops
+  // and a vector GEP with all zeroes final index.
+  if (!Ops[FinalIndex]->getType()->isVectorTy()) {
+    NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front());
+    Type *IndexTy = VectorType::get(ScalarIndexTy, NumElts);
+    NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy));
+  } else {
+    Value *Base = Ops[0];
+    Value *Index = Ops[FinalIndex];
+
+    // Create a scalar GEP if there are more than 2 operands.
+    if (Ops.size() != 2) {
+      // Replace the last index with 0.
+      Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy);
+      Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front());
+    }
+
+    // Now create the GEP with scalar pointer and vector index.
+    NewAddr = Builder.CreateGEP(Base, Index);
+  }
+
+  MemoryInst->replaceUsesOfWith(Ptr, NewAddr);
+
+  // If we have no uses, recursively delete the value and all dead instructions
+  // using it.
+  if (Ptr->use_empty())
+    RecursivelyDeleteTriviallyDeadInstructions(Ptr, TLInfo);
+
+  return true;
+}
+
 /// If there are any memory operands, use OptimizeMemoryInst to sink their
 /// address computing into the block when possible / profitable.
 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {

--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1435,12 +1435,6 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) {
   return Val;
 }
 
-// Return true if SDValue exists for the given Value
-bool SelectionDAGBuilder::findValue(const Value *V) const {
-  return (NodeMap.find(V) != NodeMap.end()) ||
-         (FuncInfo.ValueMap.find(V) != FuncInfo.ValueMap.end());
-}
-
 /// getNonRegisterValue - Return an SDValue for the given Value, but
 /// don't look in FuncInfo.ValueMap for a virtual register.
 SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
@@ -4254,70 +4248,49 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
 // In all other cases the function returns 'false'.
 static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index,
                            ISD::MemIndexType &IndexType, SDValue &Scale,
-                           SelectionDAGBuilder *SDB) {
+                           SelectionDAGBuilder *SDB, const BasicBlock *CurBB) {
   SelectionDAG& DAG = SDB->DAG;
-  LLVMContext &Context = *DAG.getContext();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const DataLayout &DL = DAG.getDataLayout();
 
   assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
 
+  // Handle splat constant pointer.
+  if (auto *C = dyn_cast<Constant>(Ptr)) {
+    C = C->getSplatValue();
+    if (!C)
+      return false;
+
+    Base = SDB->getValue(C);
+
+    unsigned NumElts = Ptr->getType()->getVectorNumElements();
+    EVT VT = EVT::getVectorVT(*DAG.getContext(), TLI.getPointerTy(DL), NumElts);
+    Index = DAG.getConstant(0, SDB->getCurSDLoc(), VT);
+    IndexType = ISD::SIGNED_SCALED;
+    Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
+    return true;
+  }
+
   const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
-  if (!GEP)
+  if (!GEP || GEP->getParent() != CurBB)
     return false;
 
+  if (GEP->getNumOperands() != 2)
+    return false;
+
   const Value *BasePtr = GEP->getPointerOperand();
-  if (BasePtr->getType()->isVectorTy()) {
-    BasePtr = getSplatValue(BasePtr);
-    if (!BasePtr)
-      return false;
-  }
+  const Value *IndexVal = GEP->getOperand(GEP->getNumOperands() - 1);
 
-  unsigned FinalIndex = GEP->getNumOperands() - 1;
-  Value *IndexVal = GEP->getOperand(FinalIndex);
-  gep_type_iterator GTI = gep_type_begin(*GEP);
-
-  // Ensure all the other indices are 0.
-  for (unsigned i = 1; i < FinalIndex; ++i, ++GTI) {
-    auto *C = dyn_cast<Constant>(GEP->getOperand(i));
-    if (!C)
-      return false;
-    if (isa<VectorType>(C->getType()))
-      C = C->getSplatValue();
-    auto *CI = dyn_cast_or_null<ConstantInt>(C);
-    if (!CI || !CI->isZero())
-      return false;
-  }
-
-  // The operands of the GEP may be defined in another basic block.
-  // In this case we'll not find nodes for the operands.
-  if (!SDB->findValue(BasePtr))
-    return false;
-  Constant *C = dyn_cast<Constant>(IndexVal);
-  if (!C && !SDB->findValue(IndexVal))
+  // Make sure the base is scalar and the index is a vector.
+  if (BasePtr->getType()->isVectorTy() || !IndexVal->getType()->isVectorTy())
     return false;
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  const DataLayout &DL = DAG.getDataLayout();
-  StructType *STy = GTI.getStructTypeOrNull();
-
-  if (STy) {
-    const StructLayout *SL = DL.getStructLayout(STy);
-    unsigned Field = cast<Constant>(IndexVal)->getUniqueInteger().getZExtValue();
-    Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
-    Index = DAG.getConstant(SL->getElementOffset(Field),
-                            SDB->getCurSDLoc(), TLI.getPointerTy(DL));
-  } else {
-    Scale = DAG.getTargetConstant(
-                DL.getTypeAllocSize(GEP->getResultElementType()),
-                SDB->getCurSDLoc(), TLI.getPointerTy(DL));
-    Index = SDB->getValue(IndexVal);
-  }
   Base = SDB->getValue(BasePtr);
+  Index = SDB->getValue(IndexVal);
   IndexType = ISD::SIGNED_SCALED;
-
-  if (STy || !Index.getValueType().isVector()) {
-    unsigned GEPWidth = cast<VectorType>(GEP->getType())->getNumElements();
-    EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth);
-    Index = DAG.getSplatBuildVector(VT, SDLoc(Index), Index);
-  }
+  Scale = DAG.getTargetConstant(
+              DL.getTypeAllocSize(GEP->getResultElementType()),
+              SDB->getCurSDLoc(), TLI.getPointerTy(DL));
   return true;
 }
@@ -4341,7 +4314,8 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
   SDValue Index;
   ISD::MemIndexType IndexType;
   SDValue Scale;
-  bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this);
+  bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+                                    I.getParent());
 
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
@@ -4452,7 +4426,8 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
   SDValue Index;
   ISD::MemIndexType IndexType;
   SDValue Scale;
-  bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this);
+  bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+                                    I.getParent());
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOLoad,

--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -518,7 +518,6 @@ public:
   void resolveOrClearDbgInfo();
 
   SDValue getValue(const Value *V);
-  bool findValue(const Value *V) const;
 
   /// Return the SDNode for the specified IR value if it exists.
   SDNode *getNodeForIRValue(const Value *V) {

--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1721,11 +1721,10 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
 ; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512-NEXT:    kshiftlw $8, %k0, %k0
 ; AVX512-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    kmovw %k1, %k2
-; AVX512-NEXT:    vpgatherdd c(,%zmm0,4), %zmm1 {%k2}
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
-; AVX512-NEXT:    vpgatherdd c(,%zmm0), %zmm2 {%k1}
+; AVX512-NEXT:    vpgatherdd c+12(,%zmm0,4), %zmm1 {%k2}
+; AVX512-NEXT:    vpgatherdd c+28(,%zmm0,4), %zmm2 {%k1}
 ; AVX512-NEXT:    vpaddd %ymm2, %ymm2, %ymm0
 ; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq

--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -638,30 +638,38 @@ entry:
 define <16 x float> @test11(float* %base, i32 %ind) {
 ; KNL_64-LABEL: test11:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpbroadcastd %esi, %zmm1
+; KNL_64-NEXT:    movslq %esi, %rax
+; KNL_64-NEXT:    leaq (%rdi,%rax,4), %rax
+; KNL_64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT:    vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test11:
 ; KNL_32:       # %bb.0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %zmm1
+; KNL_32-NEXT:    shll $2, %eax
+; KNL_32-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test11:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd %esi, %zmm1
+; SKX-NEXT:    movslq %esi, %rax
+; SKX-NEXT:    leaq (%rdi,%rax,4), %rax
+; SKX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; SKX-NEXT:    vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test11:
 ; SKX_32:       # %bb.0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %zmm1
+; SKX_32-NEXT:    shll $2, %eax
+; SKX_32-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; SKX_32-NEXT:    retl

--- a/llvm/test/CodeGen/X86/pr45067.ll
+++ b/llvm/test/CodeGen/X86/pr45067.ll
@@ -6,13 +6,13 @@
 define void @foo(<8 x i32>* %x, <8 x i1> %y) {
 ; CHECK-LABEL: foo:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpbroadcastq _global@{{.*}}(%rip), %ymm2
-; CHECK-NEXT:    vpgatherqd %xmm1, (,%ymm2), %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    movq _global@{{.*}}(%rip), %rax
+; CHECK-NEXT:    vpgatherdd %ymm1, (%rax,%ymm2), %ymm3
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; CHECK-NEXT:    vpslld $31, %ymm0, %ymm0
-; CHECK-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm1
-; CHECK-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
+; CHECK-NEXT:    vpmaskmovd %ymm3, %ymm0, (%rdi)
 ; CHECK-NEXT:    ud2
   %tmp = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> <i32* @global, i32* @global, i32* @global, i32* @global, i32* @global, i32* @global, i32* @global, i32* @global>, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
   call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %tmp, <8 x i32>* %x, i32 4, <8 x i1> %y)

--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.a = type { i32, i32 }
+@c = external dso_local global %struct.a, align 4
+@glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
+
+define <4 x i32> @splat_base(i32* %base, <4 x i64> %index) {
+; CHECK-LABEL: @splat_base(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], <4 x i64> [[INDEX:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <4 x i32*> undef, i32* %base, i32 0
+  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %gep = getelementptr i32, <4 x i32*> %broadcast.splat, <4 x i64> %index
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @splat_struct(%struct.a* %base) {
+; CHECK-LABEL: @splat_struct(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_A:%.*]], %struct.a* [[BASE:%.*]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %gep = getelementptr %struct.a, %struct.a* %base, <4 x i64> zeroinitializer, i32 1
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @scalar_index(i32* %base, i64 %index) {
+; CHECK-LABEL: @scalar_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <4 x i32*> undef, i32* %base, i32 0
+  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %gep = getelementptr i32, <4 x i32*> %broadcast.splat, i64 %index
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @splat_index(i32* %base, i64 %index) {
+; CHECK-LABEL: @splat_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
+  %gep = getelementptr i32, i32* %base, <4 x i64> %broadcast.splat
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_global_array(<4 x i64> %indxs) {
+; CHECK-LABEL: @test_global_array(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @glob_array, i64 0, i64 0), <4 x i64> [[INDXS:%.*]]
+; CHECK-NEXT:    [[G:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[G]]
+;
+  %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <4 x i64> %indxs
+  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @global_struct_splat() {
+; CHECK-LABEL: @global_struct_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> <i32* getelementptr inbounds (%struct.a, %struct.a* @c, i64 0, i32 1), i32* getelementptr inbounds (%struct.a, %struct.a* @c, i64 0, i32 1), i32* getelementptr inbounds (%struct.a, %struct.a* @c, i64 0, i32 1), i32* getelementptr inbounds (%struct.a, %struct.a* @c, i64 0, i32 1)>, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = insertelement <4 x %struct.a*> undef, %struct.a* @c, i32 0
+  %2 = shufflevector <4 x %struct.a*> %1, <4 x %struct.a*> undef, <4 x i32> zeroinitializer
+  %3 = getelementptr %struct.a, <4 x %struct.a*> %2, <4 x i64> zeroinitializer, i32 1
+  %4 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %4
+}
+
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)