diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index af2d3f530643..814baca35f63 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -277,32 +277,6 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
   return VectorType::get(Scalar, VF);
 }
 
-/// A helper function that returns GEP instruction and knows to skip a
-/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
-/// pointee types of the 'bitcast' have the same size.
-/// For example:
-///   bitcast double** %var to i64* - can be skipped
-///   bitcast double** %var to i8* - can not
-static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
-
-  if (isa<GetElementPtrInst>(Ptr))
-    return cast<GetElementPtrInst>(Ptr);
-
-  if (isa<BitCastInst>(Ptr) &&
-      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
-    Type *BitcastTy = Ptr->getType();
-    Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
-    if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
-      return nullptr;
-    Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
-    Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
-    const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
-    if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
-      return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
-  }
-  return nullptr;
-}
-
 // FIXME: The following helper functions have multiple implementations
 // in the project. They can be effectively organized in a common Load/Store
 // utilities unit.
@@ -2996,40 +2970,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   VectorParts VectorGep;
 
   // Handle consecutive loads/stores.
-  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
   if (ConsecutiveStride) {
     Ptr = getScalarValue(Ptr, 0, 0);
   } else {
     // At this point we should have a vector version of the GEP for a gather
     // or scatter.
     assert(CreateGatherScatter && "The instruction should be scalarized");
-    if (Gep) {
-      // Vectorizing GEP, across UF parts. We want to get a vector value for
-      // base and each index that's defined inside the loop, even if it is
-      // loop-invariant but wasn't hoisted out. Otherwise we want to keep them
-      // scalar.
-      SmallVector<VectorParts, 4> OpsV;
-      for (Value *Op : Gep->operands()) {
-        Instruction *SrcInst = dyn_cast<Instruction>(Op);
-        if (SrcInst && OrigLoop->contains(SrcInst))
-          OpsV.push_back(getVectorValue(Op));
-        else
-          OpsV.push_back(VectorParts(UF, Op));
-      }
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        SmallVector<Value *, 4> Ops;
-        Value *GEPBasePtr = OpsV[0][Part];
-        for (unsigned i = 1; i < Gep->getNumOperands(); i++)
-          Ops.push_back(OpsV[i][Part]);
-        Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
-        cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
-        assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
-
-        NewGep =
-            Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
-        VectorGep.push_back(NewGep);
-      }
-    } else
-      VectorGep = getVectorValue(Ptr);
+    VectorGep = getVectorValue(Ptr);
   }
 
   VectorParts Mask = createBlockInMask(Instr->getParent());
@@ -4789,7 +4735,72 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB) {
       widenPHIInstruction(&I, UF, VF);
       continue;
     } // End of PHI.
+
+    case Instruction::GetElementPtr: {
+      // Construct a vector GEP by widening the operands of the scalar GEP as
+      // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+      // results in a vector of pointers when at least one operand of the GEP
+      // is vector-typed. Thus, to keep the representation compact, we only
+      // use vector-typed operands for loop-varying values.
+      auto *GEP = cast<GetElementPtrInst>(&I);
+      VectorParts Entry(UF);
+      if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+        // If we are vectorizing, but the GEP has only loop-invariant operands,
+        // the GEP we build (by only using vector-typed operands for
+        // loop-varying values) would be a scalar pointer. Thus, to ensure we
+        // produce a vector of pointers, we need to either arbitrarily pick an
+        // operand to broadcast, or broadcast a clone of the original GEP.
+        // Here, we broadcast a clone of the original.
+        //
+        // TODO: If at some point we decide to scalarize instructions having
+        //       loop-invariant operands, this special case will no longer be
+        //       required. We would add the scalarization decision to
+        //       collectLoopScalars() and teach getVectorValue() to broadcast
+        //       the lane-zero scalar value.
+        auto *Clone = Builder.Insert(GEP->clone());
+        for (unsigned Part = 0; Part < UF; ++Part)
+          Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
+      } else {
+        // If the GEP has at least one loop-varying operand, we are sure to
+        // produce a vector of pointers. But if we are only unrolling, we want
+        // to produce a scalar GEP for each unroll part. Thus, the GEP we
+        // produce with the code below will be scalar (if VF == 1) or vector
+        // (otherwise). Note that for the unroll-only case, we still maintain
+        // values in the vector mapping with initVector, as we do for other
+        // instructions.
+        for (unsigned Part = 0; Part < UF; ++Part) {
+
+          // The pointer operand of the new GEP. If it's loop-invariant, we
+          // won't broadcast it.
+          auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+                          ? GEP->getPointerOperand()
+                          : getVectorValue(GEP->getPointerOperand())[Part];
+
+          // Collect all the indices for the new GEP. If any index is
+          // loop-invariant, we won't broadcast it.
+          SmallVector<Value *, 4> Indices;
+          for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+            if (OrigLoop->isLoopInvariant(U.get()))
+              Indices.push_back(U.get());
+            else
+              Indices.push_back(getVectorValue(U.get())[Part]);
+          }
+
+          // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+          // but it should be a vector, otherwise.
+          auto *NewGEP = GEP->isInBounds()
+                             ? Builder.CreateInBoundsGEP(Ptr, Indices)
+                             : Builder.CreateGEP(Ptr, Indices);
+          assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+                 "NewGEP is not a pointer vector");
+          Entry[Part] = NewGEP;
+        }
+      }
+
+      VectorLoopValueMap.initVector(&I, Entry);
+      addMetadata(Entry, GEP);
+      break;
+    }
+
     case Instruction::UDiv:
     case Instruction::SDiv:
     case Instruction::SRem:
@@ -5469,46 +5480,158 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
-
-  // We should not collect Scalars more than once per VF. Right now,
-  // this function is called from collectUniformsAndScalars(), which
-  // already does this check. Collecting Scalars for VF=1 does not make any
-  // sense.
-
+  // We should not collect Scalars more than once per VF. Right now, this
+  // function is called from collectUniformsAndScalars(), which already does
+  // this check. Collecting Scalars for VF=1 does not make any sense.
   assert(VF >= 2 && !Scalars.count(VF) &&
          "This function should not be visited twice for the same VF");
 
-  // If an instruction is uniform after vectorization, it will remain scalar.
-  Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
+  SmallSetVector<Instruction *, 8> Worklist;
 
-  // Collect the getelementptr instructions that will not be vectorized. A
-  // getelementptr instruction is only vectorized if it is used for a legal
-  // gather or scatter operation.
+  // These sets are used to seed the analysis with pointers used by memory
+  // accesses that will remain scalar.
+  SmallSetVector<Instruction *, 8> ScalarPtrs;
+  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+
+  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+  // The pointer operands of loads and stores will be scalar as long as the
+  // memory access is not a gather or scatter operation. The value operand of
+  // a store will remain scalar if the store is scalarized.
+  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+    assert(WideningDecision != CM_Unknown &&
+           "Widening decision should be ready at this moment");
+    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+      if (Ptr == Store->getValueOperand())
+        return WideningDecision == CM_Scalarize;
+    assert(Ptr == getPointerOperand(MemAccess) &&
+           "Ptr is neither a value nor a pointer operand");
+    return WideningDecision != CM_GatherScatter;
+  };
+
+  // A helper that returns true if the given value is a bitcast or
+  // getelementptr instruction contained in the loop.
+  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+            isa<GetElementPtrInst>(V)) &&
+           !TheLoop->isLoopInvariant(V);
+  };
+
+  // A helper that evaluates a memory access's use of a pointer. If the use
+  // will be a scalar use, and the pointer is only used by memory accesses, we
+  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+  // PossibleNonScalarPtrs.
+  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+
+    // We only care about bitcast and getelementptr instructions contained in
+    // the loop.
+    if (!isLoopVaryingBitCastOrGEP(Ptr))
+      return;
+
+    // If the pointer has already been identified as scalar (e.g., if it was
+    // also identified as uniform), there's nothing to do.
+    auto *I = cast<Instruction>(Ptr);
+    if (Worklist.count(I))
+      return;
+
+    // If the use of the pointer will be a scalar use, and all users of the
+    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+    // place the pointer in PossibleNonScalarPtrs.
+    if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
+          return isa<LoadInst>(U) || isa<StoreInst>(U);
+        }))
+      ScalarPtrs.insert(I);
+    else
+      PossibleNonScalarPtrs.insert(I);
+  };
+
+  // We seed the scalars analysis with three classes of instructions: (1)
+  // instructions marked uniform-after-vectorization, (2) bitcast and
+  // getelementptr instructions used by memory accesses requiring a scalar
+  // use, and (3) pointer induction variables and their update instructions
+  // (we currently only scalarize these).
+  //
+  // (1) Add to the worklist all instructions that have been identified as
+  // uniform-after-vectorization.
+  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+  // (2) Add to the worklist all bitcast and getelementptr instructions used
+  // by memory accesses requiring a scalar use. The pointer operands of loads
+  // and stores will be scalar as long as the memory access is not a gather or
+  // scatter operation. The value operand of a store will remain scalar if the
+  // store is scalarized.
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
-      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
-        Scalars[VF].insert(GEP);
-        continue;
+      if (auto *Load = dyn_cast<LoadInst>(&I)) {
+        evaluatePtrUse(Load, Load->getPointerOperand());
+      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+        evaluatePtrUse(Store, Store->getPointerOperand());
+        evaluatePtrUse(Store, Store->getValueOperand());
       }
-      auto *Ptr = getPointerOperand(&I);
-      if (!Ptr)
-        continue;
-      auto *GEP = getGEPInstruction(Ptr);
-      if (GEP && getWideningDecision(&I, VF) == CM_GatherScatter)
-        Scalars[VF].erase(GEP);
+    }
+  for (auto *I : ScalarPtrs)
+    if (!PossibleNonScalarPtrs.count(I)) {
+      DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+      Worklist.insert(I);
     }
 
-  // An induction variable will remain scalar if all users of the induction
-  // variable and induction variable update remain scalar.
+  // (3) Add to the worklist all pointer induction variables and their update
+  // instructions.
+  //
+  // TODO: Once we are able to vectorize pointer induction variables we should
+  // no longer insert them into the worklist here.
   auto *Latch = TheLoop->getLoopLatch();
   for (auto &Induction : *Legal->getInductionVars()) {
     auto *Ind = Induction.first;
     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
+      continue;
+    Worklist.insert(Ind);
+    Worklist.insert(IndUpdate);
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+  }
+
+  // Expand the worklist by looking through any bitcasts and getelementptr
+  // instructions we've already identified as scalar. This is similar to the
+  // expansion step in collectLoopUniforms(); however, here we're only
+  // expanding to include additional bitcasts and getelementptr instructions.
+  unsigned Idx = 0;
+  while (Idx != Worklist.size()) {
+    Instruction *Dst = Worklist[Idx++];
+    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+      continue;
+    auto *Src = cast<Instruction>(Dst->getOperand(0));
+    if (all_of(Src->users(), [&](User *U) -> bool {
+          auto *J = cast<Instruction>(U);
+          return !TheLoop->contains(J) || Worklist.count(J) ||
+                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+                  isScalarUse(J, Src));
+        })) {
+      Worklist.insert(Src);
+      DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+    }
+  }
+
+  // An induction variable will remain scalar if all users of the induction
+  // variable and induction variable update remain scalar.
+  for (auto &Induction : *Legal->getInductionVars()) {
+    auto *Ind = Induction.first;
+    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+    // We already considered pointer induction variables, so there's no reason
+    // to look at their users again.
+    //
+    // TODO: Once we are able to vectorize pointer induction variables we
+    // should no longer skip over them here.
+    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
+      continue;
 
     // Determine if all users of the induction variable are scalar after
     // vectorization.
     auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == IndUpdate || !TheLoop->contains(I) || Scalars[VF].count(I);
+      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
     });
     if (!ScalarInd)
       continue;
 
     // Determine if all users of the induction variable update instruction are
     // scalar after vectorization.
     auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == Ind || !TheLoop->contains(I) || Scalars[VF].count(I);
+      return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
     });
     if (!ScalarIndUpdate)
       continue;
 
     // The induction variable and its update instruction will remain scalar.
-    Scalars[VF].insert(Ind);
-    Scalars[VF].insert(IndUpdate);
+    Worklist.insert(Ind);
+    Worklist.insert(IndUpdate);
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
   }
+
+  Scalars[VF].insert(Worklist.begin(), Worklist.end());
 }
 
 bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
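To make the new GetElementPtr case concrete, here is a minimal hand-written IR sketch, not taken from the patch (%a, %i, and %vec.ind are hypothetical names), of what the widening produces for VF = 4. A loop-invariant operand is left scalar, so only the loop-varying index becomes vector-typed, and the result is still a vector of pointers:

    ; Scalar GEP in the original loop:
    ;   %gep = getelementptr inbounds i32, i32* %a, i64 %i
    ; Widened form: the invariant base %a stays scalar, the index is widened.
    %vec.gep = getelementptr inbounds i32, i32* %a, <4 x i64> %vec.ind

    ; If every operand is loop-invariant (and VF > 1), the GEP is instead
    ; cloned and the cloned pointer is broadcast:
    %clone = getelementptr inbounds i32, i32* %a, i64 3
    %splat.init = insertelement <4 x i32*> undef, i32* %clone, i32 0
    %vec.splat = shufflevector <4 x i32*> %splat.init, <4 x i32*> undef, <4 x i32> zeroinitializer

The vector_gep_stored and uniform_vector_gep_stored tests added below exercise exactly these two paths.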
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index 820335276dc7..82f2e064a581 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -13,23 +13,33 @@ target triple = "x86_64-unknown-linux-gnu"
 ; scatter operation. %tmp3 (and the induction variable) should not be marked
 ; uniform-after-vectorization.
 ;
-; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
-; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
-; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
-; CHECK: vector.body:
-; CHECK: %index = phi i64
-; CHECK: %vec.ind = phi <16 x i64>
-; CHECK: %[[T0:.+]] = mul i64 %index, 5
-; CHECK: %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]]
-; CHECK: %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>*
-; CHECK: load <80 x float>, <80 x float>* %[[T2]], align 4
-; CHECK: %[[T3:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %[[T0]]
-; CHECK: %[[T4:.+]] = bitcast float* %[[T3]] to <80 x float>*
-; CHECK: load <80 x float>, <80 x float>* %[[T4]], align 4
-; CHECK: %VectorGep = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> %vec.ind
-; CHECK: call void @llvm.masked.scatter.v16f32({{.*}}, <16 x float*> %VectorGep, {{.*}})
-; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK:     LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
+; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
+; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
+; CHECK:     vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:     vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x <80 x float>*> [[BC]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
+; CHECK:     br i1 {{.*}}, label %middle.block, label %vector.body
 
 %data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
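The updated checks above show the boundary this patch draws: a consecutive access still goes through one scalar GEP that is bitcast to a wide pointer, while a non-consecutive access consumes the widened GEP directly. A reduced hand-written sketch of the two shapes (hypothetical %a, %offset.idx, and %vec.ind; VF = 4; the pre-opaque-pointer masked-gather signature of this era):

    ; Consecutive: scalar address computation, one wide load.
    %p = getelementptr inbounds float, float* %a, i64 %offset.idx
    %wide.p = bitcast float* %p to <4 x float>*
    %v = load <4 x float>, <4 x float>* %wide.p, align 4

    ; Non-consecutive: the GEP itself becomes a vector of pointers.
    %vec.p = getelementptr inbounds float, float* %a, <4 x i64> %vec.ind
    %g = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %vec.p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)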
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index ec67e632efbd..bda4b2454ee2 100755
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -16,97 +16,23 @@ target triple = "x86_64-apple-macosx10.11.0"
 define void @_Z3fn1v() #0 {
 ; CHECK-LABEL: @_Z3fn1v(
 ; CHECK: vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX:%.*]].next, %vector.body ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [
-; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <16 x i64> [
-; CHECK-NEXT:    [[SHL:%.*]] = shl i64 %index, 1
-; CHECK-NEXT:    %offset.idx = add i64 [[SHL]], 8
-; CHECK-NEXT:    [[IND00:%.*]] = add i64 %offset.idx, 0
-; CHECK-NEXT:    [[IND02:%.*]] = add i64 %offset.idx, 2
-; CHECK-NEXT:    [[IND04:%.*]] = add i64 %offset.idx, 4
-; CHECK-NEXT:    [[IND06:%.*]] = add i64 %offset.idx, 6
-; CHECK-NEXT:    [[IND08:%.*]] = add i64 %offset.idx, 8
-; CHECK-NEXT:    [[IND10:%.*]] = add i64 %offset.idx, 10
-; CHECK-NEXT:    [[IND12:%.*]] = add i64 %offset.idx, 12
-; CHECK-NEXT:    [[IND14:%.*]] = add i64 %offset.idx, 14
-; CHECK-NEXT:    [[IND16:%.*]] = add i64 %offset.idx, 16
-; CHECK-NEXT:    [[IND18:%.*]] = add i64 %offset.idx, 18
-; CHECK-NEXT:    [[IND20:%.*]] = add i64 %offset.idx, 20
-; CHECK-NEXT:    [[IND22:%.*]] = add i64 %offset.idx, 22
-; CHECK-NEXT:    [[IND24:%.*]] = add i64 %offset.idx, 24
-; CHECK-NEXT:    [[IND26:%.*]] = add i64 %offset.idx, 26
-; CHECK-NEXT:    [[IND28:%.*]] = add i64 %offset.idx, 28
-; CHECK-NEXT:    [[IND30:%.*]] = add i64 %offset.idx, 30
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND00]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND02]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND04]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND06]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND08]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND10]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND12]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND14]]
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND16]]
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND18]]
-; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND20]]
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND22]]
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND24]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]]
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]]
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9
-; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11
-; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14
-; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15
-; CHECK-NEXT:    [[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
-; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <16 x i64> [[TMP59]], i32 0
-; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP12]], i64 [[TMP61]], i64 0
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <16 x i64> [[TMP59]], i32 1
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP15]], i64 [[TMP65]], i64 0
-; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <16 x i64> [[TMP59]], i32 2
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP18]], i64 [[TMP69]], i64 0
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <16 x i64> [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP21]], i64 [[TMP73]], i64 0
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <16 x i64> [[TMP59]], i32 4
-; CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP24]], i64 [[TMP77]], i64 0
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <16 x i64> [[TMP59]], i32 5
-; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP27]], i64 [[TMP81]], i64 0
-; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <16 x i64> [[TMP59]], i32 6
-; CHECK-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP30]], i64 [[TMP85]], i64 0
-; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <16 x i64> [[TMP59]], i32 7
-; CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP33]], i64 [[TMP89]], i64 0
-; CHECK-NEXT:    [[TMP93:%.*]] = extractelement <16 x i64> [[TMP59]], i32 8
-; CHECK-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP36]], i64 [[TMP93]], i64 0
-; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <16 x i64> [[TMP59]], i32 9
-; CHECK-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP39]], i64 [[TMP97]], i64 0
-; CHECK-NEXT:    [[TMP101:%.*]] = extractelement <16 x i64> [[TMP59]], i32 10
-; CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP42]], i64 [[TMP101]], i64 0
-; CHECK-NEXT:    [[TMP105:%.*]] = extractelement <16 x i64> [[TMP59]], i32 11
-; CHECK-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP45]], i64 [[TMP105]], i64 0
-; CHECK-NEXT:    [[TMP109:%.*]] = extractelement <16 x i64> [[TMP59]], i32 12
-; CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP48]], i64 [[TMP109]], i64 0
-; CHECK-NEXT:    [[TMP113:%.*]] = extractelement <16 x i64> [[TMP59]], i32 13
-; CHECK-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP51]], i64 [[TMP113]], i64 0
-; CHECK-NEXT:    [[TMP117:%.*]] = extractelement <16 x i64> [[TMP59]], i32 14
-; CHECK-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP54]], i64 [[TMP117]], i64 0
-; CHECK-NEXT:    [[TMP121:%.*]] = extractelement <16 x i64> [[TMP59]], i32 15
-; CHECK-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP57]], i64 [[TMP121]], i64 0
-; CHECK-NEXT:    [[VECTORGEP:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP58]], <16 x i64> [[TMP59]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[VECTORGEP]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK:         [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK:         [[STEP_ADD4:%.*]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK:     br i1 {{.*}}, label %middle.block, label %vector.body
+;
 entry:
   %0 = load i32, i32* @c, align 4
   %cmp34 = icmp sgt i32 %0, 8
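The rewritten checks above are the compact representation in action: a GEP yields a vector of pointers as soon as any one operand is vector-typed, so loop-invariant operands never need to be splatted, and the sixteen scalar GEPs plus sixteen insertelements per part disappear. The same shape, reduced to a hand-written sketch (hypothetical %i and %j):

    ; Scalar base + vector index -> vector of pointers, <16 x [10 x i32]*>.
    %rows = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> %i
    ; Vector base + mixed vector/scalar indices -> vector of pointers, <16 x i32*>.
    %elts = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> %rows, <16 x i64> %j, i64 0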
diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
new file mode 100644
index 000000000000..4dcd5993c128
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
@@ -0,0 +1,143 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: vector_gep
+; CHECK-NOT:   LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <2 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>*
+; CHECK-NEXT:    store <2 x i32*> [[TMP1]], <2 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @vector_gep(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: scalar_store
+; CHECK:       LV: Found scalar instruction: %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT:    store i32* [[TMP5]], i32** [[TMP7]], align 8
+; CHECK-NEXT:    store i32* [[TMP6]], i32** [[TMP8]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @scalar_store(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: expansion
+; CHECK:       LV: Found scalar instruction: %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp1 = bitcast i64* %tmp0 to i32*
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32** [[TMP7]] to i64**
+; CHECK-NEXT:    store i64* [[TMP5]], i64** [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32** [[TMP8]] to i64**
+; CHECK-NEXT:    store i64* [[TMP6]], i64** [[TMP10]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @expansion(i32** %a, i64 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
+  %tmp1 = bitcast i64* %tmp0 to i32*
+  %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
+  %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
+  store i32* %tmp1, i32** %tmp3, align 8
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: no_gep_or_bitcast
+; CHECK-NOT:   LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8
+; CHECK:       LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    store i32 0, i32* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    store i32 0, i32* [[TMP4]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32*, i32** %a, i64 %i
+  %tmp1 = load i32*, i32** %tmp0, align 8
+  store i32 0, i32* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
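For the expansion test above, the debug output traces how collectLoopScalars() grows its worklist. The annotation below is hand-written commentary on that loop body, assuming the strided store is scalarized (it is not consecutive at step 2):

    %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i       ; added by expansion: its only user %tmp1 is already scalar
    %tmp1 = bitcast i64* %tmp0 to i32*                        ; seeded: scalar value operand of a scalarized store
    %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0      ; added by expansion: its only user %tmp3 is already scalar
    %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i  ; seeded: scalar pointer operand of the store
    store i32* %tmp1, i32** %tmp3, align 8

Expansion stops at %tmp2 because its operand %a is a function argument, not a loop-varying bitcast or GEP; this matches the order of the "Found scalar instruction" lines checked in the test.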
diff --git a/llvm/test/Transforms/LoopVectorize/vector-geps.ll b/llvm/test/Transforms/LoopVectorize/vector-geps.ll
new file mode 100644
index 000000000000..bd79499d5d34
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vector-geps.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @vector_gep_stored(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>*
+; CHECK-NEXT:    store <4 x i32*> [[TMP1]], <4 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @uniform_vector_gep_stored(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, i64 1
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32*> [[DOTSPLATINSERT]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>*
+; CHECK-NEXT:    store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 1
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
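One path the new tests do not cover is unroll-only vectorization. Per the comment in the GetElementPtr case above, VF == 1 with UF > 1 keeps one scalar GEP per unroll part. A speculative hand-written sketch, not a test from the patch, of what the vector_gep_stored loop body would produce under hypothetical flags -force-vector-width=1 -force-vector-interleave=2 (all names invented):

    ; Part 0 addresses the current iteration, part 1 the next one.
    %gep.part0 = getelementptr inbounds i32, i32* %b, i64 %index
    %index.1 = add i64 %index, 1
    %gep.part1 = getelementptr inbounds i32, i32* %b, i64 %index.1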