diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index c890216c9e01..012ad1cd782d 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -686,10 +686,8 @@ public: if (getMember(getFactor() - 1)) return false; - // We have a group with gaps. It therefore cannot be a group of stores, - // and it can't be a reversed access, because such groups get invalidated. - assert(!getMember(0)->mayWriteToMemory() && - "Group should have been invalidated"); + // We have a group with gaps. It therefore can't be a reversed access, + // because such groups get invalidated (TODO). assert(!isReverse() && "Group should have been invalidated"); // This is a group of loads, with gaps, and without a last-member diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 205680348420..e07cd068a63b 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1212,9 +1212,9 @@ public: // used (those corresponding to elements [0:1] and [8:9] of the unlegalized // type). The other loads are unused. // - // We only scale the cost of loads since interleaved store groups aren't - // allowed to have gaps. - if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) { + // TODO: Note that legalization can turn masked loads/stores into unmasked + // (legalized) loads/stores. This can be reflected in the cost. + if (VecTySize > VecTyLTSize) { // The number of loads of a legal type it will take to represent a load // of the unlegalized vector type. unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); @@ -1235,6 +1235,8 @@ public: } // Then plus the cost of interleave operation. + assert(Indices.size() <= Factor && + "Interleaved memory op has too many members"); if (Opcode == Instruction::Load) { // The interleave cost is similar to extract sub vectors' elements // from the wide vector, and insert them into sub vectors. @@ -1244,44 +1246,49 @@ public: // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 // The cost is estimated as extract elements at 0, 2, 4, 6 from the // <8 x i32> vector and insert them into a <4 x i32> vector. - - assert(Indices.size() <= Factor && - "Interleaved memory op has too many members"); - for (unsigned Index : Indices) { assert(Index < Factor && "Invalid index for interleaved memory op"); // Extract elements from loaded vector for each sub vector. - for (unsigned i = 0; i < NumSubElts; i++) + for (unsigned Elm = 0; Elm < NumSubElts; Elm++) Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT, - Index + i * Factor); + Index + Elm * Factor); } InstructionCost InsSubCost = 0; - for (unsigned i = 0; i < NumSubElts; i++) + for (unsigned Elm = 0; Elm < NumSubElts; Elm++) InsSubCost += - thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i); + thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, Elm); Cost += Indices.size() * InsSubCost; } else { - // The interleave cost is extract all elements from sub vectors, and + // The interleave cost is extract elements from sub vectors, and // insert them into the wide vector. // - // E.g. An interleaved store of factor 2: - // %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7> - // store <8 x i32> %interleaved.vec, <8 x i32>* %ptr - // The cost is estimated as extract all elements from both <4 x i32> - // vectors and insert into the <8 x i32> vector. - + // E.g. 
An interleaved store of factor 3 with 2 members at indices 0,1: + // (using VF=4): + // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> + // %gaps.mask = <true, true, false, true, true, false, + // true, true, false, true, true, false> + // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, + // i32 Align, <12 x i1> %gaps.mask + // The cost is estimated as extract all elements (of actual members, + // excluding gaps) from both <4 x i32> vectors and insert into the <12 x + // i32> vector. InstructionCost ExtSubCost = 0; - for (unsigned i = 0; i < NumSubElts; i++) - ExtSubCost += - thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i); - Cost += ExtSubCost * Factor; + for (unsigned Elm = 0; Elm < NumSubElts; Elm++) + ExtSubCost += thisT()->getVectorInstrCost(Instruction::ExtractElement, + SubVT, Elm); + Cost += ExtSubCost * Indices.size(); - for (unsigned i = 0; i < NumElts; i++) - Cost += static_cast<T *>(this) - ->getVectorInstrCost(Instruction::InsertElement, VT, i); + for (unsigned Index : Indices) { + assert(Index < Factor && "Invalid index for interleaved memory op"); + + // Insert elements from loaded vector for each sub vector. + for (unsigned Elm = 0; Elm < NumSubElts; Elm++) + Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VT, + Index + Elm * Factor); + } } if (!UseMaskForCond) diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 0a14a1432934..ce91e16fdfb4 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1193,15 +1193,23 @@ void InterleavedAccessInfo::analyzeInterleaving( } // Iteration over A accesses. } // Iteration over B accesses. - // Remove interleaved store groups with gaps. - for (auto *Group : StoreGroups) - if (Group->getNumMembers() != Group->getFactor()) { - LLVM_DEBUG( - dbgs() << "LV: Invalidate candidate interleaved store group due " - "to gaps.\n"); - releaseGroup(Group); - } - // Remove interleaved groups with gaps (currently only loads) whose memory + auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group, + int Index, + std::string FirstOrLast) -> bool { + Instruction *Member = Group->getMember(Index); + assert(Member && "Group member does not exist"); + Value *MemberPtr = getLoadStorePointerOperand(Member); + if (getPtrStride(PSE, MemberPtr, TheLoop, Strides, /*Assume=*/false, + /*ShouldCheckWrap=*/true)) + return false; + LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " + << FirstOrLast + << " group member potentially pointer-wrapping.\n"); + releaseGroup(Group); + return true; + }; + + // Remove interleaved groups with gaps whose memory // accesses may wrap around. We have to revisit the getPtrStride analysis, // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does // not check wrapping (see documentation there). @@ -1227,26 +1235,12 @@ void InterleavedAccessInfo::analyzeInterleaving( // So we check only group member 0 (which is always guaranteed to exist), // and group member Factor - 1; If the latter doesn't exist we rely on // peeling (if it is a non-reversed access -- see Case 3).
- Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0)); - if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, - /*ShouldCheckWrap=*/true)) { - LLVM_DEBUG( - dbgs() << "LV: Invalidate candidate interleaved group due to " - "first group member potentially pointer-wrapping.\n"); - releaseGroup(Group); + if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first"))) continue; - } - Instruction *LastMember = Group->getMember(Group->getFactor() - 1); - if (LastMember) { - Value *LastMemberPtr = getLoadStorePointerOperand(LastMember); - if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, - /*ShouldCheckWrap=*/true)) { - LLVM_DEBUG( - dbgs() << "LV: Invalidate candidate interleaved group due to " - "last group member potentially pointer-wrapping.\n"); - releaseGroup(Group); - } - } else { + if (Group->getMember(Group->getFactor() - 1)) + InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, + std::string("last")); + else { // Case 3: A non-reversed interleaved load group with gaps: We need // to execute at least one scalar epilogue iteration. This will ensure // we don't speculatively access memory out-of-bounds. We only need @@ -1264,6 +1258,39 @@ void InterleavedAccessInfo::analyzeInterleaving( RequiresScalarEpilogue = true; } } + + for (auto *Group : StoreGroups) { + // Case 1: A full group. Can skip the checks; for full groups, if the wide + // store would wrap around the address space we would do a memory access at + // nullptr even without the transformation. + if (Group->getNumMembers() == Group->getFactor()) + continue; + + // An interleave-store-group with gaps is implemented using a masked wide store. + // Remove interleaved store groups with gaps if + // masked-interleaved-accesses are not enabled by the target. + if (!EnablePredicatedInterleavedMemAccesses) { + LLVM_DEBUG( + dbgs() << "LV: Invalidate candidate interleaved store group due " + "to gaps.\n"); + releaseGroup(Group); + continue; + } + + // Case 2: If first and last members of the group don't wrap, this implies + // that all the pointers in the group don't wrap. + // So we check only group member 0 (which is always guaranteed to exist), + // and the last group member. Case 3 (scalar epilog) is not relevant for + // stores with gaps, which are implemented with masked-store (rather than + // speculative access, as in loads). + if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first"))) + continue; + for (int Index = Group->getFactor() - 1; Index > 0; Index--) + if (Group->getMember(Index)) { + InvalidateGroupIfMemberMayWrap(Group, Index, std::string("last")); + break; + } + } } void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7fa2cabc73b1..742a41dc47c7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2837,12 +2837,25 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. + MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); + assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) && + "masked interleaved groups are not allowed."); + assert((!MaskForGaps || !VF.isScalable()) && + "masking gaps for scalable vectors is not yet supported."); for (unsigned Part = 0; Part < UF; Part++) { // Collect the stored vector from each member.
SmallVector<Value *, 4> StoredVecs; for (unsigned i = 0; i < InterleaveFactor; i++) { - // Interleaved store group doesn't allow a gap, so each index has a member - assert(Group->getMember(i) && "Fail to get a member from an interleaved store group"); + assert((Group->getMember(i) || MaskForGaps) && + "Fail to get a member from an interleaved store group"); + Instruction *Member = Group->getMember(i); + + // Skip the gaps in the group. + if (!Member) { + Value *Undef = PoisonValue::get(SubVT); + StoredVecs.push_back(Undef); + continue; + } Value *StoredVec = State.get(StoredValues[i], Part); @@ -2866,16 +2879,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( "interleaved.vec"); Instruction *NewStoreInstr; - if (BlockInMask) { - Value *BlockInMaskPart = State.get(BlockInMask, Part); - Value *ShuffledMask = Builder.CreateShuffleVector( - BlockInMaskPart, - createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), - "interleaved.mask"); - NewStoreInstr = Builder.CreateMaskedStore( - IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); - } - else + if (BlockInMask || MaskForGaps) { + Value *GroupMask = MaskForGaps; + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + Value *ShuffledMask = Builder.CreateShuffleVector( + BlockInMaskPart, + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); + GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, + ShuffledMask, MaskForGaps) + : ShuffledMask; + } + NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part], + Group->getAlign(), GroupMask); + } else NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign()); @@ -5274,12 +5292,19 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that - // needs predication, or it was decided to use masking to deal with gaps. + // needs predication, or it was decided to use masking to deal with gaps + // (either a gap at the end of a load-access that may result in a speculative + // load, or any gaps in a store-access). bool PredicatedAccessRequiresMasking = Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); - bool AccessWithGapsRequiresMasking = - Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); - if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) + bool LoadAccessWithGapsRequiresEpilogMasking = + isa<LoadInst>(I) && Group->requiresScalarEpilogue() && + !isScalarEpilogueAllowed(); + bool StoreAccessWithGapsRequiresMasking = + isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()); + if (!PredicatedAccessRequiresMasking && + !LoadAccessWithGapsRequiresEpilogMasking && + !StoreAccessWithGapsRequiresMasking) return true; // If masked interleaving is required, we expect that the user/target had @@ -7118,18 +7143,16 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, unsigned InterleaveFactor = Group->getFactor(); auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); - // Holds the indices of existing members in an interleaved load group. - // An interleaved store group doesn't need this as it doesn't allow gaps. + // Holds the indices of existing members in the interleaved group.
SmallVector<unsigned, 4> Indices; - if (isa<LoadInst>(I)) { - for (unsigned i = 0; i < InterleaveFactor; i++) - if (Group->getMember(i)) - Indices.push_back(i); - } + for (unsigned IF = 0; IF < InterleaveFactor; IF++) + if (Group->getMember(IF)) + Indices.push_back(IF); // Calculate the cost of the whole interleaved group. bool UseMaskForGaps = - Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); + (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || + (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll new file mode 100644 index 000000000000..65838c1f4b02 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -0,0 +1,417 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -licm -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED +; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -licm -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; (1) Interleave-group with factor 4, storing only 2 members out of the 4. +; Check that when we allow masked-memops to support interleave-group with gaps, +; the store is vectorized using a wide masked store, with a 1,1,0,0,1,1,0,0,... mask. +; Check that when we don't allow masked-memops to support interleave-group with gaps, +; the store is scalarized.
+; The input IR was generated from this source: +; for(i=0;i<1024;i++){ +; points[i*4] = x[i]; +; points[i*4 + 1] = y[i]; +; } +; (relates to the testcase in PR50566) + +; Function Attrs: nofree norecurse nosync nounwind uwtable +define dso_local void @test1(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) local_unnamed_addr { +; DISABLED_MASKED_STRIDED-LABEL: @test1( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw <4 x i64> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP2]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP5]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP2]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP7]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP2]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], i16* [[TMP4]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP12]], i16* [[TMP6]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP13]], i16* [[TMP8]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP14]], i16* [[TMP10]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = bitcast i16* [[TMP15]] to <4 x i16>* +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP16]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = or <4 x i64> [[TMP2]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP18]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP20]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: 
[[TMP23:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP22]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP24]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP26]], i16* [[TMP19]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP27]], i16* [[TMP21]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP28]], i16* [[TMP23]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <4 x i16> [[WIDE_LOAD1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP29]], i16* [[TMP25]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP30]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void +; +; ENABLED_MASKED_STRIDED-LABEL: @test1( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[INDEX]], 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <16 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> [[INTERLEAVED_VEC]], <16 x i16>* [[TMP6]], i32 2, <16 x i1> ) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 2 + %1 = shl nuw nsw i64 %indvars.iv, 2 + %arrayidx2 = getelementptr inbounds i16, i16* %points, i64 %1 + store i16 %0, i16* %arrayidx2, align 
2 + %arrayidx4 = getelementptr inbounds i16, i16* %y, i64 %indvars.iv + %2 = load i16, i16* %arrayidx4, align 2 + %3 = or i64 %1, 1 + %arrayidx7 = getelementptr inbounds i16, i16* %points, i64 %3 + store i16 %2, i16* %arrayidx7, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; (2) Same as above, but this time the gaps mask of the store is also And-ed with the +; fold-tail mask. If using masked memops to vectorize interleaved-group with gaps is +; not allowed, the store is scalarized and predicated. +; The input IR was generated from this source: +; for(i=0;i poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[FOR_BODY_PREHEADER]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE15]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>* +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP2]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shl nsw <4 x i64> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP5]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP7]], i16* [[TMP6]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.store.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if1: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], i16* [[TMP10]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE2]] +; DISABLED_MASKED_STRIDED: pred.store.continue2: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: 
pred.store.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP13]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP15]], i16* [[TMP14]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.store.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP17]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP19]], i16* [[TMP18]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.store.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = bitcast i16* [[TMP20]] to <4 x i16>* +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP21]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = or <4 x i64> [[TMP3]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if8: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP22]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP24]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP26]], i16* [[TMP25]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE9]] +; DISABLED_MASKED_STRIDED: pred.store.continue9: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if10: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP22]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP28]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP30]], i16* [[TMP29]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE11]] +; DISABLED_MASKED_STRIDED: pred.store.continue11: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if12: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP22]], i32 2 +; 
DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP32]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP34]], i16* [[TMP33]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE13]] +; DISABLED_MASKED_STRIDED: pred.store.continue13: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]] +; DISABLED_MASKED_STRIDED: pred.store.if14: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = extractelement <4 x i64> [[TMP22]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP36]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = extractelement <4 x i16> [[WIDE_MASKED_LOAD7]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP38]], i16* [[TMP37]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE15]] +; DISABLED_MASKED_STRIDED: pred.store.continue15: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP39]], label [[FOR_END_LOOPEXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; DISABLED_MASKED_STRIDED: for.end.loopexit: +; DISABLED_MASKED_STRIDED-NEXT: br label [[FOR_END]] +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void +; +; ENABLED_MASKED_STRIDED-LABEL: @test2( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[NUMPOINTS:%.*]], 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; ENABLED_MASKED_STRIDED: for.body.preheader: +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[NUMPOINTS]] to i64 +; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[WIDE_TRIP_COUNT]], 3 +; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 +; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT2]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>* +; 
ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP2]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shl nsw i64 [[INDEX]], 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i64 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP5]], i32 2, <4 x i1> [[TMP0]], <4 x i16> poison) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <16 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_MASKED_LOAD]], <4 x i16> [[WIDE_MASKED_LOAD3]], <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> [[INTERLEAVED_VEC]], <16 x i16>* [[TMP7]], i32 2, <16 x i1> [[TMP8]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END_LOOPEXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; ENABLED_MASKED_STRIDED: for.end.loopexit: +; ENABLED_MASKED_STRIDED-NEXT: br label [[FOR_END]] +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void +; +entry: + %cmp15 = icmp sgt i32 %numPoints, 0 + br i1 %cmp15, label %for.body.preheader, label %for.end + +for.body.preheader: + %wide.trip.count = zext i32 %numPoints to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 2 + %1 = shl nsw i64 %indvars.iv, 2 + %arrayidx2 = getelementptr inbounds i16, i16* %points, i64 %1 + store i16 %0, i16* %arrayidx2, align 2 + %arrayidx4 = getelementptr inbounds i16, i16* %y, i64 %indvars.iv + %2 = load i16, i16* %arrayidx4, align 2 + %3 = or i64 %1, 1 + %arrayidx7 = getelementptr inbounds i16, i16* %points, i64 %3 + store i16 %2, i16* %arrayidx7, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; (3) Testing a scenario of a conditional store. The gaps mask of the store is also +; And-ed with the condition mask (x[i] > 0). +; If using masked memops to vectorize interleaved-group with gaps is +; not allowed, the store is scalarized and predicated. +; Here the Interleave-group is with factor 3, storing only 1 member out of the 3. 
+; The input IR was generated from this source: +; for(i=0;i<1024;i++){ +; if (x[i] > 0) +; points[i*3] = x[i]; +; } +; Function Attrs: nofree norecurse nosync nounwind uwtable +define dso_local void @test(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readnone %y) local_unnamed_addr { +; DISABLED_MASKED_STRIDED-LABEL: @test( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nuw nsw <4 x i64> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP5]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP7]], i16* [[TMP6]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.store.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if1: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP11]], i16* [[TMP10]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE2]] +; DISABLED_MASKED_STRIDED: pred.store.continue2: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP13]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP15]], i16* [[TMP14]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.store.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3 +; 
DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.store.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, i16* [[POINTS]], i64 [[TMP17]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i16 [[TMP19]], i16* [[TMP18]], align 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.store.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP20]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void +; +; ENABLED_MASKED_STRIDED-LABEL: @test( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i16> [[WIDE_LOAD]], zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nuw nsw i64 [[INDEX]], 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <12 x i16>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> poison, <12 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <12 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <12 x i1> [[INTERLEAVED_MASK]], +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v12i16.p0v12i16(<12 x i16> [[INTERLEAVED_VEC]], <12 x i16>* [[TMP5]], i32 2, <12 x i1> [[TMP6]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 2 + %cmp1 = icmp sgt i16 %0, 0 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %1 = mul nuw nsw i64 %indvars.iv, 3 + %arrayidx6 = getelementptr inbounds i16, i16* %points, i64 %1 + store i16 %0, i16* %arrayidx6, align 2 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} diff --git 
a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll index 9ed66a22dbfd..9224fcd3d7e0 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll @@ -46,8 +46,7 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" ; Scenario 2: Check the case where it is illegal to create a masked interleave- ; group because the first access is predicated, and the second isn't. ; We therefore create a separate interleave-group with gaps for each of the -; stores (if masked-interleaved-accesses are enabled) and these are later -; invalidated because interleave-groups of stores with gaps are not supported. +; stores (if masked-interleaved-accesses are enabled). ; If masked-interleaved-accesses is not enabled we create only one interleave ; group of stores (for the non-predicated store) and it is later invalidated ; due to gaps. @@ -74,15 +73,12 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" ; STRIDED_MASKED: LV: Analyzing interleaved accesses... ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. -; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_MASKED-NOT: LV: Invalidate candidate interleaved store group due to gaps. ; Scenario 3: Check the case where it is illegal to create a masked interleave- ; group because the two accesses are in separate predicated blocks. ; We therefore create a separate interleave-group with gaps for each of the accesses, -; (which are later invalidated because interleave-groups of stores with gaps are -; not supported). ; If masked-interleaved-accesses is not enabled we don't create any interleave ; group because all accesses are predicated. ; @@ -109,8 +105,7 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" ; STRIDED_MASKED: LV: Analyzing interleaved accesses... ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 ; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. -; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_MASKED-NOT: LV: Invalidate candidate interleaved store group due to gaps. ; ModuleID = 'test.c'
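
Reviewer illustration (not part of the patch): the standalone C++ sketch below models the lane mask that a masked interleaved store with gaps ends up using, namely a gaps mask that is true only for lanes whose interleave index has a group member (what createBitMaskForGaps produces in the patch), And-ed with the per-iteration block mask replicated across the factor (the GroupMask computed in vectorizeInterleaveGroup). The helper names gapsMask/groupMask and the driver values are hypothetical; this is a plain-C++ model of the masking logic under those assumptions, not LLVM API code.

// Illustrative sketch only: models the mask of a masked interleaved store
// with gaps. Lane I*Factor+J of the wide store is enabled iff the group has
// a member at interleave index J and, when the loop is predicated, the block
// mask of iteration I is true.
#include <cstdio>
#include <set>
#include <vector>

// Gaps mask: true for lane I*Factor+J iff the group has a member at index J.
static std::vector<bool> gapsMask(unsigned VF, unsigned Factor,
                                  const std::set<unsigned> &Members) {
  std::vector<bool> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < Factor; ++J)
      Mask.push_back(Members.count(J) != 0);
  return Mask;
}

// Replicate the per-iteration block mask across the Factor lanes of each
// iteration and And it with the gaps mask.
static std::vector<bool> groupMask(const std::vector<bool> &BlockMask,
                                   unsigned Factor,
                                   const std::vector<bool> &Gaps) {
  std::vector<bool> Mask;
  for (bool B : BlockMask)
    for (unsigned J = 0; J < Factor; ++J)
      Mask.push_back(B);
  for (size_t I = 0; I < Mask.size(); ++I)
    Mask[I] = Mask[I] && Gaps[I];
  return Mask;
}

static void print(const std::vector<bool> &Mask) {
  for (bool B : Mask)
    std::printf("%d", B ? 1 : 0);
  std::printf("\n");
}

int main() {
  // Factor-4 group storing only members 0 and 1, VF=4 (as in test1 above):
  // the gaps mask alone is the 1,1,0,0 pattern repeated per iteration.
  std::vector<bool> Gaps = gapsMask(/*VF=*/4, /*Factor=*/4, {0, 1});
  print(Gaps); // 1100110011001100

  // Folding in a block mask that disables the last iteration (as when the
  // tail is folded, cf. test2) clears that iteration's lanes as well.
  print(groupMask({true, true, true, false}, /*Factor=*/4, Gaps)); // 1100110011000000
  return 0;
}

This also mirrors why no scalar epilogue is needed for store groups with gaps: the gap lanes are simply masked off rather than accessed speculatively.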