[IAI,LV] Avoid creating a scalar epilogue due to gaps in interleave-groups when

optimizing for size

LV is careful to respect -Os and not to create a scalar epilog in all cases
(runtime tests, trip-counts that require a remainder loop) except for peeling
due to gaps in interleave-groups. This patch fixes that; -Os will now have us
invalidate such interleave-groups and vectorize without an epilog.

The patch also removes a related FIXME comment that is now obsolete, and was
also inaccurate:
"FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a smaller
MaxVF that does not require a scalar epilog."
(requiresScalarEpilog() has nothing to do with VF).

Reviewers: Ayal, hsaito, dcaballe, fhahn

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D53420

llvm-svn: 344883
This commit is contained in:
Dorit Nuzman 2018-10-22 06:17:09 +00:00
parent 2336dc3c51
commit 3ec99fe21b
4 changed files with 166 additions and 4 deletions

View File

@ -308,6 +308,23 @@ public:
propagateMetadata(NewInst, VL);
}
/// Reports whether a scalar iteration is needed to cope with the gaps of this
/// Group.
bool requiresScalarEpilogue() const {
  // A group in which every member is present has no gaps, hence no need for
  // a scalar epilog.
  const bool HasGaps = getNumMembers() != getFactor();
  if (!HasGaps)
    return false;
  // With gaps, a scalar epilog is still not needed for this group as long as
  // its last member exists.
  if (getMember(getFactor() - 1))
    return false;
  // Reaching this point means the group has gaps and no last member. Groups
  // of stores and reversed accesses in that state get invalidated, so neither
  // can show up here.
  assert(!getMember(0)->mayWriteToMemory() &&
         "Group should have been invalidated");
  assert(!isReverse() && "Group should have been invalidated");
  // What remains is a group of loads with gaps and a missing last member.
  return true;
}
private:
unsigned Factor; // Interleave Factor.
bool Reverse;
@ -388,6 +405,11 @@ public:
/// out-of-bounds requires a scalar epilogue iteration for correctness.
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
/// Invalidate groups that require a scalar epilogue (due to gaps). This can
/// happen when we optimize for size and don't allow creating a scalar
/// epilogue.
/// Every group whose own requiresScalarEpilogue() returns true is released,
/// and the RequiresScalarEpilogue flag is reset to false afterwards, so a
/// later call to requiresScalarEpilogue() on this object returns false.
/// Does nothing if the flag was not set to begin with.
void invalidateGroupsRequiringScalarEpilogue();
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.

View File

@ -919,3 +919,27 @@ void InterleavedAccessInfo::analyzeInterleaving(
}
}
}
void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
// If no group had triggered the requirement to create an epilogue loop,
// there is nothing to do.
if (!requiresScalarEpilogue())
return;
// Avoid releasing a Group twice.
SmallPtrSet<InterleaveGroup *, 4> DelSet;
for (auto &I : InterleaveGroupMap) {
InterleaveGroup *Group = I.second;
if (Group->requiresScalarEpilogue())
DelSet.insert(Group);
}
for (auto *Ptr : DelSet) {
LLVM_DEBUG(
dbgs()
<< "LV: Invalidate candidate interleaved group due to gaps that "
"require a scalar epilogue.\n");
releaseGroup(Ptr);
}
RequiresScalarEpilogue = false;
}

View File

@ -4599,6 +4599,14 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
return None;
}
// Record that scalar epilogue is not allowed.
LLVM_DEBUG(dbgs() << "LV: Not inserting scalar epilogue for access with gaps "
"due to -Os/-Oz.\n");
// We don't create an epilogue when optimizing for size.
// Invalidate interleave groups that require an epilogue.
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
if (TC > 0 && TC % MaxVF == 0) {
@ -4610,8 +4618,6 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
// FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
// smaller MaxVF that does not require a scalar epilog.
if (Legal->canFoldTailByMasking()) {
FoldTailByMasking = true;
return MaxVF;

View File

@ -1,5 +1,5 @@
; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"
@ -9,9 +9,13 @@ target triple = "i386-unknown-linux-gnu"
; interleaved-group but rather as scalarized accesses.
; (For SKX, Gather is not supported by the compiler for chars, therefore
; the only remaining alternative is to scalarize).
; In this case a scalar epilogue is not needed.
;
; When masked-interleave-group is enabled we expect to find the proper mask
; shuffling code, feeding the wide masked load for an interleave-group (with
; a single member).
; Since the last (second) member of the load-group is a gap, peeling is used,
; so we also expect to find a scalar epilogue loop.
;
; void masked_strided1(const unsigned char* restrict p,
; unsigned char* restrict q,
@ -38,6 +42,8 @@ target triple = "i386-unknown-linux-gnu"
;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load.
;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;DISABLED_MASKED_STRIDED-NOT: for.body:
;DISABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
;ENABLED_MASKED_STRIDED: vector.body:
@ -47,6 +53,7 @@ target triple = "i386-unknown-linux-gnu"
;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
;ENABLED_MASKED_STRIDED: for.body:
define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
entry:
@ -75,6 +82,109 @@ for.end:
ret void
}
; Exactly the same scenario except we are now optimizing for size, therefore
; we check that no scalar epilogue is created. Since we can't create an epilog
; the interleave-group is invalidated because it has gaps, so we end up
; scalarizing.
; (Before the fix that this test checks, we used to create an epilogue despite
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
; and we make sure that a scalar epilogue does not exist).
;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;ENABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
;ENABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
; Test input: for ix in [0, 1024), whenever ix > guard (unsigned compare),
; copy the byte at p[2*ix] into q[ix]. The function carries the optsize
; attribute.
define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
; Zero-extend the i8 guard so it can be compared against the i32 induction
; variable.
%conv = zext i8 %guard to i32
br label %for.body
for.body:
; %ix.09 is the induction variable; the load/store below only executes when
; ix > guard.
%ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
%cmp1 = icmp ugt i32 %ix.09, %conv
br i1 %cmp1, label %if.then, label %for.inc
if.then:
; Stride-2 load: read p[2*ix] (index computed as ix << 1) and store it to
; q[ix].
%mul = shl nuw nsw i32 %ix.09, 1
%arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
%0 = load i8, i8* %arrayidx, align 1
%arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
store i8 %0, i8* %arrayidx3, align 1
br label %for.inc
for.inc:
; Advance ix by one and leave the loop once it reaches 1024.
%inc = add nuw nsw i32 %ix.09, 1
%exitcond = icmp eq i32 %inc, 1024
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
; Same, but the load/store are not predicated. The interleave-group is
; invalidated here as well because we have gaps and we can't create an epilog.
; The access is thus scalarized.
; (Before the fix that this test checks, we used to create an epilogue despite
; optsize, and vectorized the access as an interleaved-group. This is now fixed,
; and we make sure that a scalar epilogue does not exist).
; Since enable-masked-interleaved-mem-accesses currently only affects predicated
; accesses, the behavior is the same with this switch set/unset.
; void unconditional_strided1_optsize(const unsigned char* restrict p,
; unsigned char* restrict q,
; unsigned char guard) {
; for(ix=0; ix < 1024; ++ix) {
; char t = p[2*ix];
; q[ix] = t;
; }
; }
;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
;DISABLED_MASKED_STRIDED: vector.body:
;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
;DISABLED_MASKED_STRIDED-NOT: for.body:
;DISABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
;ENABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
;ENABLED_MASKED_STRIDED-NOT: for.body:
;ENABLED_MASKED_STRIDED: for.end:
; Test input: for ix in [0, 1024), unconditionally copy the byte at p[2*ix]
; into q[ix]. %guard is not used by the body. The function carries the
; optsize attribute.
define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
entry:
br label %for.body
for.body:
; %ix.06 is the induction variable of this single-block loop.
%ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
; Stride-2 load: read p[2*ix] (index computed as ix << 1) and store it to
; q[ix].
%mul = shl nuw nsw i32 %ix.06, 1
%arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
%0 = load i8, i8* %arrayidx, align 1
%arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
store i8 %0, i8* %arrayidx1, align 1
; Advance ix by one and leave the loop once it reaches 1024.
%inc = add nuw nsw i32 %ix.06, 1
%exitcond = icmp eq i32 %inc, 1024
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret void
}
; Check also a scenario with full interleave-groups (no gaps) as well as both
; load and store groups. We check that when masked-interleave-group is disabled
; the predicated loads (and stores) are not vectorized as an