[SLP]Detect reused scalars in all possible gathers for better vectorization cost.
Some initially gathered nodes missed the check for reused scalars, which led to an overly high gather cost. Such nodes can still be represented as m gathers + a shuffle instead of n gathers, where m < n.

Differential Revision: https://reviews.llvm.org/D111153
commit bebe702dbe
parent 200edc152b
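To illustrate the idea from the commit message, below is a minimal standalone C++ sketch (not LLVM code; the names, the std::string value type, and the empty-string-as-undef convention are invented for the example) of what the new TryToFindDuplicates lambda computes: the unique scalars of a bundle plus a per-lane reuse mask, so a bundle of n lanes with only m distinct values can be modelled as m gathers plus one shuffle instead of n gathers.

#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

// Simplified model of the deduplication the patch performs on a gather
// bundle: collect the unique scalars and a per-lane index into that list
// (-1 for an "undef" lane, modelled here as an empty string).
struct DedupResult {
  std::vector<std::string> UniqueValues; // the m values to gather
  std::vector<int> ReuseShuffleIndices;  // mask rebuilding all n lanes
};

DedupResult findDuplicates(const std::vector<std::string> &Bundle) {
  DedupResult R;
  std::unordered_map<std::string, int> UniquePositions;
  for (const std::string &V : Bundle) {
    if (V.empty()) { // "undef" lane: any source element may be used
      R.ReuseShuffleIndices.push_back(-1);
      continue;
    }
    auto Res = UniquePositions.try_emplace(V, (int)R.UniqueValues.size());
    R.ReuseShuffleIndices.push_back(Res.first->second);
    if (Res.second) // first time this scalar is seen
      R.UniqueValues.push_back(V);
  }
  return R;
}

int main() {
  // A bundle of n = 8 lanes that contains only m = 2 distinct scalars.
  std::vector<std::string> Bundle = {"a", "b", "a", "a", "b", "", "a", "b"};
  DedupResult R = findDuplicates(Bundle);

  std::printf("gather %zu unique values instead of %zu lanes:",
              R.UniqueValues.size(), Bundle.size());
  for (const std::string &V : R.UniqueValues)
    std::printf(" %s", V.c_str());
  std::printf("\nreuse shuffle mask:");
  for (int Idx : R.ReuseShuffleIndices)
    std::printf(" %d", Idx); // -1 marks an undef lane
  std::printf("\n");
  return 0;
}

Running it on the bundle {a, b, a, a, b, undef, a, b} prints the two unique values and the mask 0 1 0 0 1 -1 0 1; that mask is what the patch now records as ReuseShuffleIndicies on the gather node, so the cost model can account for m gathers plus a single shuffle rather than n independent gathers.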
@@ -3198,10 +3198,42 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             const EdgeInfo &UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
 
+  SmallVector<int> ReuseShuffleIndicies;
+  SmallVector<Value *> UniqueValues;
+  auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
+                                &UserTreeIdx,
+                                this](const InstructionsState &S) {
+    // Check that every instruction appears once in this bundle.
+    DenseMap<Value *, unsigned> UniquePositions;
+    for (Value *V : VL) {
+      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+      ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? -1
+                                                           : Res.first->second);
+      if (Res.second)
+        UniqueValues.emplace_back(V);
+    }
+    size_t NumUniqueScalarValues = UniqueValues.size();
+    if (NumUniqueScalarValues == VL.size()) {
+      ReuseShuffleIndicies.clear();
+    } else {
+      LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+      if (NumUniqueScalarValues <= 1 ||
+          !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+        return false;
+      }
+      VL = UniqueValues;
+    }
+    return true;
+  };
+
   InstructionsState S = getSameOpcode(VL);
   if (Depth == RecursionMaxDepth) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+    if (TryToFindDuplicates(S))
+      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                   ReuseShuffleIndicies);
     return;
   }
 
@@ -3210,7 +3242,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       isa<ScalableVectorType>(
           cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
-    newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+    if (TryToFindDuplicates(S))
+      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                   ReuseShuffleIndicies);
     return;
   }
 
@@ -3236,7 +3270,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(S.MainOp) &&
        !all_of(VL, isVectorLikeInstWithConstOps))) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
-    newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+    if (TryToFindDuplicates(S))
+      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                   ReuseShuffleIndicies);
     return;
   }
 
@@ -3258,7 +3294,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
     if (!E->isSame(VL)) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+      if (TryToFindDuplicates(S))
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndicies);
       return;
     }
     // Record the reuse of the tree node. FIXME, currently this is only used to
@@ -3277,7 +3315,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     if (getTreeEntry(I)) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is already in tree.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+      if (TryToFindDuplicates(S))
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndicies);
       return;
     }
   }
@@ -3288,7 +3328,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   for (Value *V : VL) {
     if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+      if (TryToFindDuplicates(S))
+        newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+                     ReuseShuffleIndicies);
       return;
     }
   }
@@ -3307,28 +3349,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   }
 
   // Check that every instruction appears once in this bundle.
-  SmallVector<int> ReuseShuffleIndicies;
-  SmallVector<Value *, 4> UniqueValues;
-  DenseMap<Value *, unsigned> UniquePositions;
-  for (Value *V : VL) {
-    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
-    ReuseShuffleIndicies.emplace_back(Res.first->second);
-    if (Res.second)
-      UniqueValues.emplace_back(V);
-  }
-  size_t NumUniqueScalarValues = UniqueValues.size();
-  if (NumUniqueScalarValues == VL.size()) {
-    ReuseShuffleIndicies.clear();
-  } else {
-    LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
-    if (NumUniqueScalarValues <= 1 ||
-        !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
-      LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
-      return;
-    }
-    VL = UniqueValues;
-  }
+  if (!TryToFindDuplicates(S))
+    return;
 
   auto &BSRef = BlocksSchedules[BB];
   if (!BSRef)
@@ -16,38 +16,13 @@
 
 define void @splat(i8 %a, i8 %b, i8 %c) {
 ; SSE-LABEL: @splat(
-; SSE-NEXT:    [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
-; SSE-NEXT:    store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
-; SSE-NEXT:    [[TMP2:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1), align 1
-; SSE-NEXT:    [[TMP3:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2), align 1
-; SSE-NEXT:    [[TMP4:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3), align 1
-; SSE-NEXT:    [[TMP5:%.*]] = xor i8 [[C]], [[A]]
-; SSE-NEXT:    store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4), align 1
-; SSE-NEXT:    [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
-; SSE-NEXT:    store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5), align 1
-; SSE-NEXT:    [[TMP7:%.*]] = xor i8 [[C]], [[A]]
-; SSE-NEXT:    store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6), align 1
-; SSE-NEXT:    [[TMP8:%.*]] = xor i8 [[C]], [[B]]
-; SSE-NEXT:    store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7), align 1
-; SSE-NEXT:    [[TMP9:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8), align 1
-; SSE-NEXT:    [[TMP10:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9), align 1
-; SSE-NEXT:    [[TMP11:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10), align 1
-; SSE-NEXT:    [[TMP12:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11), align 1
-; SSE-NEXT:    [[TMP13:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12), align 1
-; SSE-NEXT:    [[TMP14:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13), align 1
-; SSE-NEXT:    [[TMP15:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14), align 1
-; SSE-NEXT:    [[TMP16:%.*]] = xor i8 [[A]], [[C]]
-; SSE-NEXT:    store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15), align 1
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
+; SSE-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[B:%.*]], i32 1
+; SSE-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; SSE-NEXT:    [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]]
+; SSE-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @splat(