[SLP] Fix for PR32036: Vectorized horizontal reduction returning wrong

result

Summary:
If the same value is used several times as an extra value, SLP
vectorizer takes it into account only once instead of actual number of
using.
For example:
```
int val = 1;
for (int y = 0; y < 8; y++) {
  for (int x = 0; x < 8; x++) {
    val = val + input[y * 8 + x] + 3;
  }
}
```
We have 2 extra rguments: `1` - initial value of horizontal reduction
and `3`, which is added 8*8 times to the reduction. Before the patch we
added `1` to the reduction value and added once `3`, though it must be
added 64 times.

Reviewers: mkuper, mzolotukhin

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D30262

llvm-svn: 295949
This commit is contained in:
Alexey Bataev 2017-02-23 09:40:38 +00:00
parent 39af790bb1
commit 68f2402c61
2 changed files with 25 additions and 17 deletions

View File

@ -304,6 +304,7 @@ public:
typedef SmallVector<Instruction *, 16> InstrList; typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet; typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList; typedef SmallVector<StoreInst *, 8> StoreList;
typedef MapVector<Value *, std::vector<DebugLoc>> ExtraValueToDebugLocsMap;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
@ -333,7 +334,7 @@ public:
/// Vectorize the tree but with the list of externally used values \p /// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the /// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions. /// generated extractvalue instructions.
Value *vectorizeTree(MapVector<Value *, DebugLoc> &ExternallyUsedValues); Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
/// \returns the cost incurred by unwanted spills and fills, caused by /// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites. /// holding live values over call sites.
@ -352,7 +353,7 @@ public:
/// into account (anf updating it, if required) list of externally used /// into account (anf updating it, if required) list of externally used
/// values stored in \p ExternallyUsedValues. /// values stored in \p ExternallyUsedValues.
void buildTree(ArrayRef<Value *> Roots, void buildTree(ArrayRef<Value *> Roots,
MapVector<Value *, DebugLoc> &ExternallyUsedValues, ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst = None); ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'. /// Clear the internal data structures that are created by 'buildTree'.
@ -953,11 +954,11 @@ private:
void BoUpSLP::buildTree(ArrayRef<Value *> Roots, void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) { ArrayRef<Value *> UserIgnoreLst) {
MapVector<Value *, DebugLoc> ExternallyUsedValues; ExtraValueToDebugLocsMap ExternallyUsedValues;
buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
} }
void BoUpSLP::buildTree(ArrayRef<Value *> Roots, void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
MapVector<Value *, DebugLoc> &ExternallyUsedValues, ExtraValueToDebugLocsMap &ExternallyUsedValues,
ArrayRef<Value *> UserIgnoreLst) { ArrayRef<Value *> UserIgnoreLst) {
deleteTree(); deleteTree();
UserIgnoreList = UserIgnoreLst; UserIgnoreList = UserIgnoreLst;
@ -2801,12 +2802,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E) {
} }
Value *BoUpSLP::vectorizeTree() { Value *BoUpSLP::vectorizeTree() {
MapVector<Value *, DebugLoc> ExternallyUsedValues; ExtraValueToDebugLocsMap ExternallyUsedValues;
return vectorizeTree(ExternallyUsedValues); return vectorizeTree(ExternallyUsedValues);
} }
Value * Value *
BoUpSLP::vectorizeTree(MapVector<Value *, DebugLoc> &ExternallyUsedValues) { BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted. // All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) { for (auto &BSIter : BlocksSchedules) {
@ -2868,7 +2869,6 @@ BoUpSLP::vectorizeTree(MapVector<Value *, DebugLoc> &ExternallyUsedValues) {
assert(ExternallyUsedValues.count(Scalar) && assert(ExternallyUsedValues.count(Scalar) &&
"Scalar with nullptr as an external user must be registered in " "Scalar with nullptr as an external user must be registered in "
"ExternallyUsedValues map"); "ExternallyUsedValues map");
DebugLoc DL = ExternallyUsedValues[Scalar];
if (auto *VecI = dyn_cast<Instruction>(Vec)) { if (auto *VecI = dyn_cast<Instruction>(Vec)) {
Builder.SetInsertPoint(VecI->getParent(), Builder.SetInsertPoint(VecI->getParent(),
std::next(VecI->getIterator())); std::next(VecI->getIterator()));
@ -2878,8 +2878,8 @@ BoUpSLP::vectorizeTree(MapVector<Value *, DebugLoc> &ExternallyUsedValues) {
Value *Ex = Builder.CreateExtractElement(Vec, Lane); Value *Ex = Builder.CreateExtractElement(Vec, Lane);
Ex = extend(ScalarRoot, Ex, Scalar->getType()); Ex = extend(ScalarRoot, Ex, Scalar->getType());
CSEBlocks.insert(cast<Instruction>(Scalar)->getParent()); CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
std::swap(ExternallyUsedValues[Ex], ExternallyUsedValues[Scalar]);
ExternallyUsedValues.erase(Scalar); ExternallyUsedValues.erase(Scalar);
ExternallyUsedValues[Ex] = DL;
continue; continue;
} }
@ -4439,9 +4439,11 @@ public:
Builder.setFastMathFlags(Unsafe); Builder.setFastMathFlags(Unsafe);
unsigned i = 0; unsigned i = 0;
MapVector<Value *, DebugLoc> ExternallyUsedValues; BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several time, so log each attempt
// to use it.
for (auto &Pair : ExtraArgs) for (auto &Pair : ExtraArgs)
ExternallyUsedValues[Pair.second] = Pair.first->getDebugLoc(); ExternallyUsedValues[Pair.second].push_back(Pair.first->getDebugLoc());
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, ReductionOps); V.buildTree(VL, ExternallyUsedValues, ReductionOps);
@ -4489,10 +4491,14 @@ public:
Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I); Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
} }
for (auto &Pair : ExternallyUsedValues) { for (auto &Pair : ExternallyUsedValues) {
Builder.SetCurrentDebugLocation(Pair.second); // Add each externally used value to the final reduction.
assert(!Pair.second.empty() && "At least one DebugLoc must be added.");
for (auto &DL : Pair.second) {
Builder.SetCurrentDebugLocation(DL);
VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
Pair.first, "bin.extra"); Pair.first, "bin.extra");
} }
}
// Update users. // Update users.
if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) { if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
assert(ReductionRoot && "Need a reduction operation"); assert(ReductionRoot && "Need a reduction operation");

View File

@ -1473,9 +1473,10 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00 ; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00
; CHECK-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], [[CONV]] ; CHECK-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00
; CHECK-NEXT: [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]]
; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] ; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
; CHECK-NEXT: ret float [[BIN_EXTRA6]] ; CHECK-NEXT: ret float [[BIN_EXTRA7]]
; ;
; THRESHOLD-LABEL: @extra_args_same_several_times( ; THRESHOLD-LABEL: @extra_args_same_several_times(
; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: entry:
@ -1510,9 +1511,10 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0 ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]] ; THRESHOLD-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00 ; THRESHOLD-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00
; THRESHOLD-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], [[CONV]] ; THRESHOLD-NEXT: [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00
; THRESHOLD-NEXT: [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]]
; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]] ; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
; THRESHOLD-NEXT: ret float [[BIN_EXTRA6]] ; THRESHOLD-NEXT: ret float [[BIN_EXTRA7]]
; ;
entry: entry:
%mul = mul nsw i32 %b, %a %mul = mul nsw i32 %b, %a