[SLP]Fix PR39774: Set ReductionRoot if the original instruction is vectorized.

Summary:
If the original reduction root instruction was vectorized, it might be
removed from the tree. This means that the insertion point may be
invalidated, and vectorizing the reduction then produces incorrect
output.
The ReductionRoot instruction must be marked as externally used so that
it cannot be removed. Otherwise it might cause inconsistency with the
cost model and we may end up with an overly optimistic optimization.

Reviewers: RKSimon, spatel, hfinkel, mkuper

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D54955

llvm-svn: 347759
This commit is contained in:
Alexey Bataev 2018-11-28 14:34:11 +00:00
parent 30ce962732
commit 579c2d9d64
2 changed files with 117 additions and 5 deletions

View File

@ -5453,7 +5453,7 @@ class HorizontalReduction {
}
};
Instruction *ReductionRoot = nullptr;
WeakTrackingVH ReductionRoot;
/// The operation data of the reduction operation.
OperationData ReductionData;
@ -5738,7 +5738,7 @@ public:
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
FastMathFlags Unsafe;
Unsafe.setFast();
Builder.setFastMathFlags(Unsafe);
@ -5747,8 +5747,13 @@ public:
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
// to use it.
for (auto &Pair : ExtraArgs)
for (auto &Pair : ExtraArgs) {
assert(Pair.first && "DebugLoc must be set.");
ExternallyUsedValues[Pair.second].push_back(Pair.first);
}
// The reduction root is used as the insertion point for new instructions,
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
SmallVector<Value *, 16> IgnoreList;
for (auto &V : ReductionOps)
IgnoreList.append(V.begin(), V.end());
@ -5800,6 +5805,7 @@ public:
Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
// Emit a reduction.
Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
if (VectorizedTree) {
@ -5826,8 +5832,6 @@ public:
VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
}
for (auto &Pair : ExternallyUsedValues) {
assert(!Pair.second.empty() &&
"At least one DebugLoc must be inserted");
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());

View File

@ -0,0 +1,108 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 | FileCheck %s
; Regression test for PR39774 (SLP vectorizer).
;
; @Test takes a single unnamed i32 argument (%0) and never returns: the only
; terminator in the loop block is an unconditional branch back to itself.
; Each iteration builds one long serial chain of 'and' instructions
; (%val_1 .. %val_42). Apart from the first link, every link combines the
; previous link with either %0 or an 'add' of %local_8_43.us and a constant.
; NOTE(review): this chain is presumably the horizontal reduction whose root
; (%val_42) the fix is about - confirm against the PR.
;
; The CHECK lines below mirror the input instruction-for-instruction, i.e. the
; expected output keeps the scalar loop body as written.
define void @Test(i32) {
; CHECK-LABEL: @Test(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[LOCAL_4_39_US:%.*]] = phi i32 [ [[VAL_42:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[VAL_0:%.*]] = add i32 [[LOCAL_4_39_US]], 0
; CHECK-NEXT: [[VAL_1:%.*]] = and i32 [[LOCAL_8_43_US]], [[VAL_0]]
; CHECK-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
; CHECK-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
; CHECK-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]]
; CHECK-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]]
; CHECK-NEXT: [[VAL_6:%.*]] = add i32 [[LOCAL_8_43_US]], 55
; CHECK-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], [[VAL_6]]
; CHECK-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]]
; CHECK-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]]
; CHECK-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]]
; CHECK-NEXT: [[VAL_11:%.*]] = add i32 [[LOCAL_8_43_US]], 285
; CHECK-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], [[VAL_11]]
; CHECK-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]]
; CHECK-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]]
; CHECK-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]]
; CHECK-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]]
; CHECK-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]]
; CHECK-NEXT: [[VAL_18:%.*]] = add i32 [[LOCAL_8_43_US]], 1240
; CHECK-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], [[VAL_18]]
; CHECK-NEXT: [[VAL_20:%.*]] = add i32 [[LOCAL_8_43_US]], 1496
; CHECK-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], [[VAL_20]]
; CHECK-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]]
; CHECK-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]]
; CHECK-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]]
; CHECK-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]]
; CHECK-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]]
; CHECK-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]]
; CHECK-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]]
; CHECK-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]]
; CHECK-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]]
; CHECK-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]]
; CHECK-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]]
; CHECK-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]]
; CHECK-NEXT: [[VAL_34:%.*]] = add i32 [[LOCAL_8_43_US]], 8555
; CHECK-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], [[VAL_34]]
; CHECK-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]]
; CHECK-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]]
; CHECK-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]]
; CHECK-NEXT: [[VAL_39:%.*]] = add i32 [[LOCAL_8_43_US]], 12529
; CHECK-NEXT: [[VAL_40:%.*]] = and i32 [[VAL_38]], [[VAL_39]]
; CHECK-NEXT: [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US]], 13685
; CHECK-NEXT: [[VAL_42]] = and i32 [[VAL_40]], [[VAL_41]]
; CHECK-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
; CHECK-NEXT: br label [[LOOP]]
;
entry:
br label %loop
loop:
; Loop-carried values: both start at 0 on entry from %entry; on the back edge
; they take %val_42 (last link of the 'and' chain) and %val_43 respectively.
%local_4_39.us = phi i32 [ %val_42, %loop ], [ 0, %entry ]
%local_8_43.us = phi i32 [ %val_43, %loop ], [ 0, %entry ]
; Start of the 'and' chain: the first link combines the two loop-carried
; values (the first one passed through an 'add ..., 0').
%val_0 = add i32 %local_4_39.us, 0
%val_1 = and i32 %local_8_43.us, %val_0
%val_2 = and i32 %val_1, %0
%val_3 = and i32 %val_2, %0
%val_4 = and i32 %val_3, %0
%val_5 = and i32 %val_4, %0
%val_6 = add i32 %local_8_43.us, 55
%val_7 = and i32 %val_5, %val_6
%val_8 = and i32 %val_7, %0
%val_9 = and i32 %val_8, %0
%val_10 = and i32 %val_9, %0
%val_11 = add i32 %local_8_43.us, 285
%val_12 = and i32 %val_10, %val_11
%val_13 = and i32 %val_12, %0
%val_14 = and i32 %val_13, %0
%val_15 = and i32 %val_14, %0
%val_16 = and i32 %val_15, %0
%val_17 = and i32 %val_16, %0
%val_18 = add i32 %local_8_43.us, 1240
%val_19 = and i32 %val_17, %val_18
%val_20 = add i32 %local_8_43.us, 1496
%val_21 = and i32 %val_19, %val_20
%val_22 = and i32 %val_21, %0
%val_23 = and i32 %val_22, %0
%val_24 = and i32 %val_23, %0
%val_25 = and i32 %val_24, %0
%val_26 = and i32 %val_25, %0
%val_27 = and i32 %val_26, %0
%val_28 = and i32 %val_27, %0
%val_29 = and i32 %val_28, %0
%val_30 = and i32 %val_29, %0
%val_31 = and i32 %val_30, %0
%val_32 = and i32 %val_31, %0
%val_33 = and i32 %val_32, %0
%val_34 = add i32 %local_8_43.us, 8555
%val_35 = and i32 %val_33, %val_34
%val_36 = and i32 %val_35, %0
%val_37 = and i32 %val_36, %0
%val_38 = and i32 %val_37, %0
%val_39 = add i32 %local_8_43.us, 12529
%val_40 = and i32 %val_38, %val_39
%val_41 = add i32 %local_8_43.us, 13685
; Last link of the chain; fed back into the %local_4_39.us phi.
%val_42 = and i32 %val_40, %val_41
; Not part of the 'and' chain; only feeds the %local_8_43.us phi.
%val_43 = add i32 %local_8_43.us, 14910
br label %loop
}