Delay predication of stores until near the end of vector code generation

Predicating stores requires creating extra blocks. It's much cleaner if we do this in one pass instead of mutating the CFG while writing vector instructions.

Besides which we can make use of helper functions to update domtree for us, reducing the work we need to do.

llvm-svn: 247139
This commit is contained in:
James Molloy 2015-09-09 12:51:06 +00:00
parent 3834d2ca08
commit 89eccee4db
2 changed files with 62 additions and 65 deletions

View File

@ -285,8 +285,6 @@ public:
// Widen each instruction in the old loop to a new one in the new loop.
// Use the Legality module to find the induction and reduction variables.
vectorizeLoop();
// Register the new loop and update the analysis passes.
updateAnalysis();
}
// Return true if any runtime check is added.
@ -490,6 +488,9 @@ protected:
PHINode *OldInduction;
/// Maps scalars to widened vectors.
ValueMap WidenMap;
/// Store instructions that should be predicated, as a pair
/// <StoreInst, Predicate>
SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
EdgeMaskCache MaskCache;
/// Trip count of the original loop.
Value *TripCount;
@ -2472,19 +2473,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
Instruction *InsertPt = Builder.GetInsertPoint();
BasicBlock *IfBlock = Builder.GetInsertBlock();
BasicBlock *CondBlock = nullptr;
VectorParts Cond;
Loop *VectorLp = nullptr;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
VectorLp = LI->getLoopFor(IfBlock);
assert(VectorLp && "Must have a loop for this block");
}
// For each vector unroll 'part':
@ -2497,11 +2491,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
if (IfPredicateStore) {
Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
LoopVectorBody.push_back(CondBlock);
VectorLp->addBasicBlockToLoop(CondBlock, *LI);
// Update Builder with newly created basic block.
Builder.SetInsertPoint(InsertPt);
}
Instruction *Cloned = Instr->clone();
@ -2525,15 +2514,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
Builder.getInt32(Width));
// End if-block.
if (IfPredicateStore) {
BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
LoopVectorBody.push_back(NewIfBlock);
VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
Builder.SetInsertPoint(InsertPt);
ReplaceInstWithInst(IfBlock->getTerminator(),
BranchInst::Create(CondBlock, NewIfBlock, Cmp));
IfBlock = NewIfBlock;
}
if (IfPredicateStore)
PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
Cmp));
}
}
}
@ -3367,6 +3350,20 @@ void InnerLoopVectorizer::vectorizeLoop() {
fixLCSSAPHIs();
// Make sure DomTree is updated.
updateAnalysis();
// Predicate any stores.
for (auto KV : PredicatedStores) {
BasicBlock::iterator I(KV.first);
auto *BB = SplitBlock(I->getParent(), std::next(I), DT, LI);
auto *T = SplitBlockAndInsertIfThen(KV.second, I, /*Unreachable=*/false,
/*BranchWeights=*/nullptr, DT);
I->moveBefore(T);
I->getParent()->setName("pred.store.if");
BB->setName("pred.store.continue");
}
DEBUG(DT->verifyDomTree());
// Remove redundant induction instructions.
cse(LoopVectorBody);
}
@ -3805,17 +3802,10 @@ void InnerLoopVectorizer::updateAnalysis() {
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
// Due to if predication of stores we might create a sequence of "if(pred)
// a[i] = ...; " blocks.
for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
if (i == 0)
DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
else if (isPredicatedBlock(i)) {
DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
} else {
DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
}
}
// We don't predicate stores by this point, so the vector body should be a
// single loop.
assert(LoopVectorBody.size() == 1 && "Expected single block loop!");
DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
@ -5412,19 +5402,12 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
Instruction *InsertPt = Builder.GetInsertPoint();
BasicBlock *IfBlock = Builder.GetInsertBlock();
BasicBlock *CondBlock = nullptr;
VectorParts Cond;
Loop *VectorLp = nullptr;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
VectorLp = LI->getLoopFor(IfBlock);
assert(VectorLp && "Must have a loop for this block");
}
// For each vector unroll 'part':
@ -5439,11 +5422,6 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
ConstantInt::get(Cond[Part]->getType(), 1));
CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
LoopVectorBody.push_back(CondBlock);
VectorLp->addBasicBlockToLoop(CondBlock, *LI);
// Update Builder with newly created basic block.
Builder.SetInsertPoint(InsertPt);
}
Instruction *Cloned = Instr->clone();
@ -5463,16 +5441,10 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
if (!IsVoidRetTy)
VecResults[Part] = Cloned;
// End if-block.
if (IfPredicateStore) {
BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
LoopVectorBody.push_back(NewIfBlock);
VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
Builder.SetInsertPoint(InsertPt);
ReplaceInstWithInst(IfBlock->getTerminator(),
BranchInst::Create(CondBlock, NewIfBlock, Cmp));
IfBlock = NewIfBlock;
}
// End if-block.
if (IfPredicateStore)
PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
Cmp));
}
}

View File

@ -1,5 +1,8 @@
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec < %s | FileCheck %s --check-prefix=VEC
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -simplifycfg < %s | FileCheck %s --check-prefix=VEC
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -simplifycfg -instcombine < %s | FileCheck %s --check-prefix=VEC-IC
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9.0"
@ -14,27 +17,49 @@ entry:
; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0
; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true
; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
;
; VEC: [[cond]]:
; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
; VEC: store i32 %[[v13]], i32* %[[v14]], align 4
; VEC: br label %[[else:.+]]
;
; VEC: [[else]]:
; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1
; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true
; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
;
; VEC: [[cond2]]:
; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
; VEC: store i32 %[[v17]], i32* %[[v18]], align 4
; VEC: br label %[[else2:.+]]
;
; VEC: [[else2]]:
; VEC-IC-LABEL: test
; VEC-IC: %[[v1:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
; VEC-IC: %[[v2:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
; VEC-IC: %[[v3:.+]] = extractelement <2 x i1> %[[v1]], i32 0
; VEC-IC: br i1 %[[v3]], label %[[cond:.+]], label %[[else:.+]]
;
; VEC-IC: [[cond]]:
; VEC-IC: %[[v4:.+]] = extractelement <2 x i32> %[[v2]], i32 0
; VEC-IC: store i32 %[[v4]], i32* %{{.*}}, align 4
; VEC-IC: br label %[[else:.+]]
;
; VEC-IC: [[else]]:
; VEC-IC: %[[v5:.+]] = extractelement <2 x i1> %[[v1]], i32 1
; VEC-IC: br i1 %[[v5]], label %[[cond2:.+]], label %[[else2:.+]]
;
; VEC-IC: [[cond2]]:
; VEC-IC: %[[v6:.+]] = extractelement <2 x i32> %[[v2]], i32 1
; VEC-IC: store i32 %[[v6]], i32* %{{.*}}, align 4
; VEC-IC: br label %[[else2:.+]]
;
; VEC-IC: [[else2]]:
; UNROLL-LABEL: test
; UNROLL: vector.body:
; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0
@ -90,9 +115,9 @@ for.end:
; vectorized loop body.
; PR18724
; UNROLL-LABEL: bug18724
; UNROLL: store i32
; UNROLL: store i32
; UNROLL-NOSIMPLIFY-LABEL: bug18724
; UNROLL-NOSIMPLIFY: store i32
; UNROLL-NOSIMPLIFY: store i32
define void @bug18724() {
entry: