forked from OSchip/llvm-project
[SLP]Improve vectorization of stores.
This patch improves the vectorization of stores. Originally, we checked only the type and the base pointer of the store. The patch adds extra checks to avoid non-profitable vectorization cases: it analyzes the scalar values to be stored and triggers a vectorization attempt only if those values have the same/alternate opcode and come from the same basic block, i.e. we do not immediately end up with a gather node, which is not profitable. This also improves compile time by filtering out non-profitable cases. Part of D57059. Differential Revision: https://reviews.llvm.org/D104122
This commit is contained in:
parent
a0ea367562
commit
c574d2fbac
|
@ -8674,16 +8674,103 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
|
|||
|
||||
/// Tries to vectorize the store lists collected in \c Stores.
///
/// Every list holding at least two stores is first sorted so that stores with
/// compatible value operands end up next to each other. The sorted list is
/// then walked in runs: a run starts at some store and extends over all
/// following stores that are compatible with that first store. Each run of
/// more than one store is handed to vectorizeStores(), unless the stored-to
/// pointee type is already a vector type.
///
/// \param R the SLP tree builder forwarded to vectorizeStores().
/// \returns true if at least one vectorizeStores() call reported a change.
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;

  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  //
  // Ordering rules, in priority order:
  //   1. type ID of the pointer operand type;
  //   2. if either stored value is undef, the stores compare as equivalent;
  //   3. if both stored values are instructions:
  //        - different parent blocks: by the DFS-in number of the blocks'
  //          dominator tree nodes;
  //        - same parent block: equivalent when getSameOpcode() reports a
  //          non-zero opcode, otherwise ordered by instruction opcode;
  //   4. if both stored values are constants, the stores are equivalent;
  //   5. otherwise by the value ID of the stored values.
  //
  // NOTE(review): rule 2 makes undef equivalent to everything while other
  // values are still ordered among themselves, so "equivalence" is not
  // transitive here - confirm this is acceptable for stable_sort.
  // NOTE(review): presumably the dominator tree DFS numbers are up to date
  // when this runs - verify against the caller.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    const auto TypeID1 = V->getPointerOperandType()->getTypeID();
    const auto TypeID2 = V2->getPointerOperandType()->getTypeID();
    if (TypeID1 < TypeID2)
      return true;
    if (TypeID1 > TypeID2)
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        // Both nodes are dereferenced below, so both must be checked. The
        // second assert used to re-check NodeI1 and left NodeI2 unchecked.
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        // Same block: a non-zero opcode from getSameOpcode() means the two
        // instructions are treated as compatible, i.e. neither sorts first.
        InstructionsState S = getSameOpcode({I1, I2});
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  // Returns true when the two stores may be put into the same vectorization
  // attempt: identical pointer operand types and either an undef on one side,
  // two same-block instructions for which getSameOpcode() reports a non-zero
  // opcode, two constants, or stored values with equal value IDs.
  auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2});
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  for (auto &Pair : Stores) {
    // A single store cannot form a vector.
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    // Bring compatible stores next to each other; stable_sort keeps the
    // existing relative order of stores that compare as equivalent.
    stable_sort(Pair.second, StoreSorter);

    // Try to vectorize elements based on their compatibility.
    for (ArrayRef<StoreInst *>::iterator IncIt = Pair.second.begin(),
                                         E = Pair.second.end();
         IncIt != E;) {

      // Look for the next elements that are compatible with *IncIt. The first
      // iteration compares *IncIt with itself, so the run is never empty.
      ArrayRef<StoreInst *>::iterator SameTypeIt = IncIt;
      // Type of the pointer operand of the first store in the run.
      Type *StorePtrTy = (*IncIt)->getPointerOperand()->getType();

      while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt))
        ++SameTypeIt;

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("
                        << NumElts << ")\n");
      // Runs of one store and stores to vector-typed memory are skipped.
      if (NumElts > 1 && !StorePtrTy->getPointerElementType()->isVectorTy() &&
          vectorizeStores(makeArrayRef(IncIt, NumElts), R)) {
        // Record the change; the walk simply continues with the next run.
        Changed = true;
      }

      // Continue at the first store that was not compatible (or the end).
      IncIt = SameTypeIt;
    }
  }
  return Changed;
}
|
||||
|
|
|
@ -19,13 +19,16 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocaptur
|
|||
; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
|
||||
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
|
||||
; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
|
||||
; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
|
||||
; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_4]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_8]], i32 1
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_8]], i32 1
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP7]], [[TMP9]]
|
||||
; CHECK-NEXT: br label [[BLOCK1:%.*]]
|
||||
; CHECK: block1:
|
||||
; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 5
|
||||
|
@ -37,11 +40,11 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocaptur
|
|||
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
|
||||
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
|
||||
; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 4
|
||||
; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_10]], align 4
|
||||
; CHECK-NEXT: store i32 [[LOAD_9]], i32* [[GEP_9]], align 4
|
||||
; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_11]], align 4
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>*
|
||||
; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[GEP_10]] to <2 x i32>*
|
||||
; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP11]], align 4
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>*
|
||||
; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP12]], align 4
|
||||
; CHECK-NEXT: ret i32 undef
|
||||
;
|
||||
%in.addr = getelementptr inbounds i32, i32* %in, i64 0
|
||||
|
|
Loading…
Reference in New Issue