[SLP]Improve vectorization of stores.

This patch improves the vectorization of stores. Previously, we only
checked the type and the base pointer of each store.
The patch adds extra checks to avoid non-profitable vectorization
attempts. It analyzes the scalar values to be stored and triggers a
vectorization attempt only if those values have the same/alternate
opcode and come from the same basic block, i.e. only when we would not
immediately end up with a gather node, which is not profitable.
This also improves compile time by filtering out non-profitable cases.

Part of D57059.

Differential Revision: https://reviews.llvm.org/D104122
This commit is contained in:
Alexey Bataev 2021-06-11 06:02:47 -07:00
parent a0ea367562
commit c574d2fbac
2 changed files with 105 additions and 15 deletions

View File

@ -8674,16 +8674,103 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
/// Try to vectorize every collected store chain in \c Stores.
///
/// Each chain is first stable-sorted with \c StoreSorter so that stores whose
/// stored values look compatible end up next to each other. The sorted chain
/// is then split into maximal runs of stores that are compatible with the
/// first store of the run (\c AreCompatibleStores), and every run with more
/// than one element is handed to vectorizeStores().
///
/// \param R the SLP tree builder forwarded to vectorizeStores().
/// \returns true if at least one vectorizeStores() call reported success.
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  //
  // NOTE(review): returning false whenever either value is undef makes undef
  // "equivalent" to every other store, which may not form a strict weak
  // ordering as required by stable_sort - confirm this is acceptable.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    // NOTE(review): this compares the TypeID of the *pointer operand* type.
    // Presumably that is a pointer type for every store, in which case these
    // two checks never discriminate - confirm whether the value operand type
    // was intended.
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        // Both nodes are dereferenced below, so both must be checked.
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        // Values defined in different blocks: order by the DFS-in number of
        // the defining block's dominator tree node.
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        // Same block: values with a common (or alternate) opcode are treated
        // as equivalent; otherwise order by opcode.
        InstructionsState S = getSameOpcode({I1, I2});
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    // Two constants are treated as equivalent.
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  // Returns true if the two stores may be placed in the same vectorization
  // candidate group: identical pointer operand type and stored values that
  // are undef, both constants, same-block instructions with a common/alternate
  // opcode, or otherwise values of the same value kind.
  auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2});
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    stable_sort(Pair.second, StoreSorter);

    // Try to vectorize elements based on their compatibility.
    for (ArrayRef<StoreInst *>::iterator IncIt = Pair.second.begin(),
                                         E = Pair.second.end();
         IncIt != E;) {

      // Look for the next elements that are compatible with the first store
      // of the current run.
      ArrayRef<StoreInst *>::iterator SameTypeIt = IncIt;
      Type *EltTy = (*IncIt)->getPointerOperand()->getType();

      while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt))
        ++SameTypeIt;

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("
                        << NumElts << ")\n");
      // Runs of a single store and stores whose pointee is already a vector
      // type are not passed to vectorizeStores().
      if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() &&
          vectorizeStores(makeArrayRef(IncIt, NumElts), R)) {
        // Record the success; the scan simply continues with the next run.
        Changed = true;
      }

      // Continue with the first store that was not compatible with this run
      // (or stop at the end of the chain).
      IncIt = SameTypeIt;
    }
  }
  return Changed;
}

View File

@ -19,13 +19,16 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocaptur
; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_4]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_8]], i32 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_8]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEXT: br label [[BLOCK1:%.*]]
; CHECK: block1:
; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 5
@ -37,11 +40,11 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocaptur
; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 4
; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_10]], align 4
; CHECK-NEXT: store i32 [[LOAD_9]], i32* [[GEP_9]], align 4
; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_11]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>*
; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[GEP_10]] to <2 x i32>*
; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP11]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>*
; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP12]], align 4
; CHECK-NEXT: ret i32 undef
;
%in.addr = getelementptr inbounds i32, i32* %in, i64 0