[LV] Move isLegalMasked* functions from Legality to CostModel
All SIMD architectures can emulate masked load/store/gather/scatter through an element-wise condition check, scalar load/store, and insert/extract. Bailing out of vectorization as a legality failure when these queries return false is therefore incorrect; we should instead proceed to the cost model and determine profitability.

This patch addresses the vectorizer's architectural limitation described above. As such, I tried to keep the cost model and vectorize/don't-vectorize behavior nearly unchanged. Cost model tuning should be done separately. Please see http://lists.llvm.org/pipermail/llvm-dev/2018-January/120164.html for the RFC and the discussions.

Closes D43208.

Patch by: Hideki Saito <hideki.saito@intel.com>

llvm-svn: 326079
parent ed45836253
commit 9d1b2acaaa
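As a concrete illustration of the emulation argument in the message, here is a minimal sketch in plain C++ (not code from the patch or the LLVM API; the helper name, signature, and pass-through handling are illustrative assumptions): a masked load can always be expanded into a per-lane condition check, a scalar load, and an insert, so a missing masked hardware instruction is a profitability question for the cost model rather than a legality question.

#include <cstddef>

// Hypothetical helper, not taken from the patch: element-wise emulation of a
// masked vector load. Each active lane does a condition check, a scalar load,
// and an "insert"; inactive lanes keep the pass-through value, so no
// masked-off address is ever touched.
template <typename T, std::size_t N>
void emulatedMaskedLoad(const T *Ptr, const bool (&Mask)[N], T (&Dst)[N],
                        const T (&PassThru)[N]) {
  for (std::size_t I = 0; I < N; ++I) {
    if (Mask[I])
      Dst[I] = Ptr[I];      // active lane: scalar load + insert
    else
      Dst[I] = PassThru[I]; // masked-off lane: keep the pass-through value
  }
}

Masked store, gather, and scatter admit the same element-wise expansion, which is why the patch lets the cost model decide whether such an expansion is worth vectorizing instead of rejecting the loop outright.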
@@ -1648,58 +1648,12 @@ public:
   bool hasStride(Value *V) { return LAI->hasStride(V); }
 
-  /// Returns true if the target machine supports masked store operation
-  /// for the given \p DataType and kind of access to \p Ptr.
-  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
-    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
-  }
-
-  /// Returns true if the target machine supports masked load operation
-  /// for the given \p DataType and kind of access to \p Ptr.
-  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
-    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
-  }
-
-  /// Returns true if the target machine supports masked scatter operation
-  /// for the given \p DataType.
-  bool isLegalMaskedScatter(Type *DataType) {
-    return TTI->isLegalMaskedScatter(DataType);
-  }
-
-  /// Returns true if the target machine supports masked gather operation
-  /// for the given \p DataType.
-  bool isLegalMaskedGather(Type *DataType) {
-    return TTI->isLegalMaskedGather(DataType);
-  }
-
-  /// Returns true if the target machine can represent \p V as a masked gather
-  /// or scatter operation.
-  bool isLegalGatherOrScatter(Value *V) {
-    auto *LI = dyn_cast<LoadInst>(V);
-    auto *SI = dyn_cast<StoreInst>(V);
-    if (!LI && !SI)
-      return false;
-    auto *Ptr = getPointerOperand(V);
-    auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
-    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
-  }
-
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
   bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
 
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
-  unsigned getNumPredStores() const { return NumPredStores; }
-
-  /// Returns true if \p I is an instruction that will be scalarized with
-  /// predication. Such instructions include conditional stores and
-  /// instructions that may divide by zero.
-  bool isScalarWithPredication(Instruction *I);
-
-  /// Returns true if \p I is a memory instruction with consecutive memory
-  /// access that can be widened.
-  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
   // Returns true if the NoNaN attribute is set on the function.
   bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }

@@ -1753,8 +1707,6 @@ private:
     return LAI ? &LAI->getSymbolicStrides() : nullptr;
   }
 
-  unsigned NumPredStores = 0;
-
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -2060,7 +2012,53 @@ public:
     collectLoopScalars(VF);
   }
 
+  /// Returns true if the target machine supports masked store operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
+  }
+
+  /// Returns true if the target machine supports masked load operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
+  }
+
+  /// Returns true if the target machine supports masked scatter operation
+  /// for the given \p DataType.
+  bool isLegalMaskedScatter(Type *DataType) {
+    return TTI.isLegalMaskedScatter(DataType);
+  }
+
+  /// Returns true if the target machine supports masked gather operation
+  /// for the given \p DataType.
+  bool isLegalMaskedGather(Type *DataType) {
+    return TTI.isLegalMaskedGather(DataType);
+  }
+
+  /// Returns true if the target machine can represent \p V as a masked gather
+  /// or scatter operation.
+  bool isLegalGatherOrScatter(Value *V) {
+    bool LI = isa<LoadInst>(V);
+    bool SI = isa<StoreInst>(V);
+    if (!LI && !SI)
+      return false;
+    auto *Ty = getMemInstValueType(V);
+    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+  }
+
+  /// Returns true if \p I is an instruction that will be scalarized with
+  /// predication. Such instructions include conditional stores and
+  /// instructions that may divide by zero.
+  bool isScalarWithPredication(Instruction *I);
+
+  /// Returns true if \p I is a memory instruction with consecutive memory
+  /// access that can be widened.
+  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
 private:
+  unsigned NumPredStores = 0;
+
   /// \return An upper bound for the vectorization factor, larger than zero.
   /// One is returned if vectorization should best be avoided due to cost.
   unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);

@@ -2112,6 +2110,10 @@ private:
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
 
+  /// Returns true if an artificially high cost for emulated masked memrefs
+  /// should be used.
+  bool useEmulatedMaskMemRefHack(Instruction *I);
+
   /// Create an analysis remark that explains why vectorization failed
   ///
   /// \p RemarkName is the identifier for the remark. \return the remark object
@@ -5421,14 +5423,22 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
   Scalars[VF].insert(Worklist.begin(), Worklist.end());
 }
 
-bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
-  if (!blockNeedsPredication(I->getParent()))
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
+  if (!Legal->blockNeedsPredication(I->getParent()))
     return false;
   switch(I->getOpcode()) {
   default:
     break;
-  case Instruction::Store:
-    return !isMaskRequired(I);
+  case Instruction::Load:
+  case Instruction::Store: {
+    if (!Legal->isMaskRequired(I))
+      return false;
+    auto *Ptr = getPointerOperand(I);
+    auto *Ty = getMemInstValueType(I);
+    return isa<LoadInst>(I) ?
+        !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
+      : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
+  }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:

@@ -5438,8 +5448,8 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
   return false;
 }
 
-bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                               unsigned VF) {
   // Get and ensure we have a valid memory instruction.
   LoadInst *LI = dyn_cast<LoadInst>(I);
   StoreInst *SI = dyn_cast<StoreInst>(I);

@@ -5448,7 +5458,7 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
   auto *Ptr = getPointerOperand(I);
 
   // In order to be widened, the pointer should be consecutive, first of all.
-  if (!isConsecutivePtr(Ptr))
+  if (!Legal->isConsecutivePtr(Ptr))
     return false;
 
   // If the instruction is a store located in a predicated block, it will be
@@ -5703,39 +5713,26 @@ bool LoopVectorizationLegality::blockCanBePredicated(
       if (!LI)
         return false;
       if (!SafePtrs.count(LI->getPointerOperand())) {
-        if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
-            isLegalMaskedGather(LI->getType())) {
-          MaskedOp.insert(LI);
-          continue;
-        }
         // !llvm.mem.parallel_loop_access implies if-conversion safety.
-        if (IsAnnotatedParallel)
-          continue;
-        return false;
+        // Otherwise, record that the load needs (real or emulated) masking
+        // and let the cost model decide.
+        if (!IsAnnotatedParallel)
+          MaskedOp.insert(LI);
+        continue;
       }
     }
 
     if (I.mayWriteToMemory()) {
       auto *SI = dyn_cast<StoreInst>(&I);
-      // We only support predication of stores in basic blocks with one
-      // predecessor.
       if (!SI)
         return false;
-      // Build a masked store if it is legal for the target.
-      if (isLegalMaskedStore(SI->getValueOperand()->getType(),
-                             SI->getPointerOperand()) ||
-          isLegalMaskedScatter(SI->getValueOperand()->getType())) {
+      // Predicated store requires some form of masking:
+      // 1) masked store HW instruction,
+      // 2) emulation via load-blend-store (only if safe and legal to do so,
+      // be aware on the race conditions), or
+      // 3) element-by-element predicate check and scalar store.
       MaskedOp.insert(SI);
       continue;
-      }
-      bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
-      bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
-      if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
-          !isSinglePredecessor)
-        return false;
     }
     if (I.mayThrow())
       return false;
@@ -6050,13 +6047,6 @@ void InterleavedAccessInfo::analyzeInterleaving(
 }
 
 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
-  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
-    ORE->emit(createMissedAnalysis("ConditionalStore")
-              << "store that is conditionally executed prevents vectorization");
-    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
-    return None;
-  }
-
   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
     // TODO: It may by useful to do since it's still likely to be dynamically
     // uniform if the target can skip.

@@ -6183,9 +6173,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
 VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
   float Cost = expectedCost(1).first;
-#ifndef NDEBUG
   const float ScalarCost = Cost;
-#endif /* NDEBUG */
   unsigned Width = 1;
   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

@@ -6216,6 +6204,14 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
     }
   }
 
+  if (!EnableCondStoresVectorization && NumPredStores) {
+    ORE->emit(createMissedAnalysis("ConditionalStore")
+              << "store that is conditionally executed prevents vectorization");
+    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+    Width = 1;
+    Cost = ScalarCost;
+  }
+
   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
         << "LV: Vectorization seems to be not beneficial, "
         << "but was forced by a user.\n");
@@ -6267,7 +6263,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
       // optimization to non-pointer types.
       //
       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
-          !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
+          !Legal->isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
         continue;
 
       MinWidth = std::min(MinWidth,

@@ -6592,6 +6588,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
   return RUs;
 }
 
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+  // TODO: Cost model for emulated masked load/store is completely
+  // broken. This hack guides the cost model to use an artificially
+  // high enough value to practically disable vectorization with such
+  // operations, except where previously deployed legality hack allowed
+  // using very low cost values. This is to avoid regressions coming simply
+  // from moving "masked load/store" check from legality to cost model.
+  // Masked Load/Gather emulation was previously never allowed.
+  // Limited number of Masked Store/Scatter emulation was allowed.
+  assert(isScalarWithPredication(I) &&
+         "Expecting a scalar emulated instruction");
+  return isa<LoadInst>(I) ||
+         (isa<StoreInst>(I) &&
+          NumPredStores > NumberOfStoresToPredicate);
+}
+
 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
   // If we aren't vectorizing the loop, or if we've already collected the
   // instructions to scalarize, there's nothing to do. Collection may already
@@ -6612,11 +6624,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
     if (!Legal->blockNeedsPredication(BB))
       continue;
     for (Instruction &I : *BB)
-      if (Legal->isScalarWithPredication(&I)) {
+      if (isScalarWithPredication(&I)) {
        ScalarCostsTy ScalarCosts;
-        if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+        // Do not apply discount logic if hacked cost is needed
+        // for emulated masked memrefs.
+        if (!useEmulatedMaskMemRefHack(&I) &&
+            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
          ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
 
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization.insert(BB);
      }

@@ -6651,7 +6665,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
 
     // If the instruction is scalar with predication, it will be analyzed
     // separately. We ignore it within the context of PredInst.
-    if (Legal->isScalarWithPredication(I))
+    if (isScalarWithPredication(I))
      return false;
 
     // If any of the instruction's operands are uniform after vectorization,

@@ -6705,7 +6719,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
 
     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
-    if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                  true, false);
       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
@@ -6848,9 +6862,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // If we have a predicated store, it may not be executed for each vector
   // lane. Scale the cost by the probability of executing the predicated
   // block.
-  if (Legal->isScalarWithPredication(I))
+  if (isScalarWithPredication(I)) {
     Cost /= getReciprocalPredBlockProb();
 
+    if (useEmulatedMaskMemRefHack(I))
+      // Artificially setting to a high enough value to practically disable
+      // vectorization with such operations.
+      Cost = 3000000;
+  }
+
   return Cost;
 }

@@ -6975,6 +6995,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
   if (VF == 1)
     return;
+  NumPredStores = 0;
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the old loop.
     for (Instruction &I : *BB) {
@@ -6982,6 +7003,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       if (!Ptr)
         continue;
 
+      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+        NumPredStores++;
       if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
         // Scalar load + broadcast
         unsigned Cost = getUniformMemOpCost(&I, VF);

@@ -6990,7 +7013,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       }
 
       // We assume that widening is the best solution when possible.
-      if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+      if (memoryInstructionCanBeWidened(&I, VF)) {
         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
         int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&

@@ -7017,7 +7040,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       }
 
       unsigned GatherScatterCost =
-          Legal->isLegalGatherOrScatter(&I)
+          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();
|
||||||
// vector lane. Get the scalarization cost and scale this amount by the
|
// vector lane. Get the scalarization cost and scale this amount by the
|
||||||
// probability of executing the predicated block. If the instruction is not
|
// probability of executing the predicated block. If the instruction is not
|
||||||
// predicated, we fall through to the next case.
|
// predicated, we fall through to the next case.
|
||||||
if (VF > 1 && Legal->isScalarWithPredication(I)) {
|
if (VF > 1 && isScalarWithPredication(I)) {
|
||||||
unsigned Cost = 0;
|
unsigned Cost = 0;
|
||||||
|
|
||||||
// These instructions have a non-void type, so account for the phi nodes
|
// These instructions have a non-void type, so account for the phi nodes
|
||||||
|
@ -7799,7 +7822,7 @@ LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
|
||||||
|
|
||||||
bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
|
bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
|
||||||
VFRange &Range) {
|
VFRange &Range) {
|
||||||
if (Legal->isScalarWithPredication(I))
|
if (CM.isScalarWithPredication(I))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
auto IsVectorizableOpcode = [](unsigned Opcode) {
|
auto IsVectorizableOpcode = [](unsigned Opcode) {
|
||||||
|
@ -7906,7 +7929,7 @@ VPBasicBlock *LoopVectorizationPlanner::handleReplication(
|
||||||
[&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
|
[&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
|
||||||
Range);
|
Range);
|
||||||
|
|
||||||
bool IsPredicated = Legal->isScalarWithPredication(I);
|
bool IsPredicated = CM.isScalarWithPredication(I);
|
||||||
auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
|
auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
|
||||||
|
|
||||||
// Find if I uses a predicated instruction. If so, it will use its scalar
|
// Find if I uses a predicated instruction. If so, it will use its scalar
|
||||||
|
|
|
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -enable-cond-stores-vec=false -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
 ; RUN: opt < %s -enable-cond-stores-vec=false -passes=loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
 
-; CHECK: remark: source.c:2:8: loop not vectorized: store that is conditionally executed prevents vectorization
+; CHECK: remark: source.c:2:8: the cost-model indicates that vectorization is not beneficial
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

@@ -37,8 +37,9 @@ for.end:
 }
 
 ; However, we can't hoist loads whose address we have not seen unconditionally
-; accessed.
+; accessed. One wide load is fine, but not the second.
 ; CHECK-LABEL: @dont_hoist_cond_load(
+; CHECK: load <2 x float>
 ; CHECK-NOT: load <2 x float>
 
 define void @dont_hoist_cond_load() {