forked from OSchip/llvm-project
Revert "Reapply commit r258404 with fix"
This reverts commit r258929, it caused PR26364. llvm-svn: 259148
This commit is contained in:
parent
c516ae1719
commit
b2416bd2a7
|
@ -15,24 +15,22 @@
|
|||
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "llvm/Transforms/Vectorize.h"
|
||||
#include "llvm/ADT/MapVector.h"
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/PostOrderIterator.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/Analysis/AliasAnalysis.h"
|
||||
#include "llvm/Analysis/GlobalsModRef.h"
|
||||
#include "llvm/Analysis/AssumptionCache.h"
|
||||
#include "llvm/Analysis/CodeMetrics.h"
|
||||
#include "llvm/Analysis/DemandedBits.h"
|
||||
#include "llvm/Analysis/GlobalsModRef.h"
|
||||
#include "llvm/Analysis/LoopAccessAnalysis.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/LoopAccessAnalysis.h"
|
||||
#include "llvm/Analysis/ScalarEvolution.h"
|
||||
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/Analysis/ValueTracking.h"
|
||||
#include "llvm/Analysis/VectorUtils.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Dominators.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
|
@ -47,7 +45,7 @@
|
|||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Transforms/Vectorize.h"
|
||||
#include "llvm/Analysis/VectorUtils.h"
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
|
@ -366,9 +364,9 @@ public:
|
|||
|
||||
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
|
||||
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
|
||||
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB)
|
||||
DominatorTree *Dt, AssumptionCache *AC)
|
||||
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
|
||||
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
|
||||
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
|
||||
Builder(Se->getContext()) {
|
||||
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
|
||||
}
|
||||
|
@ -402,7 +400,6 @@ public:
|
|||
BlockScheduling *BS = Iter.second.get();
|
||||
BS->clear();
|
||||
}
|
||||
MinBWs.clear();
|
||||
}
|
||||
|
||||
/// \brief Perform LICM and CSE on the newly generated gather sequences.
|
||||
|
@ -420,10 +417,6 @@ public:
|
|||
/// vectorization factors.
|
||||
unsigned getVectorElementSize(Value *V);
|
||||
|
||||
/// Compute the minimum type sizes required to represent the entries in a
|
||||
/// vectorizable tree.
|
||||
void computeMinimumValueSizes();
|
||||
|
||||
private:
|
||||
struct TreeEntry;
|
||||
|
||||
|
@ -921,14 +914,8 @@ private:
|
|||
AliasAnalysis *AA;
|
||||
LoopInfo *LI;
|
||||
DominatorTree *DT;
|
||||
AssumptionCache *AC;
|
||||
DemandedBits *DB;
|
||||
/// Instruction builder to construct the vectorized tree.
|
||||
IRBuilder<> Builder;
|
||||
|
||||
/// A map of scalar integer values to the smallest bit width with which they
|
||||
/// can legally be represented.
|
||||
MapVector<Value *, uint64_t> MinBWs;
|
||||
};
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
@ -1484,12 +1471,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
ScalarTy = SI->getValueOperand()->getType();
|
||||
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
|
||||
|
||||
// If we have computed a smaller type for the expression, update VecTy so
|
||||
// that the costs will be accurate.
|
||||
if (MinBWs.count(VL[0]))
|
||||
VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
|
||||
VL.size());
|
||||
|
||||
if (E->NeedToGather) {
|
||||
if (allConstant(VL))
|
||||
return 0;
|
||||
|
@ -1818,19 +1799,9 @@ int BoUpSLP::getTreeCost() {
|
|||
if (EphValues.count(EU.User))
|
||||
continue;
|
||||
|
||||
// If we plan to rewrite the tree in a smaller type, we will need to sign
|
||||
// extend the extracted value back to the original type. Here, we account
|
||||
// for the extract and the added cost of the sign extend if needed.
|
||||
auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
|
||||
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
|
||||
if (MinBWs.count(ScalarRoot)) {
|
||||
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
|
||||
VecTy = VectorType::get(MinTy, BundleWidth);
|
||||
ExtractCost +=
|
||||
TTI->getCastInstrCost(Instruction::SExt, EU.Scalar->getType(), MinTy);
|
||||
}
|
||||
ExtractCost +=
|
||||
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
|
||||
VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
|
||||
ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
|
||||
EU.Lane);
|
||||
}
|
||||
|
||||
Cost += getSpillCost();
|
||||
|
@ -2528,21 +2499,7 @@ Value *BoUpSLP::vectorizeTree() {
|
|||
}
|
||||
|
||||
Builder.SetInsertPoint(&F->getEntryBlock().front());
|
||||
auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
|
||||
|
||||
// If the vectorized tree can be rewritten in a smaller type, we truncate the
|
||||
// vectorized root. InstCombine will then rewrite the entire expression. We
|
||||
// sign extend the extracted values below.
|
||||
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
|
||||
if (MinBWs.count(ScalarRoot)) {
|
||||
if (auto *I = dyn_cast<Instruction>(VectorRoot))
|
||||
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
|
||||
auto BundleWidth = VectorizableTree[0].Scalars.size();
|
||||
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
|
||||
auto *VecTy = VectorType::get(MinTy, BundleWidth);
|
||||
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
|
||||
VectorizableTree[0].VectorizedValue = Trunc;
|
||||
}
|
||||
vectorizeTree(&VectorizableTree[0]);
|
||||
|
||||
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
|
||||
|
||||
|
@ -2575,8 +2532,6 @@ Value *BoUpSLP::vectorizeTree() {
|
|||
if (PH->getIncomingValue(i) == Scalar) {
|
||||
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
|
||||
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
|
||||
if (MinBWs.count(ScalarRoot))
|
||||
Ex = Builder.CreateSExt(Ex, Scalar->getType());
|
||||
CSEBlocks.insert(PH->getIncomingBlock(i));
|
||||
PH->setOperand(i, Ex);
|
||||
}
|
||||
|
@ -2584,16 +2539,12 @@ Value *BoUpSLP::vectorizeTree() {
|
|||
} else {
|
||||
Builder.SetInsertPoint(cast<Instruction>(User));
|
||||
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
|
||||
if (MinBWs.count(ScalarRoot))
|
||||
Ex = Builder.CreateSExt(Ex, Scalar->getType());
|
||||
CSEBlocks.insert(cast<Instruction>(User)->getParent());
|
||||
User->replaceUsesOfWith(Scalar, Ex);
|
||||
}
|
||||
} else {
|
||||
Builder.SetInsertPoint(&F->getEntryBlock().front());
|
||||
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
|
||||
if (MinBWs.count(ScalarRoot))
|
||||
Ex = Builder.CreateSExt(Ex, Scalar->getType());
|
||||
CSEBlocks.insert(&F->getEntryBlock());
|
||||
User->replaceUsesOfWith(Scalar, Ex);
|
||||
}
|
||||
|
@ -3162,7 +3113,7 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
|
|||
// If the current instruction is a load, update MaxWidth to reflect the
|
||||
// width of the loaded value.
|
||||
else if (isa<LoadInst>(I))
|
||||
MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty));
|
||||
MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
|
||||
|
||||
// Otherwise, we need to visit the operands of the instruction. We only
|
||||
// handle the interesting cases from buildTree here. If an operand is an
|
||||
|
@ -3189,166 +3140,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
|
|||
return MaxWidth;
|
||||
}
|
||||
|
||||
// Determine if a value V in a vectorizable expression Expr can be demoted to a
|
||||
// smaller type with a truncation. We collect the values that will be demoted
|
||||
// in ToDemote and additional roots that require investigating in Roots.
|
||||
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
|
||||
SmallVectorImpl<Value *> &ToDemote,
|
||||
SmallVectorImpl<Value *> &Roots) {
|
||||
|
||||
// We can always demote constants.
|
||||
if (isa<Constant>(V)) {
|
||||
ToDemote.push_back(V);
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the value is not an instruction in the expression with only one use, it
|
||||
// cannot be demoted.
|
||||
auto *I = dyn_cast<Instruction>(V);
|
||||
if (!I || !I->hasOneUse() || !Expr.count(I))
|
||||
return false;
|
||||
|
||||
switch (I->getOpcode()) {
|
||||
|
||||
// We can always demote truncations and extensions. Since truncations can
|
||||
// seed additional demotion, we save the truncated value.
|
||||
case Instruction::Trunc:
|
||||
Roots.push_back(I->getOperand(0));
|
||||
case Instruction::ZExt:
|
||||
case Instruction::SExt:
|
||||
break;
|
||||
|
||||
// We can demote certain binary operations if we can demote both of their
|
||||
// operands.
|
||||
case Instruction::Add:
|
||||
case Instruction::Sub:
|
||||
case Instruction::Mul:
|
||||
case Instruction::And:
|
||||
case Instruction::Or:
|
||||
case Instruction::Xor:
|
||||
if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
|
||||
!collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
|
||||
return false;
|
||||
break;
|
||||
|
||||
// We can demote selects if we can demote their true and false values.
|
||||
case Instruction::Select: {
|
||||
SelectInst *SI = cast<SelectInst>(I);
|
||||
if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
|
||||
!collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
|
||||
// We can demote phis if we can demote all their incoming operands. Note that
|
||||
// we don't need to worry about cycles since we ensure single use above.
|
||||
case Instruction::PHI: {
|
||||
PHINode *PN = cast<PHINode>(I);
|
||||
for (Value *IncValue : PN->incoming_values())
|
||||
if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
|
||||
// Otherwise, conservatively give up.
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
// Record the value that we can demote.
|
||||
ToDemote.push_back(V);
|
||||
return true;
|
||||
}
|
||||
|
||||
void BoUpSLP::computeMinimumValueSizes() {
|
||||
auto &DL = F->getParent()->getDataLayout();
|
||||
|
||||
// If there are no external uses, the expression tree must be rooted by a
|
||||
// store. We can't demote in-memory values, so there is nothing to do here.
|
||||
if (ExternalUses.empty())
|
||||
return;
|
||||
|
||||
// We only attempt to truncate integer expressions.
|
||||
auto &TreeRoot = VectorizableTree[0].Scalars;
|
||||
auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
|
||||
if (!TreeRootIT)
|
||||
return;
|
||||
|
||||
// If the expression is not rooted by a store, these roots should have
|
||||
// external uses. We will rely on InstCombine to rewrite the expression in
|
||||
// the narrower type. However, InstCombine only rewrites single-use values.
|
||||
// This means that if a tree entry other than a root is used externally, it
|
||||
// must have multiple uses and InstCombine will not rewrite it. The code
|
||||
// below ensures that only the roots are used externally.
|
||||
SmallPtrSet<Value *, 16> Expr(TreeRoot.begin(), TreeRoot.end());
|
||||
for (auto &EU : ExternalUses)
|
||||
if (!Expr.erase(EU.Scalar))
|
||||
return;
|
||||
if (!Expr.empty())
|
||||
return;
|
||||
|
||||
// Collect the scalar values in one lane of the vectorizable expression. We
|
||||
// will use this context to determine which values can be demoted. If we see
|
||||
// a truncation, we mark it as seeding another demotion.
|
||||
for (auto &Entry : VectorizableTree)
|
||||
Expr.insert(Entry.Scalars[0]);
|
||||
|
||||
// Conservatively determine if we can actually truncate the root of the
|
||||
// expression. Collect the values that can be demoted in ToDemote and
|
||||
// additional roots that require investigating in Roots.
|
||||
SmallVector<Value *, 32> ToDemote;
|
||||
SmallVector<Value *, 2> Roots;
|
||||
if (!collectValuesToDemote(TreeRoot[0], Expr, ToDemote, Roots))
|
||||
return;
|
||||
|
||||
// The maximum bit width required to represent all the values that can be
|
||||
// demoted without loss of precision. It would be safe to truncate the root
|
||||
// of the expression to this width.
|
||||
auto MaxBitWidth = 8u;
|
||||
|
||||
// We first check if all the bits of the root are demanded. If they're not,
|
||||
// we can truncate the root to this narrower type.
|
||||
auto Mask = DB->getDemandedBits(cast<Instruction>(TreeRoot[0]));
|
||||
if (Mask.countLeadingZeros() > 0)
|
||||
MaxBitWidth = std::max<unsigned>(
|
||||
Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
|
||||
|
||||
// If all the bits of the root are demanded, we can try a little harder to
|
||||
// compute a narrower type. This can happen, for example, if the roots are
|
||||
// getelementptr indices. InstCombine promotes these indices to the pointer
|
||||
// width. Thus, all their bits are technically demanded even though the
|
||||
// address computation might be vectorized in a smaller type.
|
||||
//
|
||||
// We start by looking at each entry that can be demoted. We compute the
|
||||
// maximum bit width required to store the scalar by using ValueTracking to
|
||||
// compute the number of high-order bits we can truncate.
|
||||
else
|
||||
for (auto *Scalar : ToDemote) {
|
||||
auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
|
||||
auto NumTypeBits = DL.getTypeSizeInBits(Scalar->getType());
|
||||
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
|
||||
}
|
||||
|
||||
// Round MaxBitWidth up to the next power-of-two.
|
||||
if (!isPowerOf2_64(MaxBitWidth))
|
||||
MaxBitWidth = NextPowerOf2(MaxBitWidth);
|
||||
|
||||
// If the maximum bit width we compute is less than the with of the roots'
|
||||
// type, we can proceed with the narrowing. Otherwise, do nothing.
|
||||
if (MaxBitWidth >= TreeRootIT->getBitWidth())
|
||||
return;
|
||||
|
||||
// If we can truncate the root, we must collect additional values that might
|
||||
// be demoted as a result. That is, those seeded by truncations we will
|
||||
// modify.
|
||||
while (!Roots.empty())
|
||||
collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
|
||||
|
||||
// Finally, map the values we can demote to the maximum bit with we computed.
|
||||
for (auto *Scalar : ToDemote)
|
||||
MinBWs[Scalar] = MaxBitWidth;
|
||||
}
|
||||
|
||||
/// The SLPVectorizer Pass.
|
||||
struct SLPVectorizer : public FunctionPass {
|
||||
typedef SmallVector<StoreInst *, 8> StoreList;
|
||||
|
@ -3370,7 +3161,6 @@ struct SLPVectorizer : public FunctionPass {
|
|||
LoopInfo *LI;
|
||||
DominatorTree *DT;
|
||||
AssumptionCache *AC;
|
||||
DemandedBits *DB;
|
||||
|
||||
bool runOnFunction(Function &F) override {
|
||||
if (skipOptnoneFunction(F))
|
||||
|
@ -3384,7 +3174,6 @@ struct SLPVectorizer : public FunctionPass {
|
|||
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
||||
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
|
||||
DB = &getAnalysis<DemandedBits>();
|
||||
|
||||
Stores.clear();
|
||||
GEPs.clear();
|
||||
|
@ -3414,7 +3203,7 @@ struct SLPVectorizer : public FunctionPass {
|
|||
|
||||
// Use the bottom up slp vectorizer to construct chains that start with
|
||||
// store instructions.
|
||||
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB);
|
||||
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
|
||||
|
||||
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
|
||||
// delete instructions.
|
||||
|
@ -3457,7 +3246,6 @@ struct SLPVectorizer : public FunctionPass {
|
|||
AU.addRequired<TargetTransformInfoWrapperPass>();
|
||||
AU.addRequired<LoopInfoWrapperPass>();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addRequired<DemandedBits>();
|
||||
AU.addPreserved<LoopInfoWrapperPass>();
|
||||
AU.addPreserved<DominatorTreeWrapperPass>();
|
||||
AU.addPreserved<AAResultsWrapperPass>();
|
||||
|
@ -3562,7 +3350,6 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
|
|||
ArrayRef<Value *> Operands = Chain.slice(i, VF);
|
||||
|
||||
R.buildTree(Operands);
|
||||
R.computeMinimumValueSizes();
|
||||
|
||||
int Cost = R.getTreeCost();
|
||||
|
||||
|
@ -3762,7 +3549,6 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
|
|||
Value *ReorderedOps[] = { Ops[1], Ops[0] };
|
||||
R.buildTree(ReorderedOps, None);
|
||||
}
|
||||
R.computeMinimumValueSizes();
|
||||
int Cost = R.getTreeCost();
|
||||
|
||||
if (Cost < -SLPCostThreshold) {
|
||||
|
@ -4029,7 +3815,6 @@ public:
|
|||
|
||||
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
|
||||
V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
|
||||
V.computeMinimumValueSizes();
|
||||
|
||||
// Estimate cost.
|
||||
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=PROFITABLE
|
||||
; RUN: opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine < %s | FileCheck %s --check-prefix=UNPROFITABLE
|
||||
; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
|
||||
|
||||
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
|
||||
target triple = "aarch64--linux-gnu"
|
||||
|
@ -19,13 +18,13 @@ target triple = "aarch64--linux-gnu"
|
|||
; return sum;
|
||||
; }
|
||||
|
||||
; PROFITABLE-LABEL: @gather_reduce_8x16_i32
|
||||
; CHECK-LABEL: @gather_reduce_8x16_i32
|
||||
;
|
||||
; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
|
||||
; PROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
|
||||
; PROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
|
||||
; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
|
||||
; PROFITABLE: sext i32 [[X]] to i64
|
||||
; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
|
||||
; CHECK: zext <8 x i16> [[L]] to <8 x i32>
|
||||
; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
|
||||
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
|
||||
; CHECK: sext i32 [[X]] to i64
|
||||
;
|
||||
define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
|
||||
entry:
|
||||
|
@ -138,18 +137,14 @@ for.body:
|
|||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
|
||||
}
|
||||
|
||||
; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64
|
||||
; CHECK-LABEL: @gather_reduce_8x16_i64
|
||||
;
|
||||
; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
|
||||
; UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
|
||||
; UNPROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
|
||||
; UNPROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
|
||||
; UNPROFITABLE: sext i32 [[X]] to i64
|
||||
; CHECK-NOT: load <8 x i16>
|
||||
;
|
||||
; TODO: Although we can now vectorize this case while converting the i64
|
||||
; subtractions to i32, the cost model currently finds vectorization to be
|
||||
; unprofitable. The cost model is penalizing the sign and zero
|
||||
; extensions in the vectorized version, but they are actually free.
|
||||
; FIXME: We are currently unable to vectorize the case with i64 subtraction
|
||||
; because the zero extensions are too expensive. The solution here is to
|
||||
; convert the i64 subtractions to i32 subtractions during vectorization.
|
||||
; This would then match the case above.
|
||||
;
|
||||
define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
|
||||
entry:
|
||||
|
|
Loading…
Reference in New Issue