forked from OSchip/llvm-project
[SLPVectorizer] Improved support of partial tree vectorization.
Currently the SLP vectorizer tries to vectorize a binary operation and gives up immediately after the first unsuccessful attempt. This patch improves the situation by trying to vectorize all binary operations of all children nodes in the binop tree. Differential Revision: https://reviews.llvm.org/D25517 llvm-svn: 288115
This commit is contained in:
parent
84569e6caa
commit
4fa063ebc9
|
@ -92,6 +92,12 @@ private:
|
|||
/// collected in GEPs.
|
||||
bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
|
||||
|
||||
/// Try to find horizontal reduction or otherwise vectorize a chain of binary
|
||||
/// operators.
|
||||
bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
|
||||
slpvectorizer::BoUpSLP &R,
|
||||
TargetTransformInfo *TTI);
|
||||
|
||||
/// \brief Scan the basic block and look for patterns that are likely to start
|
||||
/// a vectorization chain.
|
||||
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
|
||||
|
|
|
@ -3969,36 +3969,40 @@ bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
|
|||
if (!V)
|
||||
return false;
|
||||
|
||||
Value *P = V->getParent();
|
||||
|
||||
// Vectorize in current basic block only.
|
||||
auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
|
||||
auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
|
||||
if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
|
||||
return false;
|
||||
|
||||
// Try to vectorize V.
|
||||
if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
|
||||
if (tryToVectorizePair(Op0, Op1, R))
|
||||
return true;
|
||||
|
||||
BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
|
||||
BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
|
||||
auto *A = dyn_cast<BinaryOperator>(Op0);
|
||||
auto *B = dyn_cast<BinaryOperator>(Op1);
|
||||
// Try to skip B.
|
||||
if (B && B->hasOneUse()) {
|
||||
BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
|
||||
BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
|
||||
if (tryToVectorizePair(A, B0, R)) {
|
||||
auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
|
||||
auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
|
||||
if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
|
||||
return true;
|
||||
}
|
||||
if (tryToVectorizePair(A, B1, R)) {
|
||||
if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Try to skip A.
|
||||
if (A && A->hasOneUse()) {
|
||||
BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
|
||||
BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
|
||||
if (tryToVectorizePair(A0, B, R)) {
|
||||
auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
|
||||
auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
|
||||
if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
|
||||
return true;
|
||||
}
|
||||
if (tryToVectorizePair(A1, B, R)) {
|
||||
if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
/// \brief Generate a shuffle mask to be used in a reduction tree.
|
||||
|
@ -4449,29 +4453,143 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
namespace {
|
||||
/// Tracks instructons and its children.
|
||||
class WeakVHWithLevel final : public CallbackVH {
|
||||
/// Operand index of the instruction currently beeing analized.
|
||||
unsigned Level = 0;
|
||||
/// Is this the instruction that should be vectorized, or are we now
|
||||
/// processing children (i.e. operands of this instruction) for potential
|
||||
/// vectorization?
|
||||
bool IsInitial = true;
|
||||
|
||||
public:
|
||||
explicit WeakVHWithLevel() = default;
|
||||
WeakVHWithLevel(Value *V) : CallbackVH(V){};
|
||||
/// Restart children analysis each time it is repaced by the new instruction.
|
||||
void allUsesReplacedWith(Value *New) override {
|
||||
setValPtr(New);
|
||||
Level = 0;
|
||||
IsInitial = true;
|
||||
}
|
||||
/// Check if the instruction was not deleted during vectorization.
|
||||
bool isValid() const { return !getValPtr(); }
|
||||
/// Is the istruction itself must be vectorized?
|
||||
bool isInitial() const { return IsInitial; }
|
||||
/// Try to vectorize children.
|
||||
void clearInitial() { IsInitial = false; }
|
||||
/// Are all children processed already?
|
||||
bool isFinal() const {
|
||||
assert(getValPtr() &&
|
||||
(isa<Instruction>(getValPtr()) &&
|
||||
cast<Instruction>(getValPtr())->getNumOperands() >= Level));
|
||||
return getValPtr() &&
|
||||
cast<Instruction>(getValPtr())->getNumOperands() == Level;
|
||||
}
|
||||
/// Get next child operation.
|
||||
Value *nextOperand() {
|
||||
assert(getValPtr() && isa<Instruction>(getValPtr()) &&
|
||||
cast<Instruction>(getValPtr())->getNumOperands() > Level);
|
||||
return cast<Instruction>(getValPtr())->getOperand(Level++);
|
||||
}
|
||||
virtual ~WeakVHWithLevel() = default;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
/// \brief Attempt to reduce a horizontal reduction.
|
||||
/// If it is legal to match a horizontal reduction feeding
|
||||
/// the phi node P with reduction operators BI, then check if it
|
||||
/// can be done.
|
||||
/// the phi node P with reduction operators Root in a basic block BB, then check
|
||||
/// if it can be done.
|
||||
/// \returns true if a horizontal reduction was matched and reduced.
|
||||
/// \returns false if a horizontal reduction was not matched.
|
||||
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
|
||||
BoUpSLP &R, TargetTransformInfo *TTI,
|
||||
unsigned MinRegSize) {
|
||||
static bool canBeVectorized(
|
||||
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
|
||||
TargetTransformInfo *TTI,
|
||||
const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
|
||||
if (!ShouldVectorizeHor)
|
||||
return false;
|
||||
|
||||
HorizontalReduction HorRdx(MinRegSize);
|
||||
if (!HorRdx.matchAssociativeReduction(P, BI))
|
||||
if (!Root)
|
||||
return false;
|
||||
|
||||
// If there is a sufficient number of reduction values, reduce
|
||||
// to a nearby power-of-2. Can safely generate oversized
|
||||
// vectors and rely on the backend to split them to legal sizes.
|
||||
HorRdx.ReduxWidth =
|
||||
std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
|
||||
if (Root->getParent() != BB)
|
||||
return false;
|
||||
SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
|
||||
SmallSet<Value *, 8> VisitedInstrs;
|
||||
bool Res = false;
|
||||
while (!Stack.empty()) {
|
||||
Value *V = Stack.back();
|
||||
if (!V) {
|
||||
Stack.pop_back();
|
||||
continue;
|
||||
}
|
||||
auto *Inst = dyn_cast<Instruction>(V);
|
||||
if (!Inst || isa<PHINode>(Inst)) {
|
||||
Stack.pop_back();
|
||||
continue;
|
||||
}
|
||||
if (Stack.back().isInitial()) {
|
||||
Stack.back().clearInitial();
|
||||
if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
|
||||
HorizontalReduction HorRdx(R.getMinVecRegSize());
|
||||
if (HorRdx.matchAssociativeReduction(P, BI)) {
|
||||
// If there is a sufficient number of reduction values, reduce
|
||||
// to a nearby power-of-2. Can safely generate oversized
|
||||
// vectors and rely on the backend to split them to legal sizes.
|
||||
HorRdx.ReduxWidth =
|
||||
std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
|
||||
|
||||
return HorRdx.tryToReduce(R, TTI);
|
||||
if (HorRdx.tryToReduce(R, TTI)) {
|
||||
Res = true;
|
||||
P = nullptr;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (P) {
|
||||
Inst = dyn_cast<Instruction>(BI->getOperand(0));
|
||||
if (Inst == P)
|
||||
Inst = dyn_cast<Instruction>(BI->getOperand(1));
|
||||
if (!Inst) {
|
||||
P = nullptr;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
P = nullptr;
|
||||
if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
|
||||
Res = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (Stack.back().isFinal()) {
|
||||
Stack.pop_back();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
|
||||
if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
|
||||
Stack.size() < RecursionMaxDepth)
|
||||
Stack.push_back(NextV);
|
||||
}
|
||||
return Res;
|
||||
}
|
||||
|
||||
/// Try to vectorize the expression tree rooted at \p V within \p BB, either
/// as a horizontal reduction feeding \p P or through tryToVectorize().
///
/// \param P   Candidate reduction phi; ignored unless the root is a binary
///            operator.
/// \param V   Root value; anything that is null or not an instruction is
///            rejected.
/// \param BB  Basic block passed through to canBeVectorized().
/// \param R   Vectorizer tree state handed to the vectorization attempts.
/// \param TTI Target information passed through to canBeVectorized().
/// \returns the result of canBeVectorized(), or false when \p V is rejected.
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 TargetTransformInfo *TTI) {
  // Reject both a missing value and a value that is not an instruction.
  Instruction *Root = V ? dyn_cast<Instruction>(V) : nullptr;
  if (!Root)
    return false;

  // The phi is only forwarded when the root itself is a binary operator.
  PHINode *ReductionPhi = isa<BinaryOperator>(Root) ? P : nullptr;

  // Fallback used by canBeVectorized() for plain binary-operator chains.
  auto VectorizeBinOp = [this](BinaryOperator *BinOp, BoUpSLP &Tree) -> bool {
    return tryToVectorize(BinOp, Tree);
  };

  // Try to match and vectorize a horizontal reduction.
  return canBeVectorized(ReductionPhi, Root, BB, R, TTI, VectorizeBinOp);
}
|
||||
|
||||
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
|
||||
|
@ -4541,67 +4659,42 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
|
|||
if (P->getNumIncomingValues() != 2)
|
||||
return Changed;
|
||||
|
||||
Value *Rdx = getReductionValue(DT, P, BB, LI);
|
||||
|
||||
// Check if this is a Binary Operator.
|
||||
BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
|
||||
if (!BI)
|
||||
continue;
|
||||
|
||||
// Try to match and vectorize a horizontal reduction.
|
||||
if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
|
||||
if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
|
||||
TTI)) {
|
||||
Changed = true;
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
continue;
|
||||
}
|
||||
|
||||
Value *Inst = BI->getOperand(0);
|
||||
if (Inst == P)
|
||||
Inst = BI->getOperand(1);
|
||||
|
||||
if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
|
||||
// We would like to start over since some instructions are deleted
|
||||
// and the iterator may become invalid value.
|
||||
Changed = true;
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
continue;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ShouldStartVectorizeHorAtStore)
|
||||
if (StoreInst *SI = dyn_cast<StoreInst>(it))
|
||||
if (BinaryOperator *BinOp =
|
||||
dyn_cast<BinaryOperator>(SI->getValueOperand())) {
|
||||
if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
|
||||
R.getMinVecRegSize()) ||
|
||||
tryToVectorize(BinOp, R)) {
|
||||
Changed = true;
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
continue;
|
||||
}
|
||||
if (ShouldStartVectorizeHorAtStore) {
|
||||
if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
|
||||
// Try to match and vectorize a horizontal reduction.
|
||||
if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
|
||||
TTI)) {
|
||||
Changed = true;
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try to vectorize horizontal reductions feeding into a return.
|
||||
if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
|
||||
if (RI->getNumOperands() != 0)
|
||||
if (BinaryOperator *BinOp =
|
||||
dyn_cast<BinaryOperator>(RI->getOperand(0))) {
|
||||
DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
|
||||
if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
|
||||
R.getMinVecRegSize()) ||
|
||||
tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
|
||||
R)) {
|
||||
Changed = true;
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
continue;
|
||||
}
|
||||
if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
|
||||
if (RI->getNumOperands() != 0) {
|
||||
// Try to match and vectorize a horizontal reduction.
|
||||
if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
|
||||
Changed = true;
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try to vectorize trees that start at compare instructions.
|
||||
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
|
||||
|
@ -4614,16 +4707,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
|
|||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
|
||||
if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
|
||||
Changed = true;
|
||||
// We would like to start over since some instructions are deleted
|
||||
// and the iterator may become invalid value.
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
break;
|
||||
}
|
||||
for (int I = 0; I < 2; ++I) {
|
||||
if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
|
||||
Changed = true;
|
||||
// We would like to start over since some instructions are deleted
|
||||
// and the iterator may become invalid value.
|
||||
it = BB->begin();
|
||||
e = BB->end();
|
||||
break;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
|
|
|
@ -11,26 +11,25 @@ define float @baz() {
|
|||
; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
|
||||
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
|
||||
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
|
||||
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
|
||||
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP8]], [[ADD_1]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
|
||||
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP9]], [[ADD_2]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
|
||||
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
|
||||
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
|
||||
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
|
||||
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
|
||||
; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]]
|
||||
; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL4]], [[ADD7]]
|
||||
; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD19]]
|
||||
; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP8]], [[ADD19_1]]
|
||||
; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP9]], [[ADD19_2]]
|
||||
; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]]
|
||||
; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]]
|
||||
; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]]
|
||||
; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]]
|
||||
; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
|
||||
; CHECK-NEXT: ret float [[ADD19_3]]
|
||||
;
|
||||
|
@ -68,40 +67,37 @@ define float @bazz() {
|
|||
; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
|
||||
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
|
||||
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
|
||||
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
|
||||
; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
|
||||
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]]
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
|
||||
; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
|
||||
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
|
||||
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
|
||||
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
|
||||
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
|
||||
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
|
||||
; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
|
||||
; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
|
||||
; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4), align 16
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4), align 16
|
||||
; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
|
||||
; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 5), align 4
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 5), align 4
|
||||
; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]]
|
||||
; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP14]], [[TMP13]]
|
||||
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
|
||||
; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP16]], [[ADD19_1]]
|
||||
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
|
||||
; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP17]], [[ADD19_2]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
|
||||
; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
|
||||
; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
|
||||
; CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
|
||||
; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
|
||||
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
|
||||
; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
|
||||
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
|
||||
; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
|
||||
; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
|
||||
; CHECK-NEXT: ret float [[ADD19_3]]
|
||||
;
|
||||
|
@ -152,24 +148,20 @@ define float @bazzz() {
|
|||
; CHECK-LABEL: @bazzz(
|
||||
; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
|
||||
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
|
||||
; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
|
||||
; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]]
|
||||
; CHECK-NEXT: store float [[TMP12]], float* @res, align 4
|
||||
; CHECK-NEXT: ret float [[TMP12]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
|
||||
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
|
||||
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
|
||||
; CHECK-NEXT: store float [[TMP8]], float* @res, align 4
|
||||
; CHECK-NEXT: ret float [[TMP8]]
|
||||
;
|
||||
entry:
|
||||
%0 = load i32, i32* @n, align 4
|
||||
|
@ -198,23 +190,19 @@ define i32 @foo() {
|
|||
; CHECK-LABEL: @foo(
|
||||
; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
|
||||
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
|
||||
; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
|
||||
; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]]
|
||||
; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP12]] to i32
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
|
||||
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
|
||||
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
|
||||
; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32
|
||||
; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4
|
||||
; CHECK-NEXT: ret i32 [[CONV4]]
|
||||
;
|
||||
|
@ -244,8 +232,7 @@ entry:
|
|||
|
||||
define float @bar() {
|
||||
; CHECK-LABEL: @bar(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
|
||||
; CHECK: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
|
||||
|
|
Loading…
Reference in New Issue