forked from OSchip/llvm-project
[SLP] Improve cost model for shuffled extracts.
Improve the cost calculation for shuffled extracts where possible. The extract cost still needs to be calculated for an extracted scalar if some of its users are not insertelement instructions. Also improve the total cost estimation for shuffled scalars used in insertelement build vectors. Differential Revision: https://reviews.llvm.org/D113782
This commit is contained in:
parent
b254c2e2c4
commit
afc9e7517a
|
@ -5401,7 +5401,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
|||
SmallVector<APInt> DemandedElts;
|
||||
for (ExternalUser &EU : ExternalUses) {
|
||||
// We only add extract cost once for the same scalar.
|
||||
if (!ExtractCostCalculated.insert(EU.Scalar).second)
|
||||
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
|
||||
!ExtractCostCalculated.insert(EU.Scalar).second)
|
||||
continue;
|
||||
|
||||
// Uses by ephemeral values are free (because the ephemeral value will be
|
||||
|
@ -5449,7 +5450,22 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
|||
if (It == FirstUsers.end()) {
|
||||
VF.push_back(FTy->getNumElements());
|
||||
ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
|
||||
FirstUsers.push_back(EU.User);
|
||||
// Find the insertvector, vectorized in tree, if any.
|
||||
Value *Base = VU;
|
||||
while (isa<InsertElementInst>(Base)) {
|
||||
// Build the mask for the vectorized insertelement instructions.
|
||||
if (const TreeEntry *E = getTreeEntry(Base)) {
|
||||
VU = Base;
|
||||
do {
|
||||
int Idx = E->findLaneForValue(Base);
|
||||
ShuffleMask.back()[Idx] = Idx;
|
||||
Base = cast<InsertElementInst>(Base)->getOperand(0);
|
||||
} while (E == getTreeEntry(Base));
|
||||
break;
|
||||
}
|
||||
Base = cast<InsertElementInst>(Base)->getOperand(0);
|
||||
}
|
||||
FirstUsers.push_back(VU);
|
||||
DemandedElts.push_back(APInt::getZero(VF.back()));
|
||||
VecId = FirstUsers.size() - 1;
|
||||
} else {
|
||||
|
@ -5458,6 +5474,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
|||
int Idx = *InsertIdx;
|
||||
ShuffleMask[VecId][Idx] = EU.Lane;
|
||||
DemandedElts[VecId].setBit(Idx);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5481,47 +5498,86 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
|||
|
||||
InstructionCost SpillCost = getSpillCost();
|
||||
Cost += SpillCost + ExtractCost;
|
||||
for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
|
||||
// For the very first element - simple shuffle of the source vector.
|
||||
int Limit = ShuffleMask[I].size() * 2;
|
||||
if (I == 0 &&
|
||||
all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) &&
|
||||
!ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) {
|
||||
if (FirstUsers.size() == 1) {
|
||||
int Limit = ShuffleMask.front().size() * 2;
|
||||
if (all_of(ShuffleMask.front(), [Limit](int Idx) { return Idx < Limit; }) &&
|
||||
!ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) {
|
||||
InstructionCost C = TTI->getShuffleCost(
|
||||
TTI::SK_PermuteSingleSrc,
|
||||
cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
|
||||
cast<FixedVectorType>(FirstUsers.front()->getType()),
|
||||
ShuffleMask.front());
|
||||
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
|
||||
<< " for final shuffle of insertelement external users "
|
||||
<< *VectorizableTree.front()->Scalars.front() << ".\n"
|
||||
<< "SLP: Current total cost = " << Cost << "\n");
|
||||
Cost += C;
|
||||
continue;
|
||||
}
|
||||
// Other elements - permutation of 2 vectors (the initial one and the next
|
||||
// Ith incoming vector).
|
||||
unsigned VF = ShuffleMask[I].size();
|
||||
for (unsigned Idx = 0; Idx < VF; ++Idx) {
|
||||
int &Mask = ShuffleMask[I][Idx];
|
||||
Mask = Mask == UndefMaskElem ? Idx : VF + Mask;
|
||||
}
|
||||
InstructionCost C = TTI->getShuffleCost(
|
||||
TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
|
||||
ShuffleMask[I]);
|
||||
LLVM_DEBUG(
|
||||
dbgs()
|
||||
<< "SLP: Adding cost " << C
|
||||
<< " for final shuffle of vector node and external insertelement users "
|
||||
<< *VectorizableTree.front()->Scalars.front() << ".\n"
|
||||
<< "SLP: Current total cost = " << Cost << "\n");
|
||||
Cost += C;
|
||||
InstructionCost InsertCost = TTI->getScalarizationOverhead(
|
||||
cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
|
||||
/*Insert*/ true,
|
||||
/*Extract*/ false);
|
||||
Cost -= InsertCost;
|
||||
cast<FixedVectorType>(FirstUsers.front()->getType()),
|
||||
DemandedElts.front(), /*Insert*/ true, /*Extract*/ false);
|
||||
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
|
||||
<< " for insertelements gather.\n"
|
||||
<< "SLP: Current total cost = " << Cost << "\n");
|
||||
Cost -= InsertCost;
|
||||
} else if (FirstUsers.size() >= 2) {
|
||||
unsigned MaxVF = *std::max_element(VF.begin(), VF.end());
|
||||
// Combined masks of the first 2 vectors.
|
||||
SmallVector<int> CombinedMask(MaxVF, UndefMaskElem);
|
||||
copy(ShuffleMask.front(), CombinedMask.begin());
|
||||
APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF);
|
||||
auto *VecTy = FixedVectorType::get(
|
||||
cast<VectorType>(FirstUsers.front()->getType())->getElementType(),
|
||||
MaxVF);
|
||||
for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) {
|
||||
if (ShuffleMask[1][I] != UndefMaskElem) {
|
||||
CombinedMask[I] = ShuffleMask[1][I] + MaxVF;
|
||||
CombinedDemandedElts.setBit(I);
|
||||
}
|
||||
}
|
||||
InstructionCost C =
|
||||
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
|
||||
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
|
||||
<< " for final shuffle of vector node and external "
|
||||
"insertelement users "
|
||||
<< *VectorizableTree.front()->Scalars.front() << ".\n"
|
||||
<< "SLP: Current total cost = " << Cost << "\n");
|
||||
Cost += C;
|
||||
InstructionCost InsertCost = TTI->getScalarizationOverhead(
|
||||
VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false);
|
||||
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
|
||||
<< " for insertelements gather.\n"
|
||||
<< "SLP: Current total cost = " << Cost << "\n");
|
||||
Cost -= InsertCost;
|
||||
for (int I = 2, E = FirstUsers.size(); I < E; ++I) {
|
||||
// Other elements - permutation of 2 vectors (the initial one and the
|
||||
// next Ith incoming vector).
|
||||
unsigned VF = ShuffleMask[I].size();
|
||||
for (unsigned Idx = 0; Idx < VF; ++Idx) {
|
||||
int Mask = ShuffleMask[I][Idx];
|
||||
if (Mask != UndefMaskElem)
|
||||
CombinedMask[Idx] = MaxVF + Mask;
|
||||
else if (CombinedMask[Idx] != UndefMaskElem)
|
||||
CombinedMask[Idx] = Idx;
|
||||
}
|
||||
for (unsigned Idx = VF; Idx < MaxVF; ++Idx)
|
||||
if (CombinedMask[Idx] != UndefMaskElem)
|
||||
CombinedMask[Idx] = Idx;
|
||||
InstructionCost C =
|
||||
TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
|
||||
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
|
||||
<< " for final shuffle of vector node and external "
|
||||
"insertelement users "
|
||||
<< *VectorizableTree.front()->Scalars.front() << ".\n"
|
||||
<< "SLP: Current total cost = " << Cost << "\n");
|
||||
Cost += C;
|
||||
InstructionCost InsertCost = TTI->getScalarizationOverhead(
|
||||
cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
|
||||
/*Insert*/ true, /*Extract*/ false);
|
||||
LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
|
||||
<< " for insertelements gather.\n"
|
||||
<< "SLP: Current total cost = " << Cost << "\n");
|
||||
Cost -= InsertCost;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
|
|
@ -236,22 +236,23 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
|
|||
|
||||
define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
|
||||
; CHECK-LABEL: @fcmp_ord_uno_v4i32(
|
||||
; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
|
||||
; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
|
||||
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
|
||||
; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
|
||||
; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
|
||||
; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
|
||||
; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
|
||||
; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i32 0
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
|
||||
; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
|
||||
; CHECK-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
|
||||
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
|
||||
; CHECK-NEXT: ret <4 x i32> [[R]]
|
||||
;
|
||||
|
|
|
@ -236,22 +236,23 @@ define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
|
|||
|
||||
define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
|
||||
; CHECK-LABEL: @fcmp_ord_uno_v4i32(
|
||||
; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
|
||||
; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
|
||||
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
|
||||
; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
|
||||
; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
|
||||
; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]]
|
||||
; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
|
||||
; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
|
||||
; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D21]], i1 [[C3]], i32 3
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> <i32 3, i32 0>
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B3]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B0]], i32 1
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = fcmp ord <2 x float> [[TMP5]], [[TMP7]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[D0:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i1> [[TMP4]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[D21:%.*]] = shufflevector <4 x i1> [[D0]], <4 x i1> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
|
||||
; CHECK-NEXT: [[D3:%.*]] = shufflevector <4 x i1> [[D21]], <4 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
|
||||
; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
|
||||
; CHECK-NEXT: ret <4 x i32> [[R]]
|
||||
;
|
||||
|
|
|
@ -54,12 +54,14 @@ define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceabl
|
|||
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
|
||||
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
|
||||
; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4
|
||||
; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP3]], float [[X2]], i32 2
|
||||
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
|
||||
; CHECK-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
|
||||
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I21]], float [[TMP4]], i32 3
|
||||
; CHECK-NEXT: ret <4 x float> [[I3]]
|
||||
;
|
||||
%gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
|
||||
|
|
|
@ -54,12 +54,14 @@ define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceabl
|
|||
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
|
||||
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
|
||||
; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
|
||||
; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4
|
||||
; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP3]], float [[X2]], i32 2
|
||||
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
|
||||
; CHECK-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 5, i32 3>
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
|
||||
; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I21]], float [[TMP4]], i32 3
|
||||
; CHECK-NEXT: ret <4 x float> [[I3]]
|
||||
;
|
||||
%gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
|
||||
|
|
|
@ -30,7 +30,6 @@ define void @test(i32* nocapture %t2) {
|
|||
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
|
||||
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
|
||||
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
|
||||
; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270
|
||||
; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137
|
||||
; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
|
||||
; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
|
||||
|
@ -40,22 +39,24 @@ define void @test(i32* nocapture %t2) {
|
|||
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
|
||||
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
|
||||
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
|
||||
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1
|
||||
; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
|
||||
; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
|
||||
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, i32 [[T9]], i32 0
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
|
||||
; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1
|
||||
; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5
|
||||
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
|
||||
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7
|
||||
; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
|
||||
; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
|
||||
|
|
|
@ -30,7 +30,6 @@ define void @test(i32* nocapture %t2) {
|
|||
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
|
||||
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
|
||||
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
|
||||
; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270
|
||||
; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137
|
||||
; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
|
||||
; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
|
||||
|
@ -40,22 +39,24 @@ define void @test(i32* nocapture %t2) {
|
|||
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
|
||||
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
|
||||
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
|
||||
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1
|
||||
; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
|
||||
; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
|
||||
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T40]], i32 3
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, i32 [[T9]], i32 0
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T47]], i32 3
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0
|
||||
; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[TMP12]], i32 4
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1
|
||||
; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP13]], i32 5
|
||||
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3
|
||||
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP14]], i32 7
|
||||
; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
|
||||
; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
|
||||
|
|
Loading…
Reference in New Issue