[SLP] Improve reductions vectorization.

The pattern matching and vectorization for reductions was not very
effective: some of the possible reduction values were marked as
external arguments, SLP could not find some reduction patterns because
it tried to vectorize pairs of binop arguments too early, and the cost
of constant reductions was not modeled correctly. This patch addresses
these issues and improves the analysis, cost estimation, and
vectorization of reductions.
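
For illustration, a hypothetical C++ example (not taken from the patch or the
test suite) of the kind of horizontal reduction this change targets - a chain
of adds over loaded values plus an extra scalar argument and a constant, where
the loads can become a vector load feeding @llvm.vector.reduce.add while
`Extra` and `42` must be folded back in as scalar `op.rdx` remainders:

// Hypothetical example only; names are illustrative.
// The X[0..3] chain is a vectorizable add reduction; Extra and 42 are the
// "extra arguments" whose handling and cost this patch improves.
int horizontalSum(const int *X, int Extra) {
  return X[0] + X[1] + X[2] + X[3] + Extra + 42;
}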

The most significant changes in SLP.NumVectorInstructions:

Metric: SLP.NumVectorInstructions

Program                                                                                        results  results0 diff
               test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test   920.00  3548.00 285.7%
                test-suite :: SingleSource/Benchmarks/BenchmarkGame/n-body.test    66.00   122.00  84.8%
      test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test   100.00   128.00  28.0%
 test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test   664.00   810.00  22.0%
                 test-suite :: MultiSource/Benchmarks/mafft/pairlocalalign.test   592.00   687.00  16.0%
  test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test   402.00   426.00   6.0%
                   test-suite :: MultiSource/Applications/JM/lencod/lencod.test  1665.00  1745.00   4.8%
  test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test   135.00   139.00   3.0%
 test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test   135.00   139.00   3.0%
                  test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test   388.00   397.00   2.3%
                   test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test   895.00   914.00   2.1%
    test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test   240.00   244.00   1.7%
           test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test   240.00   244.00   1.7%
             test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test   820.00   832.00   1.5%
              test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test   820.00   832.00   1.5%
       test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 14804.00 14914.00   0.7%
                        test-suite :: MultiSource/Benchmarks/Bullet/bullet.test  8125.00  8183.00   0.7%
           test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test  1330.00  1338.00   0.6%
            test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test  1330.00  1338.00   0.6%
         test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test  9832.00  9880.00   0.5%
         test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test  5267.00  5291.00   0.5%
       test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test  4018.00  4024.00   0.1%
      test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test  4018.00  4024.00   0.1%
              test-suite :: External/SPEC/CFP2017speed/644.nab_s/644.nab_s.test   426.00   424.00  -0.5%
               test-suite :: External/SPEC/CFP2017rate/544.nab_r/544.nab_r.test   426.00   424.00  -0.5%
          test-suite :: External/SPEC/CINT2017rate/541.leela_r/541.leela_r.test   201.00   192.00  -4.5%
         test-suite :: External/SPEC/CINT2017speed/641.leela_s/641.leela_s.test   201.00   192.00  -4.5%

644.nab_s and 544.nab_r - fewer shuffles, but a larger number of useful
vectorized instructions.

641.leela_s and 541.leela_r - the function
`@_ZN9FastBoard25get_pattern3_augment_specEiib` is no longer inlined,
but its body gets vectorized successfully. Before, the function was
inlined twice and vectorized just after inlining; now that is no longer
required. The resulting vector code looks much the same as before.

Differential Revision: https://reviews.llvm.org/D111574
Author: Alexey Bataev
Date:   2021-10-05 05:55:48 -07:00
Parent: 1e14b1a797
Commit: 7d8060bc19

14 changed files with 585 additions and 337 deletions


@@ -10582,7 +10582,7 @@ public:
// Estimate cost.
InstructionCost TreeCost = V.getTreeCost(VL);
InstructionCost ReductionCost =
getReductionCost(TTI, VL[0], ReduxWidth, RdxFMF);
getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
InstructionCost Cost = TreeCost + ReductionCost;
if (!Cost.isValid()) {
LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
@@ -10659,45 +10659,78 @@ public:
// Finish the reduction.
// Need to add extra arguments and not vectorized possible reduction
// values.
// Try to avoid dependencies between the scalar remainders after
// reductions.
auto &&FinalGen =
[this, &Builder,
&TrackedVals](ArrayRef<std::pair<Instruction *, Value *>> InstVals) {
unsigned Sz = InstVals.size();
SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
Sz % 2);
for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
Instruction *RedOp = InstVals[I + 1].first;
Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
ReductionOpsListType Ops;
if (auto *Sel = dyn_cast<SelectInst>(RedOp))
Ops.emplace_back().push_back(Sel->getCondition());
Ops.emplace_back().push_back(RedOp);
Value *RdxVal1 = InstVals[I].second;
Value *StableRdxVal1 = RdxVal1;
auto It1 = TrackedVals.find(RdxVal1);
if (It1 != TrackedVals.end())
StableRdxVal1 = It1->second;
Value *RdxVal2 = InstVals[I + 1].second;
Value *StableRdxVal2 = RdxVal2;
auto It2 = TrackedVals.find(RdxVal2);
if (It2 != TrackedVals.end())
StableRdxVal2 = It2->second;
Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
StableRdxVal2, "op.rdx", Ops);
ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
}
if (Sz % 2 == 1)
ExtraReds[Sz / 2] = InstVals.back();
return ExtraReds;
};
SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
SmallPtrSet<Value *, 8> Visited;
for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
ArrayRef<Value *> Candidates = ReducedVals[I];
for (ArrayRef<Value *> Candidates : ReducedVals) {
for (Value *RdxVal : Candidates) {
if (!Visited.insert(RdxVal).second)
continue;
Value *StableRdxVal = RdxVal;
auto TVIt = TrackedVals.find(RdxVal);
if (TVIt != TrackedVals.end())
StableRdxVal = TVIt->second;
unsigned NumOps = VectorizedVals.lookup(RdxVal);
for (Instruction *RedOp :
makeArrayRef(ReducedValsToOps.find(RdxVal)->second)
.drop_back(NumOps)) {
Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
ReductionOpsListType Ops;
if (auto *Sel = dyn_cast<SelectInst>(RedOp))
Ops.emplace_back().push_back(Sel->getCondition());
Ops.emplace_back().push_back(RedOp);
VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
StableRdxVal, "op.rdx", Ops);
}
.drop_back(NumOps))
ExtraReductions.emplace_back(RedOp, RdxVal);
}
}
for (auto &Pair : ExternallyUsedValues) {
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
ReductionOpsListType Ops;
if (auto *Sel = dyn_cast<SelectInst>(I))
Ops.emplace_back().push_back(Sel->getCondition());
Ops.emplace_back().push_back(I);
Value *StableRdxVal = Pair.first;
auto TVIt = TrackedVals.find(Pair.first);
if (TVIt != TrackedVals.end())
StableRdxVal = TVIt->second;
VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
StableRdxVal, "op.rdx", Ops);
}
for (auto *I : Pair.second)
ExtraReductions.emplace_back(I, Pair.first);
}
// Iterate through all not-vectorized reduction values/extra arguments.
while (ExtraReductions.size() > 1) {
SmallVector<std::pair<Instruction *, Value *>> NewReds =
FinalGen(ExtraReductions);
ExtraReductions.swap(NewReds);
}
// Final reduction.
if (ExtraReductions.size() == 1) {
Instruction *RedOp = ExtraReductions.back().first;
Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
ReductionOpsListType Ops;
if (auto *Sel = dyn_cast<SelectInst>(RedOp))
Ops.emplace_back().push_back(Sel->getCondition());
Ops.emplace_back().push_back(RedOp);
Value *RdxVal = ExtraReductions.back().second;
Value *StableRdxVal = RdxVal;
auto It = TrackedVals.find(RdxVal);
if (It != TrackedVals.end())
StableRdxVal = It->second;
VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
StableRdxVal, "op.rdx", Ops);
}
ReductionRoot->replaceAllUsesWith(VectorizedTree);
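
To make the FinalGen change above easier to follow, here is a minimal
standalone sketch of the same pairwise-combining idea in plain C++
(hypothetical names, not the LLVM APIs): instead of folding every leftover
scalar into one serial chain of `op.rdx` instructions, the values are combined
in pairs, halving the list each round, so the dependency chain shrinks from
O(n) to O(log n).

#include <cstddef>
#include <utility>
#include <vector>

// Sketch only: mirrors the loop structure of FinalGen above.
// Precondition: Vals is non-empty.
template <typename T, typename Op>
T reducePairwise(std::vector<T> Vals, Op Combine) {
  while (Vals.size() > 1) {
    std::vector<T> Next;
    Next.reserve(Vals.size() / 2 + Vals.size() % 2);
    for (std::size_t I = 0, E = (Vals.size() / 2) * 2; I < E; I += 2)
      Next.push_back(Combine(Vals[I], Vals[I + 1]));
    if (Vals.size() % 2 == 1) // an odd trailing element carries over unchanged
      Next.push_back(Vals.back());
    Vals = std::move(Next);
  }
  return Vals.front();
}
// E.g. for {a, b, c, d, e} and + this computes ((a + b) + (c + d)) + e
// instead of the serial (((a + b) + c) + d) + e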
@@ -10739,12 +10772,16 @@ public:
private:
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
Value *FirstReducedVal, unsigned ReduxWidth,
FastMathFlags FMF) {
ArrayRef<Value *> ReducedVals,
unsigned ReduxWidth, FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Value *FirstReducedVal = ReducedVals.front();
Type *ScalarTy = FirstReducedVal->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
InstructionCost VectorCost, ScalarCost;
InstructionCost VectorCost = 0, ScalarCost;
// If all of the reduced values are constant, the vector cost is 0, since
// the reduction value can be calculated at the compile time.
bool AllConsts = all_of(ReducedVals, isConstant);
switch (RdxKind) {
case RecurKind::Add:
case RecurKind::Mul:
@@ -10754,17 +10791,22 @@ private:
case RecurKind::FAdd:
case RecurKind::FMul: {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
VectorCost =
TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
if (!AllConsts)
VectorCost =
TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
break;
}
case RecurKind::FMax:
case RecurKind::FMin: {
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
/*IsUnsigned=*/false, CostKind);
if (!AllConsts) {
auto *VecCondTy =
cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
VectorCost =
TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
/*IsUnsigned=*/false, CostKind);
}
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
@@ -10777,11 +10819,14 @@ private:
case RecurKind::UMax:
case RecurKind::UMin: {
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
bool IsUnsigned =
RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned,
CostKind);
if (!AllConsts) {
auto *VecCondTy =
cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
bool IsUnsigned =
RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
IsUnsigned, CostKind);
}
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
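
All of the cost hunks above apply the same guard: when every reduced value is
a compile-time constant, the vector reduction folds away entirely, so its cost
is taken as zero and only the scalar cost participates in the comparison. A
condensed sketch of that logic with simplified types (not the real
TargetTransformInfo interface):

#include <algorithm>
#include <vector>

struct Val { bool IsConstant; };

// Sketch: the vector side of the reduction is free when all inputs are
// constants, since the reduced value can be computed at compile time.
int vectorReductionCost(const std::vector<Val> &ReducedVals,
                        int NonConstVectorCost) {
  bool AllConsts = std::all_of(ReducedVals.begin(), ReducedVals.end(),
                               [](const Val &V) { return V.IsConstant; });
  return AllConsts ? 0 : NonConstVectorCost;
}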


@@ -22,11 +22,11 @@ define i32 @smaxv6() {
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]]
; GFX9-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
; GFX9-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP5]], i32 [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8
; GFX9-NEXT: ret i32 [[OP_EXTRA1]]
; GFX9-NEXT: ret i32 [[OP_RDX1]]
;
%load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -63,11 +63,11 @@ define i64 @sminv6() {
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
; GFX9-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]])
; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]]
; GFX9-NEXT: [[OP_RDX:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
; GFX9-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i64 [[TMP5]], i64 [[SELECT1]]
; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8
; GFX9-NEXT: ret i64 [[OP_EXTRA1]]
; GFX9-NEXT: ret i64 [[OP_RDX1]]
;
%load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
%load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
@@ -206,11 +206,11 @@ define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]]
; GFX9-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
; GFX9-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP3]], i32 [[SELECT1]]
; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8
; GFX9-NEXT: ret i32 [[OP_EXTRA1]]
; GFX9-NEXT: ret i32 [[OP_RDX1]]
;
%vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
%elt1 = extractelement <2 x i32> %vload, i32 0


@@ -17,10 +17,10 @@ define void @mainTest(i32* %ptr) #0 {
; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP3]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP7]], [[TMP4]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP3]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], [[TMP2]]
; CHECK-NEXT: [[OP_RDX3]] = add i32 [[OP_RDX2]], 1
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[TMP2]], 1
; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
; CHECK-NEXT: [[OP_RDX3]] = add i32 [[TMP7]], [[OP_RDX2]]
; CHECK-NEXT: br label [[LOOP]]
; CHECK: bail_out:
; CHECK-NEXT: ret void


@@ -19,8 +19,8 @@ define void @test() #0 {
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
; CHECK-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP7]], [[TMP3]]
; CHECK-NEXT: [[OP_RDX1]] = add i64 [[OP_RDX]], 0
; CHECK-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP3]], 0
; CHECK-NEXT: [[OP_RDX1]] = add i64 [[TMP7]], [[OP_RDX]]
; CHECK-NEXT: br label [[LOOP]]
;
entry:


@@ -40,14 +40,14 @@ define void @Test(i32) {
; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]]
; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]])
; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[TMP0]]
; CHECK-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX3]], [[TMP0]]
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX4]], i32 0
; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0
; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP26]], i32 1
; CHECK-NEXT: [[TMP34:%.*]] = and <2 x i32> [[TMP31]], [[TMP33]]
; CHECK-NEXT: [[TMP35:%.*]] = add <2 x i32> [[TMP31]], [[TMP33]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = and i32 [[TMP0]], [[TMP0]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP26]]
; CHECK-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]]
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> [[TMP31]], i32 [[TMP26]], i32 1
; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX4]], i32 0
; CHECK-NEXT: [[TMP34:%.*]] = and <2 x i32> [[TMP32]], [[TMP33]]
; CHECK-NEXT: [[TMP35:%.*]] = add <2 x i32> [[TMP32]], [[TMP33]]
; CHECK-NEXT: [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: br label [[LOOP]]
;
@@ -88,10 +88,10 @@ define void @Test(i32) {
; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]]
; FORCE_REDUCTION-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]])
; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]]
; FORCE_REDUCTION-NEXT: [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]]
; FORCE_REDUCTION-NEXT: [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[TMP0]]
; FORCE_REDUCTION-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX3]], [[TMP0]]
; FORCE_REDUCTION-NEXT: [[OP_RDX5:%.*]] = and i32 [[OP_RDX4]], [[TMP26]]
; FORCE_REDUCTION-NEXT: [[OP_RDX2:%.*]] = and i32 [[TMP0]], [[TMP0]]
; FORCE_REDUCTION-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP26]]
; FORCE_REDUCTION-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]]
; FORCE_REDUCTION-NEXT: [[OP_RDX5:%.*]] = and i32 [[OP_RDX1]], [[OP_RDX4]]
; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP26]], 14910
; FORCE_REDUCTION-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX5]], i32 0
; FORCE_REDUCTION-NEXT: [[TMP32]] = insertelement <2 x i32> [[TMP31]], i32 [[VAL_43]], i32 1


@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
; // PR42652
; unsigned long bitmask_16xi8(const char *src) {
@@ -15,39 +15,105 @@
; }
define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) {
; CHECK-LABEL: @bitmask_16xi8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
; CHECK-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
; CHECK-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
; CHECK-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192
; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
; CHECK-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0
; CHECK-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384
; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
; CHECK-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
; CHECK-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]])
; CHECK-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_13]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_14]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR_15]]
; CHECK-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX3]], [[OR]]
; CHECK-NEXT: ret i64 [[OP_RDX4]]
; SSE-LABEL: @bitmask_16xi8(
; SSE-NEXT: entry:
; SSE-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
; SSE-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
; SSE-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
; SSE-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
; SSE-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
; SSE-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
; SSE-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
; SSE-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
; SSE-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
; SSE-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192
; SSE-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14
; SSE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
; SSE-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0
; SSE-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384
; SSE-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
; SSE-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
; SSE-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
; SSE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
; SSE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]])
; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]]
; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]]
; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]]
; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]]
; SSE-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]]
; SSE-NEXT: ret i64 [[OP_RDX4]]
;
; AVX-LABEL: @bitmask_16xi8(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
; AVX-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
; AVX-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
; AVX-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
; AVX-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
; AVX-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
; AVX-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
; AVX-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
; AVX-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
; AVX-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192
; AVX-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14
; AVX-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
; AVX-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0
; AVX-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384
; AVX-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
; AVX-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
; AVX-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
; AVX-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
; AVX-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
; AVX-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]])
; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]]
; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]]
; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]]
; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]]
; AVX-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]]
; AVX-NEXT: ret i64 [[OP_RDX4]]
;
; AVX512-LABEL: @bitmask_16xi8(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
; AVX512-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
; AVX512-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
; AVX512-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
; AVX512-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
; AVX512-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
; AVX512-NEXT: [[TMP7:%.*]] = load <2 x i8>, ptr [[ARRAYIDX_13]], align 1
; AVX512-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[TMP7]], zeroinitializer
; AVX512-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i64> zeroinitializer, <2 x i64> <i64 8192, i64 16384>
; AVX512-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
; AVX512-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
; AVX512-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP10]], 0
; AVX512-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
; AVX512-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
; AVX512-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]])
; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP11]], [[TMP12]]
; AVX512-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
; AVX512-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP13]], [[TMP14]]
; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]]
; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]]
; AVX512-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]]
; AVX512-NEXT: ret i64 [[OP_RDX4]]
;
entry:
%0 = load i8, ptr %src, align 1
@@ -132,33 +198,87 @@ entry:
}
define i64 @bitmask_4xi16(ptr nocapture noundef readonly %src) {
; CHECK-LABEL: @bitmask_4xi16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
; CHECK-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
; CHECK-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0
; CHECK-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6
; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
; CHECK-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0
; CHECK-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
; CHECK-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0
; CHECK-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; CHECK-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_6]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_7]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; CHECK-NEXT: ret i64 [[OP_RDX3]]
; SSE-LABEL: @bitmask_4xi16(
; SSE-NEXT: entry:
; SSE-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
; SSE-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0
; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6
; SSE-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0
; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
; SSE-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0
; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
; SSE-NEXT: ret i64 [[OP_RDX3]]
;
; AVX-LABEL: @bitmask_4xi16(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
; AVX-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0
; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6
; AVX-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0
; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
; AVX-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0
; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
; AVX-NEXT: ret i64 [[OP_RDX3]]
;
; AVX512-LABEL: @bitmask_4xi16(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
; AVX512-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_5]], align 2
; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <2 x i16> [[TMP4]], zeroinitializer
; AVX512-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> <i64 32, i64 64>
; AVX512-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
; AVX512-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP7]], 0
; AVX512-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]]
; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]]
; AVX512-NEXT: ret i64 [[OP_RDX3]]
;
entry:
%0 = load i16, ptr %src, align 2
@@ -203,33 +323,87 @@ entry:
}
define i64 @bitmask_8xi32(ptr nocapture noundef readonly %src) {
; CHECK-LABEL: @bitmask_8xi32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
; CHECK-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4
; CHECK-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0
; CHECK-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4
; CHECK-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
; CHECK-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0
; CHECK-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; CHECK-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_6]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_7]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; CHECK-NEXT: ret i64 [[OP_RDX3]]
; SSE-LABEL: @bitmask_8xi32(
; SSE-NEXT: entry:
; SSE-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4
; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0
; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6
; SSE-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4
; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0
; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0
; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
; SSE-NEXT: ret i64 [[OP_RDX3]]
;
; AVX-LABEL: @bitmask_8xi32(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4
; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0
; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6
; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4
; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0
; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0
; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
; AVX-NEXT: ret i64 [[OP_RDX3]]
;
; AVX512-LABEL: @bitmask_8xi32(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
; AVX512-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_5]], align 4
; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[TMP4]], zeroinitializer
; AVX512-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> <i64 32, i64 64>
; AVX512-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
; AVX512-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP7]], 0
; AVX512-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]]
; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]]
; AVX512-NEXT: ret i64 [[OP_RDX3]]
;
entry:
%0 = load i32, ptr %src, align 4
@@ -338,10 +512,10 @@ define i64 @bitmask_8xi64(ptr nocapture noundef readonly %src) {
; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0
; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; SSE4-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_6]]
; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_7]]
; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
; SSE4-NEXT: ret i64 [[OP_RDX3]]
;
; AVX-LABEL: @bitmask_8xi64(
@@ -366,12 +540,38 @@ define i64 @bitmask_8xi64(ptr nocapture noundef readonly %src) {
; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0
; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_6]]
; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_7]]
; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
; AVX-NEXT: ret i64 [[OP_RDX3]]
;
; AVX512-LABEL: @bitmask_8xi64(
; AVX512-NEXT: entry:
; AVX512-NEXT: [[TMP0:%.*]] = load i64, ptr [[SRC:%.*]], align 8
; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i64 [[TMP0]], 0
; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 1
; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_1]], align 8
; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer
; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5
; AVX512-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_5]], align 8
; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <2 x i64> [[TMP4]], zeroinitializer
; AVX512-NEXT: [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> <i64 32, i64 64>
; AVX512-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7
; AVX512-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8
; AVX512-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP7]], 0
; AVX512-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]]
; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]]
; AVX512-NEXT: ret i64 [[OP_RDX3]]
;
entry:
%0 = load i64, ptr %src, align 8
%tobool.not = icmp ne i64 %0, 0


@@ -7,22 +7,20 @@ define i32 @crash_reordering_undefs() {
; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef
; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]]
; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537
; CHECK-NEXT: [[ADD1:%.*]] = add i32 undef, [[ADD0]]
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef
; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537
; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]]
; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef
; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537
; CHECK-NEXT: [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]]
; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD5]], undef
; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD6]], undef
; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[ADD7]], undef
; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]]
; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD10]], undef
; CHECK-NEXT: ret i32 [[ADD11]]
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[ADD0]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]]
; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[TMP0]], [[OP_RDX3]]
; CHECK-NEXT: ret i32 [[OP_RDX4]]
;
entry:
%or0 = or i64 undef, undef


@@ -18,8 +18,8 @@ define float @baz() {
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP4]], [[OP_RDX]]
; CHECK-NEXT: store float [[OP_RDX1]], float* @res, align 4
; CHECK-NEXT: ret float [[OP_RDX1]]
;
@@ -33,8 +33,8 @@ define float @baz() {
; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP4]], [[OP_RDX]]
; THRESHOLD-NEXT: store float [[OP_RDX1]], float* @res, align 4
; THRESHOLD-NEXT: ret float [[OP_RDX1]]
;
@@ -79,24 +79,28 @@ define float @bazz() {
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV6]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP4]], [[OP_RDX]]
; CHECK-NEXT: store float [[OP_RDX1]], float* @res, align 4
; CHECK-NEXT: ret float [[OP_RDX1]]
;
; THRESHOLD-LABEL: @bazz(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
; THRESHOLD-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP0]], i32 1
; THRESHOLD-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP2]], <i32 3, i32 2>
; THRESHOLD-NEXT: [[TMP4:%.*]] = shl nsw <2 x i32> [[TMP2]], <i32 3, i32 2>
; THRESHOLD-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
; THRESHOLD-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float>
; THRESHOLD-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
; THRESHOLD-NEXT: [[TMP8:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
; THRESHOLD-NEXT: [[TMP9:%.*]] = fmul fast <8 x float> [[TMP8]], [[TMP7]]
; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP9]])
; THRESHOLD-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP11]], [[TMP12]]
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP10]], [[OP_RDX]]
; THRESHOLD-NEXT: store float [[OP_RDX1]], float* @res, align 4
; THRESHOLD-NEXT: ret float [[OP_RDX1]]
;
@@ -626,8 +630,8 @@ define float @loadadd31(float* nocapture readonly %x) {
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP6]], [[TMP7]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX1]], [[OP_RDX2]]
; CHECK-NEXT: ret float [[OP_RDX3]]
;
; THRESHOLD-LABEL: @loadadd31(
@@ -649,9 +653,14 @@ define float @loadadd31(float* nocapture readonly %x) {
; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
; THRESHOLD-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[OP_RDX]], i32 0
; THRESHOLD-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP6]], i32 1
; THRESHOLD-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
; THRESHOLD-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP7]], i32 1
; THRESHOLD-NEXT: [[TMP15:%.*]] = fadd fast <2 x float> [[TMP12]], [[TMP14]]
; THRESHOLD-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
; THRESHOLD-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP16]], [[TMP17]]
; THRESHOLD-NEXT: ret float [[OP_RDX3]]
;
entry:
@@ -755,9 +764,9 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP2]], [[OP_RDX1]]
; CHECK-NEXT: ret float [[OP_RDX2]]
;
; THRESHOLD-LABEL: @extra_args(
@@ -767,9 +776,9 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP2]], [[OP_RDX1]]
; THRESHOLD-NEXT: ret float [[OP_RDX2]]
;
entry:
@@ -811,12 +820,11 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 5.000000e+00
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast float [[OP_RDX3]], [[CONV]]
-; CHECK-NEXT: ret float [[OP_RDX4]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float 3.000000e+00, [[CONV]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float 1.000000e+01, [[OP_RDX]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]]
+; CHECK-NEXT: ret float [[OP_RDX3]]
;
; THRESHOLD-LABEL: @extra_args_same_several_times(
; THRESHOLD-NEXT: entry:
@@ -825,12 +833,11 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 5.000000e+00
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
-; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX4:%.*]] = fadd fast float [[OP_RDX3]], [[CONV]]
-; THRESHOLD-NEXT: ret float [[OP_RDX4]]
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float 3.000000e+00, [[CONV]]
+; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float 1.000000e+01, [[OP_RDX]]
+; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]]
+; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]]
+; THRESHOLD-NEXT: ret float [[OP_RDX3]]
;
entry:
%mul = mul nsw i32 %b, %a
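
In @extra_args_same_several_times the two 5.0 operands no longer survive as separate fadds: constant extra operands are folded up front (to 1.0e+01 here), which is one of the const-reduction cost fixes mentioned in the description. A sketch of the folded chain (hypothetical names):

  define float @const_extra_sketch(float %rdx, float %conv) {
    %t0 = fadd fast float 3.000000e+00, %conv
    %t1 = fadd fast float 1.000000e+01, %t0 ; 5.0 + 5.0 pre-folded
    %t2 = fadd fast float %t1, %conv
    %r = fadd fast float %rdx, %t2
    ret float %r
  }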
@@ -874,24 +881,28 @@ define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
+; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]]
; CHECK-NEXT: ret float [[OP_RDX3]]
;
; THRESHOLD-LABEL: @extra_args_no_replace(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
-; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]]
-; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
+; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 1
+; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[MUL]], i32 0
+; THRESHOLD-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x float>
+; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; THRESHOLD-NEXT: [[TMP7:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[TMP6]], i32 0
+; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast <2 x float> [[TMP5]], [[TMP7]]
+; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP9]], [[TMP10]]
+; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]]
; THRESHOLD-NEXT: ret float [[OP_RDX3]]
;
entry:
@@ -991,8 +1002,8 @@ define i32 @wobble(i32 %arg, i32 %bar) {
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add nuw i32 [[OP_RDX]], [[ARG]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = add nuw i32 [[TMP3]], [[ARG]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = add nsw i32 [[TMP6]], [[OP_RDX]]
; CHECK-NEXT: ret i32 [[OP_RDX2]]
;
; THRESHOLD-LABEL: @wobble(
@@ -1006,8 +1017,8 @@ define i32 @wobble(i32 %arg, i32 %bar) {
; THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
; THRESHOLD-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
; THRESHOLD-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[TMP6]], [[TMP3]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = add nuw i32 [[OP_RDX]], [[ARG]]
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = add nuw i32 [[TMP3]], [[ARG]]
+; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = add nsw i32 [[TMP6]], [[OP_RDX]]
; THRESHOLD-NEXT: ret i32 [[OP_RDX2]]
;
bb:
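
@wobble is the integer variant; note that the wrap flags travel with their operations through the reassociation, so the nuw add of the scalar operands is emitted first and the nsw add of the reduced value last. A sketch (hypothetical names):

  define i32 @wobble_tail_sketch(i32 %rdx, i32 %t3, i32 %arg) {
    %t = add nuw i32 %t3, %arg
    %r = add nsw i32 %rdx, %t
    ret i32 %r
  }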

@@ -873,15 +873,15 @@ define i32 @maxi8_mutiple_uses(i32) {
; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
-; AVX-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
-; AVX-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
-; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP4]], i32 3, i32 4
-; AVX-NEXT: store i32 [[TMP14]], i32* @var, align 8
-; AVX-NEXT: ret i32 [[OP_EXTRA1]]
+; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]]
+; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]]
+; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]]
+; AVX-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]]
+; AVX-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]]
+; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX-NEXT: store i32 [[TMP10]], i32* @var, align 8
+; AVX-NEXT: ret i32 [[OP_RDX5]]
;
; AVX2-LABEL: @maxi8_mutiple_uses(
; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -892,15 +892,15 @@ define i32 @maxi8_mutiple_uses(i32) {
; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX2-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
-; AVX2-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
-; AVX2-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
-; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP4]], i32 3, i32 4
-; AVX2-NEXT: store i32 [[TMP14]], i32* @var, align 8
-; AVX2-NEXT: ret i32 [[OP_EXTRA1]]
+; AVX2-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+; AVX2-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]]
+; AVX2-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]]
+; AVX2-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]]
+; AVX2-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]]
+; AVX2-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]]
+; AVX2-NEXT: [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX2-NEXT: store i32 [[TMP10]], i32* @var, align 8
+; AVX2-NEXT: ret i32 [[OP_RDX5]]
;
; THRESH-LABEL: @maxi8_mutiple_uses(
; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
@@ -910,22 +910,22 @@ define i32 @maxi8_mutiple_uses(i32) {
; THRESH-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
-; THRESH-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]]
-; THRESH-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP6]]
-; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
-; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP3]], i32 1
-; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP4]], i32 1
-; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]]
-; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]]
-; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP17]], i32 [[TMP18]]
-; THRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1
-; THRESH-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 3, i32 4
-; THRESH-NEXT: store i32 [[TMP20]], i32* @var, align 8
-; THRESH-NEXT: ret i32 [[OP_EXTRA1]]
+; THRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0
+; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP3]], i32 1
+; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP4]], i32 1
+; THRESH-NEXT: [[TMP13:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP12]]
+; THRESH-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP13]], <2 x i32> [[TMP10]], <2 x i32> [[TMP12]]
+; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; THRESH-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
+; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP15]], i32 [[TMP16]]
+; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP8]], [[OP_RDX3]]
+; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP8]], i32 [[OP_RDX3]]
+; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 3, i32 4
+; THRESH-NEXT: store i32 [[TMP18]], i32* @var, align 8
+; THRESH-NEXT: ret i32 [[OP_RDX5]]
;
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
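
The smax tests get the same reassociation through the icmp+select idiom: the leftover scalar loads are max'ed against each other first, and the @llvm.vector.reduce.smax result is compared last. Each OP_RDX pair above is one step of this shape (hypothetical names):

  ; A single icmp+select smax step; chaining these builds the OP_RDX tail.
  define i32 @smax_step_sketch(i32 %a, i32 %b) {
    %c = icmp sgt i32 %a, %b
    %m = select i1 %c, i32 %a, i32 %b
    ret i32 %m
  }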
@@ -1058,13 +1058,13 @@ define i32 @maxi8_wrong_parent(i32) {
; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
-; AVX-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
-; AVX-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
-; AVX-NEXT: ret i32 [[OP_EXTRA1]]
+; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]]
+; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]]
+; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]]
+; AVX-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]]
+; AVX-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]]
+; AVX-NEXT: ret i32 [[OP_RDX5]]
;
; AVX2-LABEL: @maxi8_wrong_parent(
; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -1077,13 +1077,13 @@ define i32 @maxi8_wrong_parent(i32) {
; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
; AVX2-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
-; AVX2-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
-; AVX2-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
-; AVX2-NEXT: ret i32 [[OP_EXTRA1]]
+; AVX2-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+; AVX2-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]]
+; AVX2-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]]
+; AVX2-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]]
+; AVX2-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]]
+; AVX2-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]]
+; AVX2-NEXT: ret i32 [[OP_RDX5]]
;
; THRESH-LABEL: @maxi8_wrong_parent(
; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
@@ -1093,24 +1093,25 @@ define i32 @maxi8_wrong_parent(i32) {
; THRESH-NEXT: br label [[PP:%.*]]
; THRESH: pp:
; THRESH-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; THRESH-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; THRESH-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; THRESH-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; THRESH-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i1> poison, i1 [[TMP12]], i32 0
-; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1
-; THRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0
-; THRESH-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP3]], i32 1
-; THRESH-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
-; THRESH-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP4]], i32 1
-; THRESH-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP16]], <2 x i32> [[TMP18]]
-; THRESH-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0
-; THRESH-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1
-; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP20]], i32 [[TMP21]]
-; THRESH-NEXT: ret i32 [[OP_EXTRA1]]
+; THRESH-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8
+; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; THRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
+; THRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; THRESH-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i1> poison, i1 [[OP_RDX]], i32 0
+; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i1> [[TMP11]], i1 [[TMP5]], i32 1
+; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP9]], i32 0
+; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP3]], i32 1
+; THRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; THRESH-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP4]], i32 1
+; THRESH-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP14]], <2 x i32> [[TMP16]]
+; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0
+; THRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
+; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
+; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP18]], i32 [[TMP19]]
+; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP8]], [[OP_RDX3]]
+; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP8]], i32 [[OP_RDX3]]
+; THRESH-NEXT: ret i32 [[OP_RDX5]]
;
%2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
%3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -1434,8 +1435,8 @@ define void @PR49730() {
; AVX-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]]
; AVX-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
-; AVX-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
+; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef)
+; AVX-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]])
; AVX-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
; AVX-NEXT: ret void
;
@@ -1444,8 +1445,8 @@ define void @PR49730() {
; AVX2-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]]
; AVX2-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
; AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
-; AVX2-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
+; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef)
+; AVX2-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]])
; AVX2-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
; AVX2-NEXT: ret void
;
@@ -1454,8 +1455,8 @@ define void @PR49730() {
; THRESH-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]]
; THRESH-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
; THRESH-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
-; THRESH-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
-; THRESH-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
+; THRESH-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef)
+; THRESH-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]])
; THRESH-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
; THRESH-NEXT: ret void
;
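
PR49730 exercises the same reordering through the umin intrinsic: the scalar operands are min'ed together before the vector reduction result, so the reduce feeds the final call rather than the first. A sketch (hypothetical names):

  declare i32 @llvm.umin.i32(i32, i32)

  define i32 @umin_tail_sketch(i32 %rdx, i32 %t12, i32 %x) {
    %t0 = call i32 @llvm.umin.i32(i32 %t12, i32 %x)
    %r = call i32 @llvm.umin.i32(i32 %rdx, i32 %t0)
    ret i32 %r
  }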

@@ -172,10 +172,10 @@ define i64 @test_3() #0 {
; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP50]], [[TMP51]]
; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[SHUFFLE]])
; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], [[TMP52]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[OP_RDX1]], [[TMP1]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[OP_RDX2]], [[TMP1]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = mul i32 [[OP_RDX3]], [[TMP1]]
-; CHECK-NEXT: [[OP_RDX5:%.*]] = mul i32 [[OP_RDX4]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX4:%.*]] = mul i32 [[OP_RDX2]], [[OP_RDX3]]
+; CHECK-NEXT: [[OP_RDX5:%.*]] = mul i32 [[OP_RDX1]], [[OP_RDX4]]
; CHECK-NEXT: [[VAL64:%.*]] = add i32 undef, [[OP_RDX5]]
; CHECK-NEXT: [[VAL65:%.*]] = sext i32 [[VAL64]] to i64
; CHECK-NEXT: ret i64 [[VAL65]]
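
In @test_3 the repeated extra operand is now reduced as a balanced tree of muls instead of a linear chain hanging off the reduction, which halves the depth of the tail. A sketch (hypothetical names):

  define i32 @mul_tree_sketch(i32 %rdx, i32 %x) {
    %p0 = mul i32 %x, %x
    %p1 = mul i32 %x, %x
    %p = mul i32 %p0, %p1
    %r = mul i32 %rdx, %p
    ret i32 %r
  }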

@@ -308,8 +308,8 @@ define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) {
; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 [[TMP8]], i1 false
-; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[S2]], i1 false
+; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[S2]], i1 false
+; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX]], i1 false
; CHECK-NEXT: ret i1 [[OP_RDX1]]
;
%x0 = extractelement <4 x i32> %x, i32 0
@@ -398,10 +398,10 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
-; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP9]], i1 [[TMP10]], i1 false
; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
-; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[TMP11]], i1 false
-; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[OP_RDX1]], i1 [[C2]], i1 false
+; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false
+; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false
+; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 [[OP_RDX1]], i1 false
; SSE-NEXT: ret i1 [[OP_RDX2]]
;
; AVX-LABEL: @logical_and_icmp_clamp_partial(
@@ -414,9 +414,9 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
; AVX-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
; AVX-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[C1]], i1 false
-; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C0]], i1 false
-; AVX-NEXT: [[OP_RDX2:%.*]] = select i1 [[OP_RDX1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false
+; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false
+; AVX-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false
; AVX-NEXT: ret i1 [[OP_RDX2]]
;
%x0 = extractelement <4 x i32> %x, i32 0
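
The logical-and reductions reassociate the same way, but through select i1 so the short-circuit semantics are preserved; the frozen @llvm.vector.reduce.and result now guards the chain last. A sketch (hypothetical names):

  define i1 @and_tail_sketch(i1 %rdx, i1 %c0, i1 %c1) {
    %t = select i1 %c0, i1 %c1, i1 false
    %r = select i1 %rdx, i1 %t, i1 false
    ret i1 %r
  }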

@@ -22,25 +22,20 @@ define void @test() {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[I]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[TMP2]], undef
-; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], undef
-; CHECK-NEXT: [[I10:%.*]] = add i32 [[OP_RDX4]], undef
-; CHECK-NEXT: [[I11:%.*]] = add i32 [[OP_RDX4]], [[I10]]
+; CHECK-NEXT: [[OP_RDX6:%.*]] = add i32 [[TMP2]], undef
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[I1]] to <4 x i32>*
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP5]], undef
-; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], undef
-; CHECK-NEXT: [[I18:%.*]] = add i32 [[OP_RDX2]], [[I11]]
-; CHECK-NEXT: [[I19:%.*]] = add i32 [[OP_RDX2]], [[I18]]
-; CHECK-NEXT: [[I20:%.*]] = add i32 undef, [[I19]]
-; CHECK-NEXT: [[I21:%.*]] = add i32 undef, [[I20]]
-; CHECK-NEXT: [[I22:%.*]] = add i32 undef, [[I21]]
-; CHECK-NEXT: [[I23:%.*]] = add i32 undef, [[I22]]
+; CHECK-NEXT: [[OP_RDX5:%.*]] = add i32 [[TMP5]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[OP_RDX6]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX6]], [[OP_RDX5]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[OP_RDX5]]
+; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[TMP6]], [[OP_RDX3]]
; CHECK-NEXT: br label [[IF_END]]
; CHECK: if.end:
-; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I23]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[OP_RDX4]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ]
; CHECK-NEXT: ret void
;
entry:
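
In @test the long scalar chains of adds over undef collapse into a short OP_RDX chain feeding the phi directly, since the reduction values are no longer treated as external arguments. A sketch of the collapsed tail (hypothetical names):

  define i32 @collapsed_tail_sketch(i32 %rdx0, i32 %rdx1) {
    %t0 = add i32 %rdx0, undef
    %t1 = add i32 %rdx0, %rdx1
    %t2 = add i32 %t0, %t1
    ret i32 %t2
  }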

@@ -10,11 +10,9 @@ define void @_Z2azv() local_unnamed_addr {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTSROA_CAST_4]] to <8 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]])
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP2]], undef
-; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP2]], i32 undef
-; CHECK-NEXT: [[OP_EXTRA2:%.*]] = icmp sgt i32 [[OP_EXTRA1]], undef
-; CHECK-NEXT: [[OP_EXTRA3:%.*]] = select i1 [[OP_EXTRA2]], i32 [[OP_EXTRA1]], i32 undef
-; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_EXTRA3]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP2]], undef
+; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP2]], i32 undef
+; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_RDX1]]
; CHECK-NEXT: [[CMP_I1_10:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_9]], undef
; CHECK-NEXT: ret void
;

@@ -35,14 +35,14 @@ define void @n() local_unnamed_addr #0 {
; CHECK-NEXT: [[TMP19:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> [[TMP17]]
; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP20]])
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP21]], [[B_0]]
-; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP21]], i32 [[B_0]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP21]], [[B_0]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP21]], i32 [[B_0]]
; CHECK-NEXT: [[SUB_116:%.*]] = sub i32 [[TMP15]], [[TMP1]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp slt i32 [[SUB_116]], 0
; CHECK-NEXT: [[NEG_117:%.*]] = sub nsw i32 0, [[SUB_116]]
; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[NEG_117]], i32 [[SUB_116]]
-; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 [[TMP23]], [[OP_EXTRA1]]
-; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP23]], i32 [[OP_EXTRA1]]
+; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 [[TMP23]], [[OP_RDX1]]
+; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP23]], i32 [[OP_RDX1]]
; CHECK-NEXT: [[SUB_1_1:%.*]] = sub i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT: [[TMP24:%.*]] = icmp slt i32 [[SUB_1_1]], 0
; CHECK-NEXT: [[NEG_1_1:%.*]] = sub nsw i32 0, [[SUB_1_1]]