[SLP]Improve reductions vectorization.
The pattern matching and vectorization for reductions was not very effective. Some of the possible reduction values were marked as external arguments, SLP could not find some reduction patterns because of a too-early attempt to vectorize pairs of binop arguments, and the cost of constant reductions was not computed correctly. This patch addresses these issues and improves the analysis, cost estimation, and vectorization of reductions.

The most significant changes in SLP.NumVectorInstructions:

Metric: SLP.NumVectorInstructions

Program                                                                           results   results0  diff
test-suite :: SingleSource/Benchmarks/Adobe-C++/loop_unroll.test                   920.00    3548.00  285.7%
test-suite :: SingleSource/Benchmarks/BenchmarkGame/n-body.test                     66.00     122.00   84.8%
test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test          100.00     128.00   28.0%
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test     664.00     810.00   22.0%
test-suite :: MultiSource/Benchmarks/mafft/pairlocalalign.test                     592.00     687.00   16.0%
test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test      402.00     426.00    6.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test                      1665.00    1745.00    4.8%
test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test      135.00     139.00    3.0%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test     135.00     139.00    3.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test                      388.00     397.00    2.3%
test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test                       895.00     914.00    2.1%
test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test        240.00     244.00    1.7%
test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test               240.00     244.00    1.7%
test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test                 820.00     832.00    1.5%
test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test                  820.00     832.00    1.5%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test         14804.00   14914.00    0.7%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test                           8125.00    8183.00    0.7%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test              1330.00    1338.00    0.6%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test               1330.00    1338.00    0.6%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test            9832.00    9880.00    0.5%
test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test            5267.00    5291.00    0.5%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test          4018.00    4024.00    0.1%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test         4018.00    4024.00    0.1%
test-suite :: External/SPEC/CFP2017speed/644.nab_s/644.nab_s.test                  426.00     424.00   -0.5%
test-suite :: External/SPEC/CFP2017rate/544.nab_r/544.nab_r.test                   426.00     424.00   -0.5%
test-suite :: External/SPEC/CINT2017rate/541.leela_r/541.leela_r.test              201.00     192.00   -4.5%
test-suite :: External/SPEC/CINT2017speed/641.leela_s/641.leela_s.test             201.00     192.00   -4.5%

644.nab_s and 544.nab_r - reduced number of shuffles but increased number of useful vectorized instructions.

641.leela_s and 541.leela_r - the function `@_ZN9FastBoard25get_pattern3_augment_specEiib` is no longer inlined, but its body gets vectorized successfully. Before, the function was inlined twice and vectorized just after inlining; now that is not required. The vector code looks much the same as it did before.

Differential Revision: https://reviews.llvm.org/D111574
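The core of the change is visible in the FinalGen helper in the diff below: instead of appending every leftover scalar (extra arguments and reduction values that did not make it into the vector tree) to one long serial op.rdx chain, the remainders are folded pairwise, round by round. Here is a minimal standalone sketch of that scheme (plain C++; all names are invented for illustration, this is not the LLVM code itself):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Stand-in for createOp(): the real code emits an IR binop (or min/max).
    static int reduceOp(int A, int B) { return A + B; }

    // One FinalGen-style round: combine neighbors in pairs and carry an odd
    // trailing element over to the next round unchanged.
    static std::vector<int> finalGenRound(const std::vector<int> &Vals) {
      std::vector<int> Out(Vals.size() / 2 + Vals.size() % 2);
      for (std::size_t I = 0, E = (Vals.size() / 2) * 2; I < E; I += 2)
        Out[I / 2] = reduceOp(Vals[I], Vals[I + 1]);
      if (Vals.size() % 2 == 1)
        Out.back() = Vals.back();
      return Out;
    }

    int main() {
      std::vector<int> Remainders = {1, 2, 3, 4, 5};
      while (Remainders.size() > 1)
        Remainders = finalGenRound(Remainders); // ~log2(N) rounds.
      std::cout << Remainders.front() << '\n';  // Prints 15.
    }

Each round halves the number of live values, so the leftover scalar work forms a log-depth tree and the remainders within a round stay independent of each other, which is what "avoid dependencies between the scalar remainders" in the new comment refers to.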
parent 1e14b1a797
commit 7d8060bc19
@@ -10582,7 +10582,7 @@ public:
         // Estimate cost.
         InstructionCost TreeCost = V.getTreeCost(VL);
         InstructionCost ReductionCost =
-            getReductionCost(TTI, VL[0], ReduxWidth, RdxFMF);
+            getReductionCost(TTI, VL, ReduxWidth, RdxFMF);
         InstructionCost Cost = TreeCost + ReductionCost;
         if (!Cost.isValid()) {
           LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
@@ -10659,45 +10659,78 @@ public:
     // Finish the reduction.
+    // Need to add extra arguments and not vectorized possible reduction
+    // values.
+    // Try to avoid dependencies between the scalar remainders after
+    // reductions.
+    auto &&FinalGen =
+        [this, &Builder,
+         &TrackedVals](ArrayRef<std::pair<Instruction *, Value *>> InstVals) {
+          unsigned Sz = InstVals.size();
+          SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
+                                                                   Sz % 2);
+          for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
+            Instruction *RedOp = InstVals[I + 1].first;
+            Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
+            ReductionOpsListType Ops;
+            if (auto *Sel = dyn_cast<SelectInst>(RedOp))
+              Ops.emplace_back().push_back(Sel->getCondition());
+            Ops.emplace_back().push_back(RedOp);
+            Value *RdxVal1 = InstVals[I].second;
+            Value *StableRdxVal1 = RdxVal1;
+            auto It1 = TrackedVals.find(RdxVal1);
+            if (It1 != TrackedVals.end())
+              StableRdxVal1 = It1->second;
+            Value *RdxVal2 = InstVals[I + 1].second;
+            Value *StableRdxVal2 = RdxVal2;
+            auto It2 = TrackedVals.find(RdxVal2);
+            if (It2 != TrackedVals.end())
+              StableRdxVal2 = It2->second;
+            Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
+                                       StableRdxVal2, "op.rdx", Ops);
+            ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
+          }
+          if (Sz % 2 == 1)
+            ExtraReds[Sz / 2] = InstVals.back();
+          return ExtraReds;
+        };
+    SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
     SmallPtrSet<Value *, 8> Visited;
-    for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
-      ArrayRef<Value *> Candidates = ReducedVals[I];
+    for (ArrayRef<Value *> Candidates : ReducedVals) {
       for (Value *RdxVal : Candidates) {
         if (!Visited.insert(RdxVal).second)
           continue;
         Value *StableRdxVal = RdxVal;
         auto TVIt = TrackedVals.find(RdxVal);
         if (TVIt != TrackedVals.end())
           StableRdxVal = TVIt->second;
         unsigned NumOps = VectorizedVals.lookup(RdxVal);
         for (Instruction *RedOp :
              makeArrayRef(ReducedValsToOps.find(RdxVal)->second)
-                 .drop_back(NumOps)) {
-          Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
-          ReductionOpsListType Ops;
-          if (auto *Sel = dyn_cast<SelectInst>(RedOp))
-            Ops.emplace_back().push_back(Sel->getCondition());
-          Ops.emplace_back().push_back(RedOp);
-          VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
-                                    StableRdxVal, "op.rdx", Ops);
-        }
+                 .drop_back(NumOps))
+          ExtraReductions.emplace_back(RedOp, RdxVal);
       }
     }
     for (auto &Pair : ExternallyUsedValues) {
       // Add each externally used value to the final reduction.
-      for (auto *I : Pair.second) {
-        Builder.SetCurrentDebugLocation(I->getDebugLoc());
-        ReductionOpsListType Ops;
-        if (auto *Sel = dyn_cast<SelectInst>(I))
-          Ops.emplace_back().push_back(Sel->getCondition());
-        Ops.emplace_back().push_back(I);
-        Value *StableRdxVal = Pair.first;
-        auto TVIt = TrackedVals.find(Pair.first);
-        if (TVIt != TrackedVals.end())
-          StableRdxVal = TVIt->second;
-        VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
-                                  StableRdxVal, "op.rdx", Ops);
-      }
+      for (auto *I : Pair.second)
+        ExtraReductions.emplace_back(I, Pair.first);
     }
+    // Iterate through all not-vectorized reduction values/extra arguments.
+    while (ExtraReductions.size() > 1) {
+      SmallVector<std::pair<Instruction *, Value *>> NewReds =
+          FinalGen(ExtraReductions);
+      ExtraReductions.swap(NewReds);
+    }
+    // Final reduction.
+    if (ExtraReductions.size() == 1) {
+      Instruction *RedOp = ExtraReductions.back().first;
+      Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
+      ReductionOpsListType Ops;
+      if (auto *Sel = dyn_cast<SelectInst>(RedOp))
+        Ops.emplace_back().push_back(Sel->getCondition());
+      Ops.emplace_back().push_back(RedOp);
+      Value *RdxVal = ExtraReductions.back().second;
+      Value *StableRdxVal = RdxVal;
+      auto It = TrackedVals.find(RdxVal);
+      if (It != TrackedVals.end())
+        StableRdxVal = It->second;
+      VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+                                StableRdxVal, "op.rdx", Ops);
+    }

     ReductionRoot->replaceAllUsesWith(VectorizedTree);
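A pattern repeated throughout the hunk above is the "stable value" lookup: a reduction value may have been replaced while the tree was being vectorized, so every operand of an emitted op.rdx is first remapped through TrackedVals. A hedged sketch of just that lookup (std::map and std::string are illustrative stand-ins; upstream this is a map over Value pointers):

    #include <iostream>
    #include <map>
    #include <string>

    // Illustrative stand-in for llvm::Value *.
    using Value = std::string;

    // Prefer the tracked replacement of a reduction value, falling back to
    // the value itself when nothing has replaced it.
    static Value stableValue(const std::map<Value, Value> &TrackedVals,
                             const Value &RdxVal) {
      auto It = TrackedVals.find(RdxVal);
      return It == TrackedVals.end() ? RdxVal : It->second;
    }

    int main() {
      std::map<Value, Value> TrackedVals{{"%a", "%a.repl"}};
      std::cout << stableValue(TrackedVals, "%a") << '\n'; // %a.repl
      std::cout << stableValue(TrackedVals, "%b") << '\n'; // %b
    }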
@@ -10739,12 +10772,16 @@ public:
 private:
   /// Calculate the cost of a reduction.
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
-                                   Value *FirstReducedVal, unsigned ReduxWidth,
-                                   FastMathFlags FMF) {
+                                   ArrayRef<Value *> ReducedVals,
+                                   unsigned ReduxWidth, FastMathFlags FMF) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    Value *FirstReducedVal = ReducedVals.front();
     Type *ScalarTy = FirstReducedVal->getType();
     FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
-    InstructionCost VectorCost, ScalarCost;
+    InstructionCost VectorCost = 0, ScalarCost;
+    // If all of the reduced values are constant, the vector cost is 0, since
+    // the reduction value can be calculated at the compile time.
+    bool AllConsts = all_of(ReducedVals, isConstant);
     switch (RdxKind) {
     case RecurKind::Add:
     case RecurKind::Mul:
@@ -10754,17 +10791,22 @@ private:
     case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
-      VectorCost =
-          TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+      if (!AllConsts)
+        VectorCost =
+            TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
       ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
       break;
     }
     case RecurKind::FMax:
     case RecurKind::FMin: {
       auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
-      auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
-      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
-                                               /*IsUnsigned=*/false, CostKind);
+      if (!AllConsts) {
+        auto *VecCondTy =
+            cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+        VectorCost =
+            TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+                                        /*IsUnsigned=*/false, CostKind);
+      }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
       ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
                                            SclCondTy, RdxPred, CostKind) +
@@ -10777,11 +10819,14 @@ private:
     case RecurKind::UMax:
     case RecurKind::UMin: {
       auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
-      auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
-      bool IsUnsigned =
-          RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
-      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned,
-                                               CostKind);
+      if (!AllConsts) {
+        auto *VecCondTy =
+            cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+        bool IsUnsigned =
+            RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
+        VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+                                                 IsUnsigned, CostKind);
+      }
       CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
       ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
                                            SclCondTy, RdxPred, CostKind) +

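The AllConsts guard added across the cost cases above implements one idea: if every reduced value is a constant, the whole reduction folds at compile time, so no vector cost should be charged while the scalar ops would still have to execute. A small sketch of that costing logic (all names invented; in the real getReductionCost the scalar cost is scaled by ReduxWidth - 1 outside the switch):

    #include <algorithm>
    #include <iostream>
    #include <vector>

    struct Val { bool IsConst; };

    // Returns vector-minus-scalar cost; negative favors the vector form.
    static int reductionCostDelta(const std::vector<Val> &ReducedVals,
                                  int VectorCost, int PerScalarOpCost) {
      bool AllConsts = std::all_of(ReducedVals.begin(), ReducedVals.end(),
                                   [](const Val &V) { return V.IsConst; });
      if (AllConsts)
        VectorCost = 0; // The reduction constant-folds; nothing to execute.
      int ScalarCost = PerScalarOpCost * (int)(ReducedVals.size() - 1);
      return VectorCost - ScalarCost;
    }

    int main() {
      std::vector<Val> AllConst(8, Val{true});
      std::cout << reductionCostDelta(AllConst, /*VectorCost=*/4,
                                      /*PerScalarOpCost=*/1)
                << '\n'; // Prints -7: folding the constants is a clear win.
    }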
@@ -22,11 +22,11 @@ define i32 @smaxv6() {
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
 ; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
 ; GFX9-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
-; GFX9-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
-; GFX9-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]]
+; GFX9-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP5]], i32 [[SELECT1]]
 ; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
 ; GFX9-NEXT:    store i32 [[STORE_SELECT]], i32* @var, align 8
-; GFX9-NEXT:    ret i32 [[OP_EXTRA1]]
+; GFX9-NEXT:    ret i32 [[OP_RDX1]]
 ;
 %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
 %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -63,11 +63,11 @@ define i64 @sminv6() {
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
 ; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
 ; GFX9-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]])
-; GFX9-NEXT:    [[OP_EXTRA:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
-; GFX9-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]]
+; GFX9-NEXT:    [[OP_RDX:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i64 [[TMP5]], i64 [[SELECT1]]
 ; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
 ; GFX9-NEXT:    store i64 [[STORE_SELECT]], i64* @var64, align 8
-; GFX9-NEXT:    ret i64 [[OP_EXTRA1]]
+; GFX9-NEXT:    ret i64 [[OP_RDX1]]
 ;
 %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
 %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
@@ -206,11 +206,11 @@ define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
 ; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
 ; GFX9-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
 ; GFX9-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
-; GFX9-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
-; GFX9-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]]
+; GFX9-NEXT:    [[OP_RDX:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
+; GFX9-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP3]], i32 [[SELECT1]]
 ; GFX9-NEXT:    [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
 ; GFX9-NEXT:    store i32 [[STOREVAL]], i32* @var, align 8
-; GFX9-NEXT:    ret i32 [[OP_EXTRA1]]
+; GFX9-NEXT:    ret i32 [[OP_RDX1]]
 ;
 %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
 %elt1 = extractelement <2 x i32> %vload, i32 0

@@ -17,10 +17,10 @@ define void @mainTest(i32* %ptr) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP3]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP7]], [[TMP4]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], [[TMP3]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], [[TMP2]]
-; CHECK-NEXT:    [[OP_RDX3]] = add i32 [[OP_RDX2]], 1
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT:    [[OP_RDX3]] = add i32 [[TMP7]], [[OP_RDX2]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ; CHECK:       bail_out:
 ; CHECK-NEXT:    ret void

@@ -19,8 +19,8 @@ define void @test() #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = add i64 [[TMP7]], [[TMP3]]
-; CHECK-NEXT:    [[OP_RDX1]] = add i64 [[OP_RDX]], 0
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i64 [[TMP3]], 0
+; CHECK-NEXT:    [[OP_RDX1]] = add i64 [[TMP7]], [[OP_RDX]]
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
 entry:

@@ -40,14 +40,14 @@ define void @Test(i32) {
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]])
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[TMP0]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = and i32 [[OP_RDX3]], [[TMP0]]
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX4]], i32 0
-; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP26]], i32 0
-; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP26]], i32 1
-; CHECK-NEXT:    [[TMP34:%.*]] = and <2 x i32> [[TMP31]], [[TMP33]]
-; CHECK-NEXT:    [[TMP35:%.*]] = add <2 x i32> [[TMP31]], [[TMP33]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = and i32 [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP26]]
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]]
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> [[TMP31]], i32 [[TMP26]], i32 1
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <2 x i32> <i32 poison, i32 14910>, i32 [[OP_RDX4]], i32 0
+; CHECK-NEXT:    [[TMP34:%.*]] = and <2 x i32> [[TMP32]], [[TMP33]]
+; CHECK-NEXT:    [[TMP35:%.*]] = add <2 x i32> [[TMP32]], [[TMP33]]
 ; CHECK-NEXT:    [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    br label [[LOOP]]
 ;
@@ -88,10 +88,10 @@ define void @Test(i32) {
 ; FORCE_REDUCTION-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]]
 ; FORCE_REDUCTION-NEXT:    [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]])
 ; FORCE_REDUCTION-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]]
-; FORCE_REDUCTION-NEXT:    [[OP_RDX2:%.*]] = and i32 [[OP_RDX1]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_RDX3:%.*]] = and i32 [[OP_RDX2]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_RDX4:%.*]] = and i32 [[OP_RDX3]], [[TMP0]]
-; FORCE_REDUCTION-NEXT:    [[OP_RDX5:%.*]] = and i32 [[OP_RDX4]], [[TMP26]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX2:%.*]] = and i32 [[TMP0]], [[TMP0]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP26]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]]
+; FORCE_REDUCTION-NEXT:    [[OP_RDX5:%.*]] = and i32 [[OP_RDX1]], [[OP_RDX4]]
 ; FORCE_REDUCTION-NEXT:    [[VAL_43:%.*]] = add i32 [[TMP26]], 14910
 ; FORCE_REDUCTION-NEXT:    [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX5]], i32 0
 ; FORCE_REDUCTION-NEXT:    [[TMP32]] = insertelement <2 x i32> [[TMP31]], i32 [[VAL_43]], i32 1

@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
-; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v2 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v3 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -basic-aa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=x86-64-v4 -basic-aa -slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512

 ; // PR42652
 ; unsigned long bitmask_16xi8(const char *src) {
@@ -15,39 +15,105 @@
 ; }

 define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) {
-; CHECK-LABEL: @bitmask_16xi8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
-; CHECK-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
-; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
-; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
-; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
-; CHECK-NEXT:    [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
-; CHECK-NEXT:    [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192
-; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
-; CHECK-NEXT:    [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0
-; CHECK-NEXT:    [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384
-; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
-; CHECK-NEXT:    [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
-; CHECK-NEXT:    [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
-; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_13]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_14]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR_15]]
-; CHECK-NEXT:    [[OP_RDX4:%.*]] = or i64 [[OP_RDX3]], [[OR]]
-; CHECK-NEXT:    ret i64 [[OP_RDX4]]
+; SSE-LABEL: @bitmask_16xi8(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
+; SSE-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
+; SSE-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
+; SSE-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
+; SSE-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; SSE-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
+; SSE-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
+; SSE-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
+; SSE-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
+; SSE-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
+; SSE-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
+; SSE-NEXT:    [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
+; SSE-NEXT:    [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192
+; SSE-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14
+; SSE-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
+; SSE-NEXT:    [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0
+; SSE-NEXT:    [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384
+; SSE-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
+; SSE-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
+; SSE-NEXT:    [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
+; SSE-NEXT:    [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
+; SSE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
+; SSE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]])
+; SSE-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; SSE-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]]
+; SSE-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]]
+; SSE-NEXT:    [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]]
+; SSE-NEXT:    [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]]
+; SSE-NEXT:    ret i64 [[OP_RDX4]]
+;
+; AVX-LABEL: @bitmask_16xi8(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
+; AVX-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
+; AVX-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
+; AVX-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
+; AVX-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; AVX-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
+; AVX-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
+; AVX-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
+; AVX-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
+; AVX-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
+; AVX-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1
+; AVX-NEXT:    [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0
+; AVX-NEXT:    [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192
+; AVX-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14
+; AVX-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1
+; AVX-NEXT:    [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0
+; AVX-NEXT:    [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384
+; AVX-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
+; AVX-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
+; AVX-NEXT:    [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0
+; AVX-NEXT:    [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
+; AVX-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
+; AVX-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP10]], [[TMP11]]
+; AVX-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_13]], [[OR_14]]
+; AVX-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]]
+; AVX-NEXT:    [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]]
+; AVX-NEXT:    [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]]
+; AVX-NEXT:    ret i64 [[OP_RDX4]]
+;
+; AVX512-LABEL: @bitmask_16xi8(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1
+; AVX512-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0
+; AVX512-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1
+; AVX512-NEXT:    [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer
+; AVX512-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> <i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256>
+; AVX512-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9
+; AVX512-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1
+; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer
+; AVX512-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> <i64 512, i64 1024, i64 2048, i64 4096>
+; AVX512-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13
+; AVX512-NEXT:    [[TMP7:%.*]] = load <2 x i8>, ptr [[ARRAYIDX_13]], align 1
+; AVX512-NEXT:    [[TMP8:%.*]] = icmp eq <2 x i8> [[TMP7]], zeroinitializer
+; AVX512-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i64> zeroinitializer, <2 x i64> <i64 8192, i64 16384>
+; AVX512-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15
+; AVX512-NEXT:    [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1
+; AVX512-NEXT:    [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP10]], 0
+; AVX512-NEXT:    [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768
+; AVX512-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP3]])
+; AVX512-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP6]])
+; AVX512-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP11]], [[TMP12]]
+; AVX512-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
+; AVX512-NEXT:    [[TMP14:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
+; AVX512-NEXT:    [[OP_RDX1:%.*]] = or i64 [[TMP13]], [[TMP14]]
+; AVX512-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OR_15]], [[OR]]
+; AVX512-NEXT:    [[OP_RDX3:%.*]] = or i64 [[OP_RDX1]], [[OP_RDX2]]
+; AVX512-NEXT:    [[OP_RDX4:%.*]] = or i64 [[OP_RDX]], [[OP_RDX3]]
+; AVX512-NEXT:    ret i64 [[OP_RDX4]]
 ;
 entry:
   %0 = load i8, ptr %src, align 1
@@ -132,33 +198,87 @@ entry:
 }

 define i64 @bitmask_4xi16(ptr nocapture noundef readonly %src) {
-; CHECK-LABEL: @bitmask_4xi16(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
-; CHECK-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
-; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
-; CHECK-NEXT:    [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0
-; CHECK-NEXT:    [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
-; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6
-; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
-; CHECK-NEXT:    [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0
-; CHECK-NEXT:    [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
-; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
-; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
-; CHECK-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0
-; CHECK-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_6]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_7]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
-; CHECK-NEXT:    ret i64 [[OP_RDX3]]
+; SSE-LABEL: @bitmask_4xi16(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
+; SSE-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
+; SSE-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
+; SSE-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+; SSE-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; SSE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
+; SSE-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; SSE-NEXT:    [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0
+; SSE-NEXT:    [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; SSE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6
+; SSE-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; SSE-NEXT:    [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0
+; SSE-NEXT:    [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; SSE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
+; SSE-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; SSE-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0
+; SSE-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; SSE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; SSE-NEXT:    [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
+; SSE-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
+; SSE-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; SSE-NEXT:    [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
+; SSE-NEXT:    ret i64 [[OP_RDX3]]
+;
+; AVX-LABEL: @bitmask_4xi16(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
+; AVX-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
+; AVX-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
+; AVX-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+; AVX-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
+; AVX-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; AVX-NEXT:    [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0
+; AVX-NEXT:    [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; AVX-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6
+; AVX-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; AVX-NEXT:    [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0
+; AVX-NEXT:    [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; AVX-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
+; AVX-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; AVX-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0
+; AVX-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; AVX-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
+; AVX-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
+; AVX-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX-NEXT:    [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
+; AVX-NEXT:    ret i64 [[OP_RDX3]]
+;
+; AVX512-LABEL: @bitmask_4xi16(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2
+; AVX512-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0
+; AVX512-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2
+; AVX512-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+; AVX512-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX512-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5
+; AVX512-NEXT:    [[TMP4:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_5]], align 2
+; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq <2 x i16> [[TMP4]], zeroinitializer
+; AVX512-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> <i64 32, i64 64>
+; AVX512-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7
+; AVX512-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; AVX512-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP7]], 0
+; AVX512-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; AVX512-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX512-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]]
+; AVX512-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
+; AVX512-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX512-NEXT:    [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]]
+; AVX512-NEXT:    ret i64 [[OP_RDX3]]
 ;
 entry:
   %0 = load i16, ptr %src, align 2
@@ -203,33 +323,87 @@ entry:
 }

 define i64 @bitmask_8xi32(ptr nocapture noundef readonly %src) {
-; CHECK-LABEL: @bitmask_8xi32(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
-; CHECK-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
-; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT:    [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
-; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0
-; CHECK-NEXT:    [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
-; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
-; CHECK-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0
-; CHECK-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_6]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_7]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
-; CHECK-NEXT:    ret i64 [[OP_RDX3]]
+; SSE-LABEL: @bitmask_8xi32(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; SSE-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
+; SSE-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; SSE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
+; SSE-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; SSE-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; SSE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
+; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4
+; SSE-NEXT:    [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0
+; SSE-NEXT:    [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; SSE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6
+; SSE-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4
+; SSE-NEXT:    [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0
+; SSE-NEXT:    [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; SSE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
+; SSE-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
+; SSE-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0
+; SSE-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; SSE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; SSE-NEXT:    [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
+; SSE-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
+; SSE-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; SSE-NEXT:    [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
+; SSE-NEXT:    ret i64 [[OP_RDX3]]
+;
+; AVX-LABEL: @bitmask_8xi32(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; AVX-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
+; AVX-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
+; AVX-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; AVX-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
+; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4
+; AVX-NEXT:    [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0
+; AVX-NEXT:    [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32
+; AVX-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6
+; AVX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4
+; AVX-NEXT:    [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0
+; AVX-NEXT:    [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64
+; AVX-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
+; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
+; AVX-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0
+; AVX-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; AVX-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
+; AVX-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
+; AVX-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX-NEXT:    [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
+; AVX-NEXT:    ret i64 [[OP_RDX3]]
+;
+; AVX512-LABEL: @bitmask_8xi32(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4
+; AVX512-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0
+; AVX512-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; AVX512-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX512-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5
+; AVX512-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_5]], align 4
+; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq <2 x i32> [[TMP4]], zeroinitializer
+; AVX512-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> <i64 32, i64 64>
+; AVX512-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7
+; AVX512-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4
+; AVX512-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP7]], 0
+; AVX512-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; AVX512-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX512-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]]
+; AVX512-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
+; AVX512-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX512-NEXT:    [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]]
+; AVX512-NEXT:    ret i64 [[OP_RDX3]]
 ;
 entry:
   %0 = load i32, ptr %src, align 4
@@ -338,10 +512,10 @@ define i64 @bitmask_8xi64(ptr nocapture noundef readonly %src) {
 ; SSE4-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0
 ; SSE4-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
 ; SSE4-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
-; SSE4-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
-; SSE4-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_6]]
-; SSE4-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_7]]
-; SSE4-NEXT:    [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
+; SSE4-NEXT:    [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
+; SSE4-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
+; SSE4-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; SSE4-NEXT:    [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
 ; SSE4-NEXT:    ret i64 [[OP_RDX3]]
 ;
 ; AVX-LABEL: @bitmask_8xi64(
@@ -366,12 +540,38 @@ define i64 @bitmask_8xi64(ptr nocapture noundef readonly %src) {
 ; AVX-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0
 ; AVX-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
 ; AVX-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]]
-; AVX-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OP_RDX]], [[OR_6]]
-; AVX-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX1]], [[OR_7]]
-; AVX-NEXT:    [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]]
+; AVX-NEXT:    [[OP_RDX:%.*]] = or i64 [[OR_5]], [[OR_6]]
+; AVX-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
+; AVX-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX-NEXT:    [[OP_RDX3:%.*]] = or i64 [[TMP7]], [[OP_RDX2]]
 ; AVX-NEXT:    ret i64 [[OP_RDX3]]
 ;
+; AVX512-LABEL: @bitmask_8xi64(
+; AVX512-NEXT:  entry:
+; AVX512-NEXT:    [[TMP0:%.*]] = load i64, ptr [[SRC:%.*]], align 8
+; AVX512-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ne i64 [[TMP0]], 0
+; AVX512-NEXT:    [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64
+; AVX512-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 1
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_1]], align 8
+; AVX512-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer
+; AVX512-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> <i64 2, i64 4, i64 8, i64 16>
+; AVX512-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5
+; AVX512-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_5]], align 8
+; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq <2 x i64> [[TMP4]], zeroinitializer
+; AVX512-NEXT:    [[TMP6:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> zeroinitializer, <2 x i64> <i64 32, i64 64>
+; AVX512-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7
+; AVX512-NEXT:    [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8
+; AVX512-NEXT:    [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP7]], 0
+; AVX512-NEXT:    [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128
+; AVX512-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]])
+; AVX512-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; AVX512-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; AVX512-NEXT:    [[OP_RDX:%.*]] = or i64 [[TMP9]], [[TMP10]]
+; AVX512-NEXT:    [[OP_RDX1:%.*]] = or i64 [[OR_7]], [[OR]]
+; AVX512-NEXT:    [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]]
+; AVX512-NEXT:    [[OP_RDX3:%.*]] = or i64 [[TMP8]], [[OP_RDX2]]
+; AVX512-NEXT:    ret i64 [[OP_RDX3]]
+;
 entry:
   %0 = load i64, ptr %src, align 8
   %tobool.not = icmp ne i64 %0, 0

@@ -7,22 +7,20 @@ define i32 @crash_reordering_undefs() {
 ; CHECK-NEXT:    [[OR0:%.*]] = or i64 undef, undef
 ; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]]
 ; CHECK-NEXT:    [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD1:%.*]] = add i32 undef, [[ADD0]]
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i64 undef, undef
 ; CHECK-NEXT:    [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]]
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i64 undef, undef
 ; CHECK-NEXT:    [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]]
-; CHECK-NEXT:    [[ADD6:%.*]] = add i32 [[ADD5]], undef
-; CHECK-NEXT:    [[ADD7:%.*]] = add i32 [[ADD6]], undef
-; CHECK-NEXT:    [[ADD8:%.*]] = add i32 [[ADD7]], undef
 ; CHECK-NEXT:    [[OR1:%.*]] = or i64 undef, undef
 ; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
 ; CHECK-NEXT:    [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
-; CHECK-NEXT:    [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]]
-; CHECK-NEXT:    [[ADD11:%.*]] = add i32 [[ADD10]], undef
-; CHECK-NEXT:    ret i32 [[ADD11]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; CHECK-NEXT:    [[OP_RDX:%.*]] = add i32 undef, [[ADD0]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]]
+; CHECK-NEXT:    [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]]
+; CHECK-NEXT:    [[OP_RDX4:%.*]] = add i32 [[TMP0]], [[OP_RDX3]]
+; CHECK-NEXT:    ret i32 [[OP_RDX4]]
 ;
 entry:
   %or0 = or i64 undef, undef

@@ -18,8 +18,8 @@ define float @baz() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[TMP4]], [[OP_RDX]]
 ; CHECK-NEXT:    store float [[OP_RDX1]], float* @res, align 4
 ; CHECK-NEXT:    ret float [[OP_RDX1]]
 ;
@@ -33,8 +33,8 @@ define float @baz() {
 ; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
 ; THRESHOLD-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
 ; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[SHUFFLE]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[TMP4]], [[OP_RDX]]
 ; THRESHOLD-NEXT:    store float [[OP_RDX1]], float* @res, align 4
 ; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
 ;
@@ -79,24 +79,28 @@ define float @bazz() {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
 ; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV6]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[TMP4]], [[OP_RDX]]
 ; CHECK-NEXT:    store float [[OP_RDX1]], float* @res, align 4
 ; CHECK-NEXT:    ret float [[OP_RDX1]]
 ;
 ; THRESHOLD-LABEL: @bazz(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
-; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
-; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
-; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
-; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP0]], i32 1
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP2]], <i32 3, i32 2>
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = shl nsw <2 x i32> [[TMP2]], <i32 3, i32 2>
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float>
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = fmul fast <8 x float> [[TMP8]], [[TMP7]]
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP9]])
+; THRESHOLD-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; THRESHOLD-NEXT:    [[TMP12:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP11]], [[TMP12]]
+; THRESHOLD-NEXT:    [[OP_RDX1:%.*]] = fadd fast float [[TMP10]], [[OP_RDX]]
 ; THRESHOLD-NEXT:    store float [[OP_RDX1]], float* @res, align 4
 ; THRESHOLD-NEXT:    ret float [[OP_RDX1]]
 ;
@@ -626,8 +630,8 @@ define float @loadadd31(float* nocapture readonly %x) {
 ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
 ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX1]], [[OP_RDX2]]
 ; CHECK-NEXT: ret float [[OP_RDX3]]
 ;
 ; THRESHOLD-LABEL: @loadadd31(
@@ -649,9 +653,14 @@ define float @loadadd31(float* nocapture readonly %x) {
 ; THRESHOLD-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]])
 ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
 ; THRESHOLD-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP6]]
-; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP7]]
+; THRESHOLD-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[OP_RDX]], i32 0
+; THRESHOLD-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP6]], i32 1
+; THRESHOLD-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0
+; THRESHOLD-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP7]], i32 1
+; THRESHOLD-NEXT: [[TMP15:%.*]] = fadd fast <2 x float> [[TMP12]], [[TMP14]]
+; THRESHOLD-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; THRESHOLD-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP16]], [[TMP17]]
 ; THRESHOLD-NEXT: ret float [[OP_RDX3]]
 ;
 entry:
@@ -755,9 +764,9 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
+; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP2]], [[OP_RDX1]]
 ; CHECK-NEXT: ret float [[OP_RDX2]]
 ;
 ; THRESHOLD-LABEL: @extra_args(
@@ -767,9 +776,9 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
+; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 3.000000e+00
+; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP2]], [[OP_RDX1]]
 ; THRESHOLD-NEXT: ret float [[OP_RDX2]]
 ;
 entry:
@@ -811,12 +820,11 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 5.000000e+00
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast float [[OP_RDX3]], [[CONV]]
-; CHECK-NEXT: ret float [[OP_RDX4]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float 3.000000e+00, [[CONV]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float 1.000000e+01, [[OP_RDX]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]]
+; CHECK-NEXT: ret float [[OP_RDX3]]
 ;
 ; THRESHOLD-LABEL: @extra_args_same_several_times(
 ; THRESHOLD-NEXT: entry:
@@ -825,12 +833,11 @@ define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a
 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], 5.000000e+00
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], 5.000000e+00
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], 3.000000e+00
-; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX4:%.*]] = fadd fast float [[OP_RDX3]], [[CONV]]
-; THRESHOLD-NEXT: ret float [[OP_RDX4]]
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float 3.000000e+00, [[CONV]]
+; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float 1.000000e+01, [[OP_RDX]]
+; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONV]]
+; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]]
+; THRESHOLD-NEXT: ret float [[OP_RDX3]]
 ;
 entry:
 %mul = mul nsw i32 %b, %a
@@ -874,24 +881,28 @@ define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[CONV]], [[CONV]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
+; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]]
 ; CHECK-NEXT: ret float [[OP_RDX3]]
 ;
 ; THRESHOLD-LABEL: @extra_args_no_replace(
 ; THRESHOLD-NEXT: entry:
 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
-; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X:%.*]] to <8 x float>*
 ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
 ; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP1]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[CONVC]]
-; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], 3.000000e+00
+; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 1
+; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[MUL]], i32 0
+; THRESHOLD-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x float>
+; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; THRESHOLD-NEXT: [[TMP7:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[TMP6]], i32 0
+; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast <2 x float> [[TMP5]], [[TMP7]]
+; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[TMP9]], [[TMP10]]
+; THRESHOLD-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[TMP2]], [[OP_RDX2]]
 ; THRESHOLD-NEXT: ret float [[OP_RDX3]]
 ;
 entry:
@@ -991,8 +1002,8 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add nuw i32 [[OP_RDX]], [[ARG]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = add nuw i32 [[TMP3]], [[ARG]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = add nsw i32 [[TMP6]], [[OP_RDX]]
 ; CHECK-NEXT: ret i32 [[OP_RDX2]]
 ;
 ; THRESHOLD-LABEL: @wobble(
@@ -1006,8 +1017,8 @@ define i32 @wobble(i32 %arg, i32 %bar) {
 ; THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer
 ; THRESHOLD-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
 ; THRESHOLD-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = add nsw i32 [[TMP6]], [[TMP3]]
-; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = add nuw i32 [[OP_RDX]], [[ARG]]
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = add nuw i32 [[TMP3]], [[ARG]]
+; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = add nsw i32 [[TMP6]], [[OP_RDX]]
 ; THRESHOLD-NEXT: ret i32 [[OP_RDX2]]
 ;
 bb:
@@ -873,15 +873,15 @@ define i32 @maxi8_mutiple_uses(i32) {
 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
 ; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
 ; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
-; AVX-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
-; AVX-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
-; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP4]], i32 3, i32 4
-; AVX-NEXT: store i32 [[TMP14]], i32* @var, align 8
-; AVX-NEXT: ret i32 [[OP_EXTRA1]]
+; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]]
+; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]]
+; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]]
+; AVX-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]]
+; AVX-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]]
+; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX-NEXT: store i32 [[TMP10]], i32* @var, align 8
+; AVX-NEXT: ret i32 [[OP_RDX5]]
 ;
 ; AVX2-LABEL: @maxi8_mutiple_uses(
 ; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -892,15 +892,15 @@ define i32 @maxi8_mutiple_uses(i32) {
 ; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
 ; AVX2-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
-; AVX2-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
-; AVX2-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
-; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP4]], i32 3, i32 4
-; AVX2-NEXT: store i32 [[TMP14]], i32* @var, align 8
-; AVX2-NEXT: ret i32 [[OP_EXTRA1]]
+; AVX2-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+; AVX2-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]]
+; AVX2-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]]
+; AVX2-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]]
+; AVX2-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]]
+; AVX2-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]]
+; AVX2-NEXT: [[TMP10:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX2-NEXT: store i32 [[TMP10]], i32* @var, align 8
+; AVX2-NEXT: ret i32 [[OP_RDX5]]
 ;
 ; THRESH-LABEL: @maxi8_mutiple_uses(
 ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
@@ -910,22 +910,22 @@ define i32 @maxi8_mutiple_uses(i32) {
 ; THRESH-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
 ; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
 ; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
-; THRESH-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP6]]
-; THRESH-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP6]]
-; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
-; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP3]], i32 1
-; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP4]], i32 1
-; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]]
-; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]]
-; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0
-; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1
-; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
-; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP17]], i32 [[TMP18]]
-; THRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP15]], i32 1
-; THRESH-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 3, i32 4
-; THRESH-NEXT: store i32 [[TMP20]], i32* @var, align 8
-; THRESH-NEXT: ret i32 [[OP_EXTRA1]]
+; THRESH-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0
+; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP3]], i32 1
+; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
+; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP4]], i32 1
+; THRESH-NEXT: [[TMP13:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP12]]
+; THRESH-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP13]], <2 x i32> [[TMP10]], <2 x i32> [[TMP12]]
+; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP14]], i32 0
+; THRESH-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP14]], i32 1
+; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
+; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP15]], i32 [[TMP16]]
+; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP8]], [[OP_RDX3]]
+; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP8]], i32 [[OP_RDX3]]
+; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; THRESH-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 3, i32 4
+; THRESH-NEXT: store i32 [[TMP18]], i32* @var, align 8
+; THRESH-NEXT: ret i32 [[OP_RDX5]]
 ;
 %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -1058,13 +1058,13 @@ define i32 @maxi8_wrong_parent(i32) {
 ; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
 ; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
 ; AVX-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
-; AVX-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
-; AVX-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
-; AVX-NEXT: ret i32 [[OP_EXTRA1]]
+; AVX-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]]
+; AVX-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]]
+; AVX-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]]
+; AVX-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]]
+; AVX-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]]
+; AVX-NEXT: ret i32 [[OP_RDX5]]
 ;
 ; AVX2-LABEL: @maxi8_wrong_parent(
 ; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
@@ -1077,13 +1077,13 @@ define i32 @maxi8_wrong_parent(i32) {
 ; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
 ; AVX2-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]]
-; AVX2-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]]
-; AVX2-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP13]], i32 [[TMP5]]
-; AVX2-NEXT: ret i32 [[OP_EXTRA1]]
+; AVX2-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+; AVX2-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP7]], i32 [[TMP8]]
+; AVX2-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[OP_RDX1]], [[TMP5]]
+; AVX2-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[OP_RDX1]], i32 [[TMP5]]
+; AVX2-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP9]], [[OP_RDX3]]
+; AVX2-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP9]], i32 [[OP_RDX3]]
+; AVX2-NEXT: ret i32 [[OP_RDX5]]
 ;
 ; THRESH-LABEL: @maxi8_wrong_parent(
 ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
@@ -1093,24 +1093,25 @@ define i32 @maxi8_wrong_parent(i32) {
 ; THRESH-NEXT: br label [[PP:%.*]]
 ; THRESH: pp:
 ; THRESH-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
-; THRESH-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
-; THRESH-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
-; THRESH-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
-; THRESH-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
-; THRESH-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
-; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i1> poison, i1 [[TMP12]], i32 0
-; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1
-; THRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i32 0
-; THRESH-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP3]], i32 1
-; THRESH-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
-; THRESH-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP4]], i32 1
-; THRESH-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP16]], <2 x i32> [[TMP18]]
-; THRESH-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0
-; THRESH-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1
-; THRESH-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
-; THRESH-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP20]], i32 [[TMP21]]
-; THRESH-NEXT: ret i32 [[OP_EXTRA1]]
+; THRESH-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8
+; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP6]])
+; THRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
+; THRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
+; THRESH-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i1> poison, i1 [[OP_RDX]], i32 0
+; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i1> [[TMP11]], i1 [[TMP5]], i32 1
+; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP9]], i32 0
+; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP3]], i32 1
+; THRESH-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0
+; THRESH-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP4]], i32 1
+; THRESH-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP14]], <2 x i32> [[TMP16]]
+; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0
+; THRESH-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
+; THRESH-NEXT: [[OP_RDX2:%.*]] = icmp sgt i32 [[TMP18]], [[TMP19]]
+; THRESH-NEXT: [[OP_RDX3:%.*]] = select i1 [[OP_RDX2]], i32 [[TMP18]], i32 [[TMP19]]
+; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP8]], [[OP_RDX3]]
+; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP8]], i32 [[OP_RDX3]]
+; THRESH-NEXT: ret i32 [[OP_RDX5]]
 ;
 %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
 %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
@@ -1434,8 +1435,8 @@ define void @PR49730() {
 ; AVX-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]]
 ; AVX-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
 ; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
-; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
-; AVX-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
+; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef)
+; AVX-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]])
 ; AVX-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
 ; AVX-NEXT: ret void
 ;
@@ -1444,8 +1445,8 @@ define void @PR49730() {
 ; AVX2-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]]
 ; AVX2-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
 ; AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
-; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
-; AVX2-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
+; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef)
+; AVX2-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]])
 ; AVX2-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
 ; AVX2-NEXT: ret void
 ;
@@ -1454,8 +1455,8 @@ define void @PR49730() {
 ; THRESH-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> poison, [[TMP1]]
 ; THRESH-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
 ; THRESH-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
-; THRESH-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[T12]])
-; THRESH-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 undef)
+; THRESH-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[T12]], i32 undef)
+; THRESH-NEXT: [[TMP5:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP3]], i32 [[TMP4]])
 ; THRESH-NEXT: [[T14:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 93)
 ; THRESH-NEXT: ret void
 ;
@@ -172,10 +172,10 @@ define i64 @test_3() #0 {
 ; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 [[TMP50]], [[TMP51]]
 ; CHECK-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[SHUFFLE]])
 ; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], [[TMP52]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[OP_RDX1]], [[TMP1]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[OP_RDX2]], [[TMP1]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = mul i32 [[OP_RDX3]], [[TMP1]]
-; CHECK-NEXT: [[OP_RDX5:%.*]] = mul i32 [[OP_RDX4]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = mul i32 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = mul i32 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX4:%.*]] = mul i32 [[OP_RDX2]], [[OP_RDX3]]
+; CHECK-NEXT: [[OP_RDX5:%.*]] = mul i32 [[OP_RDX1]], [[OP_RDX4]]
 ; CHECK-NEXT: [[VAL64:%.*]] = add i32 undef, [[OP_RDX5]]
 ; CHECK-NEXT: [[VAL65:%.*]] = sext i32 [[VAL64]] to i64
 ; CHECK-NEXT: ret i64 [[VAL65]]
@@ -308,8 +308,8 @@ define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) {
 ; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]]
 ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 [[TMP8]], i1 false
-; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[S2]], i1 false
+; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 [[S2]], i1 false
+; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP7]], i1 [[OP_RDX]], i1 false
 ; CHECK-NEXT: ret i1 [[OP_RDX1]]
 ;
 %x0 = extractelement <4 x i32> %x, i32 0
@@ -398,10 +398,10 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
 ; SSE-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]]
 ; SSE-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP8]])
 ; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
-; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP9]], i1 [[TMP10]], i1 false
 ; SSE-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
-; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[TMP11]], i1 false
-; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[OP_RDX1]], i1 [[C2]], i1 false
+; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false
+; SSE-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false
+; SSE-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 [[OP_RDX1]], i1 false
 ; SSE-NEXT: ret i1 [[OP_RDX2]]
 ;
 ; AVX-LABEL: @logical_and_icmp_clamp_partial(
@@ -414,9 +414,9 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
 ; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
 ; AVX-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
 ; AVX-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[C1]], i1 false
-; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C0]], i1 false
-; AVX-NEXT: [[OP_RDX2:%.*]] = select i1 [[OP_RDX1]], i1 [[C2]], i1 false
+; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[C1]], i1 [[C0]], i1 false
+; AVX-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i1 [[C2]], i1 false
+; AVX-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP6]], i1 [[OP_RDX1]], i1 false
 ; AVX-NEXT: ret i1 [[OP_RDX2]]
 ;
 %x0 = extractelement <4 x i32> %x, i32 0
@@ -22,25 +22,20 @@ define void @test() {
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[I]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[TMP2]], undef
-; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[OP_RDX3]], undef
-; CHECK-NEXT: [[I10:%.*]] = add i32 [[OP_RDX4]], undef
-; CHECK-NEXT: [[I11:%.*]] = add i32 [[OP_RDX4]], [[I10]]
+; CHECK-NEXT: [[OP_RDX6:%.*]] = add i32 [[TMP2]], undef
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[I1]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP5]], undef
-; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX1]], undef
-; CHECK-NEXT: [[I18:%.*]] = add i32 [[OP_RDX2]], [[I11]]
-; CHECK-NEXT: [[I19:%.*]] = add i32 [[OP_RDX2]], [[I18]]
-; CHECK-NEXT: [[I20:%.*]] = add i32 undef, [[I19]]
-; CHECK-NEXT: [[I21:%.*]] = add i32 undef, [[I20]]
-; CHECK-NEXT: [[I22:%.*]] = add i32 undef, [[I21]]
-; CHECK-NEXT: [[I23:%.*]] = add i32 undef, [[I22]]
+; CHECK-NEXT: [[OP_RDX5:%.*]] = add i32 [[TMP5]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[OP_RDX6]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX6]], [[OP_RDX5]]
+; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
+; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[OP_RDX5]]
+; CHECK-NEXT: [[OP_RDX4:%.*]] = add i32 [[TMP6]], [[OP_RDX3]]
 ; CHECK-NEXT: br label [[IF_END]]
 ; CHECK: if.end:
-; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[I23]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[OP_RDX4]], [[FOR_COND_PREHEADER]] ], [ undef, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -10,11 +10,9 @@ define void @_Z2azv() local_unnamed_addr {
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTSROA_CAST_4]] to <8 x i32>*
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP1]])
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP2]], undef
-; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP2]], i32 undef
-; CHECK-NEXT: [[OP_EXTRA2:%.*]] = icmp sgt i32 [[OP_EXTRA1]], undef
-; CHECK-NEXT: [[OP_EXTRA3:%.*]] = select i1 [[OP_EXTRA2]], i32 [[OP_EXTRA1]], i32 undef
-; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_EXTRA3]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = icmp sgt i32 [[TMP2]], undef
+; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP2]], i32 undef
+; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_RDX1]]
 ; CHECK-NEXT: [[CMP_I1_10:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_9]], undef
 ; CHECK-NEXT: ret void
 ;
@@ -35,14 +35,14 @@ define void @n() local_unnamed_addr #0 {
 ; CHECK-NEXT: [[TMP19:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP17]]
 ; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> [[TMP17]]
 ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP20]])
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = icmp slt i32 [[TMP21]], [[B_0]]
-; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP21]], i32 [[B_0]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP21]], [[B_0]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP21]], i32 [[B_0]]
 ; CHECK-NEXT: [[SUB_116:%.*]] = sub i32 [[TMP15]], [[TMP1]]
 ; CHECK-NEXT: [[TMP22:%.*]] = icmp slt i32 [[SUB_116]], 0
 ; CHECK-NEXT: [[NEG_117:%.*]] = sub nsw i32 0, [[SUB_116]]
 ; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[NEG_117]], i32 [[SUB_116]]
-; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 [[TMP23]], [[OP_EXTRA1]]
-; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP23]], i32 [[OP_EXTRA1]]
+; CHECK-NEXT: [[CMP12_118:%.*]] = icmp slt i32 [[TMP23]], [[OP_RDX1]]
+; CHECK-NEXT: [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP23]], i32 [[OP_RDX1]]
 ; CHECK-NEXT: [[SUB_1_1:%.*]] = sub i32 [[TMP15]], [[TMP2]]
 ; CHECK-NEXT: [[TMP24:%.*]] = icmp slt i32 [[SUB_1_1]], 0
 ; CHECK-NEXT: [[NEG_1_1:%.*]] = sub nsw i32 0, [[SUB_1_1]]