[GlobalISel] Port the udiv -> mul by constant combine.

This is a straight port from the equivalent DAG combine.

Differential Revision: https://reviews.llvm.org/D110890
Amara Emerson 2021-09-28 23:41:11 -07:00
parent e356027016
commit 8bfc0e06dc
9 changed files with 1013 additions and 1492 deletions


@@ -602,6 +602,14 @@ public:
  /// feeding a G_AND instruction \p MI.
  bool matchNarrowBinopFeedingAnd(MachineInstr &MI, BuildFnTy &MatchInfo);

  /// Given a G_UDIV \p MI expressing a divide by constant, return an
  /// expression that implements it by multiplying by a magic number.
  /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
  MachineInstr *buildUDivUsingMul(MachineInstr &MI);

  /// Combine G_UDIV by constant into a multiply by magic constant.
  bool matchUDivByConst(MachineInstr &MI);
  void applyUDivByConst(MachineInstr &MI);

  /// Try to transform \p MI by using all of the above
  /// combine functions. Returns true if changed.
  bool tryCombine(MachineInstr &MI);
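
To make the magic-number expansion concrete: the divide-by-23 tests added below use the 16-bit magic value 25645 with the NPQ fixup (subtract, shift right by one, add) and a post-shift of 4. A minimal standalone C++ sketch of that arithmetic, not LLVM code and with an invented helper name, exhaustively checks the 16-bit case:

#include <cassert>
#include <cstdint>

// Model of the emitted sequence for a 16-bit x / 23:
//   Q   = G_UMULH(x, 25645)   high half of the 16x16 multiply
//   NPQ = (x - Q) >> 1        fixup needed because the full magic overflows 16 bits
//   res = (NPQ + Q) >> 4      post-shift
static uint16_t udiv_by_23(uint16_t X) {
  uint16_t Q = (uint16_t)(((uint32_t)X * 25645u) >> 16);
  uint16_t NPQ = (uint16_t)((X - Q) >> 1);
  return (uint16_t)((NPQ + Q) >> 4);
}

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X)
    assert(udiv_by_23((uint16_t)X) == X / 23);
  return 0;
}
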


@@ -397,6 +397,11 @@ bool isBuildVectorAllOnes(const MachineInstr &MI,
Optional<RegOrConstant> getVectorSplat(const MachineInstr &MI,
                                       const MachineRegisterInfo &MRI);

/// Determines if \p MI defines a constant integer or a build vector of
/// constant integers. Treats undef values as constants.
bool isConstantOrConstantVector(MachineInstr &MI,
                                const MachineRegisterInfo &MRI);

/// Determines if \p MI defines a constant integer or a splat vector of
/// constant integers.
/// \returns the scalar constant or None.


@@ -694,6 +694,15 @@ def bitfield_extract_from_shr : GICombineRule<
def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg,
                                            bitfield_extract_from_and,
                                            bitfield_extract_from_shr]>;

def udiv_by_const : GICombineRule<
  (defs root:$root),
  (match (wip_match_opcode G_UDIV):$root,
   [{ return Helper.matchUDivByConst(*${root}); }]),
  (apply [{ Helper.applyUDivByConst(*${root}); }])>;

def intdiv_combines : GICombineGroup<[udiv_by_const]>;

def reassoc_ptradd : GICombineRule<
  (defs root:$root, build_fn_matchinfo:$matchinfo),
  (match (wip_match_opcode G_PTR_ADD):$root,

@@ -761,7 +770,8 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
    const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
    shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
    truncstore_merge, div_rem_to_divrem, funnel_shift_combines,
    form_bitfield_extract, constant_fold, fabs_fneg_fold,
    intdiv_combines]>;
// A combine group used for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and


@@ -30,6 +30,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DivisionByConstantInfo.h"
#include "llvm/Support/MathExtras.h"
#include <tuple>
@@ -4422,6 +4423,162 @@ bool CombinerHelper::matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo) {
  return true;
}

MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
  auto &UDiv = cast<GenericMachineInstr>(MI);
  Register Dst = UDiv.getReg(0);
  Register LHS = UDiv.getReg(1);
  Register RHS = UDiv.getReg(2);
  LLT Ty = MRI.getType(Dst);
  LLT ScalarTy = Ty.getScalarType();
  const unsigned EltBits = ScalarTy.getScalarSizeInBits();
  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
  LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
  auto &MIB = Builder;
  MIB.setInstrAndDebugLoc(MI);

  bool UseNPQ = false;
  SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;

  auto BuildUDIVPattern = [&](const Constant *C) {
    auto *CI = cast<ConstantInt>(C);
    const APInt &Divisor = CI->getValue();
    UnsignedDivisonByConstantInfo magics =
        UnsignedDivisonByConstantInfo::get(Divisor);
    unsigned PreShift = 0, PostShift = 0;

    // If the divisor is even, we can avoid using the expensive fixup by
    // shifting the divided value upfront.
    if (magics.IsAdd != 0 && !Divisor[0]) {
      PreShift = Divisor.countTrailingZeros();
      // Get magic number for the shifted divisor.
      magics =
          UnsignedDivisonByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
      assert(magics.IsAdd == 0 && "Should use cheap fixup now");
    }

    APInt Magic = magics.Magic;

    unsigned SelNPQ;
    if (magics.IsAdd == 0 || Divisor.isOneValue()) {
      assert(magics.ShiftAmount < Divisor.getBitWidth() &&
             "We shouldn't generate an undefined shift!");
      PostShift = magics.ShiftAmount;
      SelNPQ = false;
    } else {
      PostShift = magics.ShiftAmount - 1;
      SelNPQ = true;
    }

    PreShifts.push_back(
        MIB.buildConstant(ScalarShiftAmtTy, PreShift).getReg(0));
    MagicFactors.push_back(MIB.buildConstant(ScalarTy, Magic).getReg(0));
    NPQFactors.push_back(
        MIB.buildConstant(ScalarTy,
                          SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
                                 : APInt::getZero(EltBits))
            .getReg(0));
    PostShifts.push_back(
        MIB.buildConstant(ScalarShiftAmtTy, PostShift).getReg(0));
    UseNPQ |= SelNPQ;
    return true;
  };

  // Collect the shifts/magic values from each element.
  bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern);
  (void)Matched;
  assert(Matched && "Expected unary predicate match to succeed");

  Register PreShift, PostShift, MagicFactor, NPQFactor;
  auto *RHSDef = getOpcodeDef<GBuildVector>(RHS, MRI);
  if (RHSDef) {
    PreShift = MIB.buildBuildVector(ShiftAmtTy, PreShifts).getReg(0);
    MagicFactor = MIB.buildBuildVector(Ty, MagicFactors).getReg(0);
    NPQFactor = MIB.buildBuildVector(Ty, NPQFactors).getReg(0);
    PostShift = MIB.buildBuildVector(ShiftAmtTy, PostShifts).getReg(0);
  } else {
    assert(MRI.getType(RHS).isScalar() &&
           "Non-build_vector operation should have been a scalar");
    PreShift = PreShifts[0];
    MagicFactor = MagicFactors[0];
    PostShift = PostShifts[0];
  }

  Register Q = LHS;
  Q = MIB.buildLShr(Ty, Q, PreShift).getReg(0);

  // Multiply the numerator (operand 0) by the magic value.
  Q = MIB.buildUMulH(Ty, Q, MagicFactor).getReg(0);

  if (UseNPQ) {
    Register NPQ = MIB.buildSub(Ty, LHS, Q).getReg(0);

    // For vectors we might have a mix of non-NPQ/NPQ paths, so use
    // G_UMULH to act as a SRL-by-1 for NPQ, else multiply by zero.
    if (Ty.isVector())
      NPQ = MIB.buildUMulH(Ty, NPQ, NPQFactor).getReg(0);
    else
      NPQ = MIB.buildLShr(Ty, NPQ, MIB.buildConstant(ShiftAmtTy, 1)).getReg(0);

    Q = MIB.buildAdd(Ty, NPQ, Q).getReg(0);
  }

  Q = MIB.buildLShr(Ty, Q, PostShift).getReg(0);
  auto One = MIB.buildConstant(Ty, 1);
  auto IsOne = MIB.buildICmp(
      CmpInst::Predicate::ICMP_EQ,
      Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
  return MIB.buildSelect(Ty, IsOne, LHS, Q);
}

bool CombinerHelper::matchUDivByConst(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_UDIV);
  Register Dst = MI.getOperand(0).getReg();
  Register RHS = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(Dst);
  auto *RHSDef = MRI.getVRegDef(RHS);
  if (!isConstantOrConstantVector(*RHSDef, MRI))
    return false;

  auto &MF = *MI.getMF();
  AttributeList Attr = MF.getFunction().getAttributes();
  const auto &TLI = getTargetLowering();
  LLVMContext &Ctx = MF.getFunction().getContext();
  auto &DL = MF.getDataLayout();
  if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, DL, Ctx), Attr))
    return false;

  // Don't do this for minsize because the instruction sequence is usually
  // larger.
  if (MF.getFunction().hasMinSize())
    return false;

  // Don't do this if the types are not going to be legal.
  if (LI) {
    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
      return false;
    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_UMULH, {DstTy}}))
      return false;
    if (!isLegalOrBeforeLegalizer(
            {TargetOpcode::G_ICMP,
             {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
              DstTy}}))
      return false;
  }

  auto CheckEltValue = [&](const Constant *C) {
    if (auto *CI = dyn_cast_or_null<ConstantInt>(C))
      return !CI->isZero();
    return false;
  };
  return matchUnaryPredicate(MRI, RHS, CheckEltValue);
}

void CombinerHelper::applyUDivByConst(MachineInstr &MI) {
  auto *NewMI = buildUDivUsingMul(MI);
  replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
}
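
As a cross-check of the even-divisor path, the scalar MIR test added below divides by 42: the dividend is pre-shifted right by one (42 is even), multiplied by the magic 818089009 for the shifted divisor, and post-shifted by two, with no NPQ fixup. A standalone C++ sketch of that sequence, not LLVM code and with an invented function name:

#include <cassert>
#include <cstdint>

// Model of the emitted sequence for a 32-bit x / 42:
//   Q   = x >> 1                  G_LSHR by PreShift (divisor is even)
//   Q   = G_UMULH(Q, 818089009)   magic for the shifted divisor 21
//   res = Q >> 2                  G_LSHR by PostShift
static uint32_t udiv_by_42(uint32_t X) {
  uint32_t Q = X >> 1;
  Q = (uint32_t)(((uint64_t)Q * 818089009u) >> 32);
  return Q >> 2;
}

int main() {
  for (uint64_t X = 0; X <= 0xFFFFFFFFull; X += 12345) // spot checks
    assert(udiv_by_42((uint32_t)X) == (uint32_t)X / 42);
  assert(udiv_by_42(0xFFFFFFFFu) == 0xFFFFFFFFu / 42);
  return 0;
}
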
bool CombinerHelper::tryCombine(MachineInstr &MI) {
  if (tryCombineCopy(MI))
    return true;


@@ -1016,6 +1016,23 @@ Optional<RegOrConstant> llvm::getVectorSplat(const MachineInstr &MI,
  return RegOrConstant(Reg);
}

bool llvm::isConstantOrConstantVector(MachineInstr &MI,
                                      const MachineRegisterInfo &MRI) {
  Register Def = MI.getOperand(0).getReg();
  if (auto C = getIConstantVRegValWithLookThrough(Def, MRI))
    return true;
  GBuildVector *BV = dyn_cast<GBuildVector>(&MI);
  if (!BV)
    return false;
  for (unsigned SrcIdx = 0; SrcIdx < BV->getNumSources(); ++SrcIdx) {
    if (getIConstantVRegValWithLookThrough(BV->getSourceReg(SrcIdx), MRI) ||
        getOpcodeDef<GImplicitDef>(BV->getSourceReg(SrcIdx), MRI))
      continue;
    return false;
  }
  return true;
}

Optional<APInt>
llvm::isConstantOrConstantSplatVector(MachineInstr &MI,
                                      const MachineRegisterInfo &MRI) {


@@ -0,0 +1,287 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefixes=SDAG
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel | FileCheck %s --check-prefixes=GISEL
; These tests are taken from combine-udiv.ll in the X86 tests.
define <8 x i16> @combine_vec_udiv_uniform(<8 x i16> %x) {
; SDAG-LABEL: combine_vec_udiv_uniform:
; SDAG: // %bb.0:
; SDAG-NEXT: mov w8, #25645
; SDAG-NEXT: dup v1.8h, w8
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
; SDAG-NEXT: usra v1.8h, v0.8h, #1
; SDAG-NEXT: ushr v0.8h, v1.8h, #4
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_uniform:
; GISEL: // %bb.0:
; GISEL-NEXT: adrp x8, .LCPI0_1
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI0_1]
; GISEL-NEXT: adrp x8, .LCPI0_0
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
; GISEL-NEXT: umull2 v3.4s, v0.8h, v1.8h
; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h
; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h
; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h
; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h
; GISEL-NEXT: umull v0.4s, v0.4h, v2.4h
; GISEL-NEXT: uzp2 v0.8h, v0.8h, v3.8h
; GISEL-NEXT: add v0.8h, v0.8h, v1.8h
; GISEL-NEXT: ushr v0.8h, v0.8h, #4
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23, i16 23>
ret <8 x i16> %1
}
define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI1_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; SDAG-NEXT: adrp x8, .LCPI1_1
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
; SDAG-NEXT: adrp x8, .LCPI1_2
; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI1_2]
; SDAG-NEXT: ushl v1.8h, v0.8h, v1.8h
; SDAG-NEXT: umull2 v4.4s, v1.8h, v2.8h
; SDAG-NEXT: umull v1.4s, v1.4h, v2.4h
; SDAG-NEXT: adrp x8, .LCPI1_3
; SDAG-NEXT: uzp2 v1.8h, v1.8h, v4.8h
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI1_3]
; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
; SDAG-NEXT: umull2 v4.4s, v0.8h, v3.8h
; SDAG-NEXT: umull v0.4s, v0.4h, v3.4h
; SDAG-NEXT: uzp2 v0.8h, v0.8h, v4.8h
; SDAG-NEXT: add v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ushl v0.8h, v0.8h, v2.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform:
; GISEL: // %bb.0:
; GISEL-NEXT: adrp x8, .LCPI1_5
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI1_5]
; GISEL-NEXT: adrp x8, .LCPI1_4
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_4]
; GISEL-NEXT: adrp x8, .LCPI1_3
; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_3]
; GISEL-NEXT: adrp x8, .LCPI1_1
; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI1_1]
; GISEL-NEXT: adrp x8, .LCPI1_0
; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI1_0]
; GISEL-NEXT: adrp x8, .LCPI1_2
; GISEL-NEXT: neg v2.8h, v2.8h
; GISEL-NEXT: ldr q6, [x8, :lo12:.LCPI1_2]
; GISEL-NEXT: ushl v2.8h, v0.8h, v2.8h
; GISEL-NEXT: cmeq v1.8h, v1.8h, v5.8h
; GISEL-NEXT: umull2 v5.4s, v2.8h, v3.8h
; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h
; GISEL-NEXT: uzp2 v2.8h, v2.8h, v5.8h
; GISEL-NEXT: sub v3.8h, v0.8h, v2.8h
; GISEL-NEXT: umull2 v5.4s, v3.8h, v6.8h
; GISEL-NEXT: umull v3.4s, v3.4h, v6.4h
; GISEL-NEXT: uzp2 v3.8h, v3.8h, v5.8h
; GISEL-NEXT: neg v4.8h, v4.8h
; GISEL-NEXT: shl v1.8h, v1.8h, #15
; GISEL-NEXT: add v2.8h, v3.8h, v2.8h
; GISEL-NEXT: ushl v2.8h, v2.8h, v4.8h
; GISEL-NEXT: sshr v1.8h, v1.8h, #15
; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 23, i16 34, i16 -23, i16 56, i16 128, i16 -1, i16 -256, i16 -32768>
ret <8 x i16> %1
}
define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform2:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI2_0
; SDAG-NEXT: adrp x9, .LCPI2_1
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
; SDAG-NEXT: ldr q2, [x9, :lo12:.LCPI2_1]
; SDAG-NEXT: adrp x8, .LCPI2_2
; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI2_2]
; SDAG-NEXT: ushl v0.8h, v0.8h, v1.8h
; SDAG-NEXT: umull2 v1.4s, v0.8h, v2.8h
; SDAG-NEXT: umull v0.4s, v0.4h, v2.4h
; SDAG-NEXT: uzp2 v0.8h, v0.8h, v1.8h
; SDAG-NEXT: ushl v0.8h, v0.8h, v3.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform2:
; GISEL: // %bb.0:
; GISEL-NEXT: adrp x8, .LCPI2_4
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_4]
; GISEL-NEXT: adrp x8, .LCPI2_3
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_3]
; GISEL-NEXT: adrp x8, .LCPI2_1
; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI2_1]
; GISEL-NEXT: adrp x8, .LCPI2_0
; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI2_0]
; GISEL-NEXT: adrp x8, .LCPI2_2
; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI2_2]
; GISEL-NEXT: neg v2.8h, v2.8h
; GISEL-NEXT: ushl v2.8h, v0.8h, v2.8h
; GISEL-NEXT: cmeq v1.8h, v1.8h, v4.8h
; GISEL-NEXT: umull2 v4.4s, v2.8h, v5.8h
; GISEL-NEXT: umull v2.4s, v2.4h, v5.4h
; GISEL-NEXT: neg v3.8h, v3.8h
; GISEL-NEXT: shl v1.8h, v1.8h, #15
; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h
; GISEL-NEXT: ushl v2.8h, v2.8h, v3.8h
; GISEL-NEXT: sshr v1.8h, v1.8h, #15
; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 -34, i16 35, i16 36, i16 -37, i16 38, i16 -39, i16 40, i16 -41>
ret <8 x i16> %1
}
define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform3:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI3_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
; SDAG-NEXT: adrp x8, .LCPI3_1
; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI3_1]
; SDAG-NEXT: umull2 v2.4s, v0.8h, v1.8h
; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
; SDAG-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; SDAG-NEXT: sub v0.8h, v0.8h, v1.8h
; SDAG-NEXT: usra v1.8h, v0.8h, #1
; SDAG-NEXT: ushl v0.8h, v1.8h, v3.8h
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform3:
; GISEL: // %bb.0:
; GISEL-NEXT: adrp x8, .LCPI3_4
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_4]
; GISEL-NEXT: adrp x8, .LCPI3_3
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_3]
; GISEL-NEXT: adrp x8, .LCPI3_2
; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
; GISEL-NEXT: adrp x8, .LCPI3_1
; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI3_1]
; GISEL-NEXT: adrp x8, .LCPI3_0
; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI3_0]
; GISEL-NEXT: umull2 v6.4s, v0.8h, v2.8h
; GISEL-NEXT: umull v2.4s, v0.4h, v2.4h
; GISEL-NEXT: uzp2 v2.8h, v2.8h, v6.8h
; GISEL-NEXT: cmeq v1.8h, v1.8h, v5.8h
; GISEL-NEXT: sub v5.8h, v0.8h, v2.8h
; GISEL-NEXT: umull2 v6.4s, v5.8h, v3.8h
; GISEL-NEXT: umull v3.4s, v5.4h, v3.4h
; GISEL-NEXT: uzp2 v3.8h, v3.8h, v6.8h
; GISEL-NEXT: neg v4.8h, v4.8h
; GISEL-NEXT: shl v1.8h, v1.8h, #15
; GISEL-NEXT: add v2.8h, v3.8h, v2.8h
; GISEL-NEXT: ushl v2.8h, v2.8h, v4.8h
; GISEL-NEXT: sshr v1.8h, v1.8h, #15
; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %x, <i16 7, i16 23, i16 25, i16 27, i16 31, i16 47, i16 63, i16 127>
ret <8 x i16> %1
}
define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
; SDAG-LABEL: combine_vec_udiv_nonuniform4:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI4_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
; SDAG-NEXT: adrp x8, .LCPI4_1
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
; SDAG-NEXT: adrp x8, .LCPI4_2
; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
; SDAG-NEXT: adrp x8, .LCPI4_3
; SDAG-NEXT: ldr q4, [x8, :lo12:.LCPI4_3]
; SDAG-NEXT: umull2 v5.8h, v0.16b, v1.16b
; SDAG-NEXT: umull v1.8h, v0.8b, v1.8b
; SDAG-NEXT: uzp2 v1.16b, v1.16b, v5.16b
; SDAG-NEXT: ushl v1.16b, v1.16b, v2.16b
; SDAG-NEXT: and v1.16b, v1.16b, v3.16b
; SDAG-NEXT: and v0.16b, v0.16b, v4.16b
; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
; SDAG-NEXT: ret
;
; GISEL-LABEL: combine_vec_udiv_nonuniform4:
; GISEL: // %bb.0:
; GISEL-NEXT: adrp x8, .LCPI4_3
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_3]
; GISEL-NEXT: adrp x8, .LCPI4_0
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI4_0]
; GISEL-NEXT: adrp x8, .LCPI4_2
; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
; GISEL-NEXT: adrp x8, .LCPI4_1
; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_1]
; GISEL-NEXT: cmeq v1.16b, v1.16b, v2.16b
; GISEL-NEXT: umull2 v2.8h, v0.16b, v3.16b
; GISEL-NEXT: umull v3.8h, v0.8b, v3.8b
; GISEL-NEXT: neg v4.16b, v4.16b
; GISEL-NEXT: uzp2 v2.16b, v3.16b, v2.16b
; GISEL-NEXT: shl v1.16b, v1.16b, #7
; GISEL-NEXT: ushl v2.16b, v2.16b, v4.16b
; GISEL-NEXT: sshr v1.16b, v1.16b, #7
; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
; GISEL-NEXT: ret
%div = udiv <16 x i8> %x, <i8 -64, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %div
}
define <8 x i16> @pr38477(<8 x i16> %a0) {
; SDAG-LABEL: pr38477:
; SDAG: // %bb.0:
; SDAG-NEXT: adrp x8, .LCPI5_0
; SDAG-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
; SDAG-NEXT: adrp x8, .LCPI5_1
; SDAG-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
; SDAG-NEXT: adrp x8, .LCPI5_2
; SDAG-NEXT: umull2 v4.4s, v0.8h, v1.8h
; SDAG-NEXT: umull v1.4s, v0.4h, v1.4h
; SDAG-NEXT: uzp2 v1.8h, v1.8h, v4.8h
; SDAG-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
; SDAG-NEXT: adrp x8, .LCPI5_3
; SDAG-NEXT: sub v4.8h, v0.8h, v1.8h
; SDAG-NEXT: umull2 v5.4s, v4.8h, v2.8h
; SDAG-NEXT: umull v2.4s, v4.4h, v2.4h
; SDAG-NEXT: ldr q4, [x8, :lo12:.LCPI5_3]
; SDAG-NEXT: adrp x8, .LCPI5_4
; SDAG-NEXT: uzp2 v2.8h, v2.8h, v5.8h
; SDAG-NEXT: ldr q5, [x8, :lo12:.LCPI5_4]
; SDAG-NEXT: add v1.8h, v2.8h, v1.8h
; SDAG-NEXT: ushl v1.8h, v1.8h, v3.8h
; SDAG-NEXT: and v1.16b, v1.16b, v4.16b
; SDAG-NEXT: and v0.16b, v0.16b, v5.16b
; SDAG-NEXT: orr v0.16b, v0.16b, v1.16b
; SDAG-NEXT: ret
;
; GISEL-LABEL: pr38477:
; GISEL: // %bb.0:
; GISEL-NEXT: adrp x8, .LCPI5_4
; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_4]
; GISEL-NEXT: adrp x8, .LCPI5_3
; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI5_3]
; GISEL-NEXT: adrp x8, .LCPI5_2
; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
; GISEL-NEXT: adrp x8, .LCPI5_1
; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI5_1]
; GISEL-NEXT: adrp x8, .LCPI5_0
; GISEL-NEXT: ldr q5, [x8, :lo12:.LCPI5_0]
; GISEL-NEXT: umull2 v6.4s, v0.8h, v2.8h
; GISEL-NEXT: umull v2.4s, v0.4h, v2.4h
; GISEL-NEXT: uzp2 v2.8h, v2.8h, v6.8h
; GISEL-NEXT: cmeq v1.8h, v1.8h, v5.8h
; GISEL-NEXT: sub v5.8h, v0.8h, v2.8h
; GISEL-NEXT: umull2 v6.4s, v5.8h, v3.8h
; GISEL-NEXT: umull v3.4s, v5.4h, v3.4h
; GISEL-NEXT: uzp2 v3.8h, v3.8h, v6.8h
; GISEL-NEXT: neg v4.8h, v4.8h
; GISEL-NEXT: shl v1.8h, v1.8h, #15
; GISEL-NEXT: add v2.8h, v3.8h, v2.8h
; GISEL-NEXT: ushl v2.8h, v2.8h, v4.8h
; GISEL-NEXT: sshr v1.8h, v1.8h, #15
; GISEL-NEXT: bif v0.16b, v2.16b, v1.16b
; GISEL-NEXT: ret
%1 = udiv <8 x i16> %a0, <i16 1, i16 119, i16 73, i16 -111, i16 -3, i16 118, i16 32, i16 31>
ret <8 x i16> %1
}


@@ -0,0 +1,353 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s
---
name: udiv_by_scalar_const
body: |
bb.1:
liveins: $w0
; CHECK-LABEL: name: udiv_by_scalar_const
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 818089009
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[LSHR]], [[C1]]
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C2]](s32)
; CHECK-NEXT: $w0 = COPY [[LSHR1]](s32)
%0:_(s32) = COPY $w0
%cst:_(s32) = G_CONSTANT i32 42
%2:_(s32) = G_UDIV %0(s32), %cst(s32)
$w0 = COPY %2(s32)
...
---
name: combine_vec_udiv_uniform
alignment: 4
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: combine_vec_udiv_uniform
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]]
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]]
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>)
; CHECK-NEXT: $q0 = COPY [[LSHR]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 23
%1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %2(s16), %2(s16), %2(s16), %2(s16), %2(s16), %2(s16), %2(s16)
%3:_(<8 x s16>) = G_UDIV %0, %1
$q0 = COPY %3(<8 x s16>)
RET_ReallyLR implicit $q0
...
---
name: combine_vec_udiv_nonuniform
alignment: 4
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: combine_vec_udiv_nonuniform
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 23
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 34
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -23
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 56
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 128
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -256
; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16)
; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3855
; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 8195
; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 3
; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 512
; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32767
; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32639
; CHECK-NEXT: [[C21:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C15]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16)
; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C11]](s16), [[C13]](s16), [[C16]](s16), [[C17]](s16), [[C18]](s16), [[C20]](s16), [[C21]](s16)
; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C7]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16)
; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C10]](s16), [[C12]](s16), [[C14]](s16), [[C8]](s16), [[C8]](s16), [[C19]](s16), [[C19]](s16), [[C8]](s16)
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR1]](<8 x s16>)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR2]]
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR3]]
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR4]](<8 x s16>)
; CHECK-NEXT: [[C22:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR5]]
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR1]]
; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 23
%3:_(s16) = G_CONSTANT i16 34
%4:_(s16) = G_CONSTANT i16 -23
%5:_(s16) = G_CONSTANT i16 56
%6:_(s16) = G_CONSTANT i16 128
%7:_(s16) = G_CONSTANT i16 -1
%8:_(s16) = G_CONSTANT i16 -256
%9:_(s16) = G_CONSTANT i16 -32768
%1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16), %6(s16), %7(s16), %8(s16), %9(s16)
%10:_(<8 x s16>) = G_UDIV %0, %1
$q0 = COPY %10(<8 x s16>)
RET_ReallyLR implicit $q0
...
---
name: combine_vec_udiv_nonuniform2
alignment: 4
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: combine_vec_udiv_nonuniform2
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -34
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 35
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 36
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -37
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 38
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -39
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 40
; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 -41
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16)
; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 16393
; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 13
; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 -5617
; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 -7281
; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32749
; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 -10347
; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 8197
; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 -13107
; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32747
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16)
; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C12]](s16), [[C14]](s16), [[C15]](s16), [[C17]](s16), [[C18]](s16), [[C19]](s16), [[C20]](s16)
; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C11]](s16), [[C13]](s16), [[C13]](s16), [[C16]](s16), [[C13]](s16), [[C11]](s16), [[C13]](s16), [[C16]](s16)
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR1]](<8 x s16>)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR2]]
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR3]](<8 x s16>)
; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]]
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR1]]
; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 -34
%3:_(s16) = G_CONSTANT i16 35
%4:_(s16) = G_CONSTANT i16 36
%5:_(s16) = G_CONSTANT i16 -37
%6:_(s16) = G_CONSTANT i16 38
%7:_(s16) = G_CONSTANT i16 -39
%8:_(s16) = G_CONSTANT i16 40
%9:_(s16) = G_CONSTANT i16 -41
%1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16), %6(s16), %7(s16), %8(s16), %9(s16)
%10:_(<8 x s16>) = G_UDIV %0, %1
$q0 = COPY %10(<8 x s16>)
RET_ReallyLR implicit $q0
...
---
name: combine_vec_udiv_nonuniform3
alignment: 4
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: combine_vec_udiv_nonuniform3
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 7
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 23
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 27
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 31
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 47
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 63
; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 127
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16)
; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363
; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645
; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 18351
; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 12137
; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 23705
; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 5
; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041
; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 517
; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 6
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C11]](s16), [[C13]](s16), [[C14]](s16), [[C15]](s16), [[C16]](s16), [[C18]](s16), [[C19]](s16)
; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16), [[C9]](s16)
; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C10]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C17]](s16), [[C17]](s16), [[C20]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]]
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR2]]
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR3]](<8 x s16>)
; CHECK-NEXT: [[C21:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16), [[C21]](s16)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]]
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR]]
; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 7
%3:_(s16) = G_CONSTANT i16 23
%4:_(s16) = G_CONSTANT i16 25
%5:_(s16) = G_CONSTANT i16 27
%6:_(s16) = G_CONSTANT i16 31
%7:_(s16) = G_CONSTANT i16 47
%8:_(s16) = G_CONSTANT i16 63
%9:_(s16) = G_CONSTANT i16 127
%1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16), %6(s16), %7(s16), %8(s16), %9(s16)
%10:_(<8 x s16>) = G_UDIV %0, %1
$q0 = COPY %10(<8 x s16>)
RET_ReallyLR implicit $q0
...
---
name: combine_vec_udiv_nonuniform4
alignment: 4
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: combine_vec_udiv_nonuniform4
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 -64
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 -85
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s8) = G_CONSTANT i8 7
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C3]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8)
; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C4]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<16 x s8>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]]
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<16 x s8>) = G_LSHR [[UMULH]], [[BUILD_VECTOR2]](<16 x s8>)
; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<16 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<16 x s8>), [[BUILD_VECTOR3]]
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<16 x s8>) = G_SELECT [[ICMP]](<16 x s1>), [[COPY]], [[LSHR]]
; CHECK-NEXT: $q0 = COPY [[SELECT]](<16 x s8>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<16 x s8>) = COPY $q0
%2:_(s8) = G_CONSTANT i8 -64
%3:_(s8) = G_CONSTANT i8 1
%1:_(<16 x s8>) = G_BUILD_VECTOR %2(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8), %3(s8)
%4:_(<16 x s8>) = G_UDIV %0, %1
$q0 = COPY %4(<16 x s8>)
RET_ReallyLR implicit $q0
...
---
name: pr38477
alignment: 4
tracksRegLiveness: true
liveins:
- { reg: '$q0' }
body: |
bb.1:
liveins: $q0
; CHECK-LABEL: name: pr38477
; CHECK: liveins: $q0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 119
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 73
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -111
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 118
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 32
; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 31
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16)
; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 4957
; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768
; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 6
; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 -8079
; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 4103
; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 12
; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 16385
; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 14
; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 -29991
; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 2048
; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115
; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C9]](s16), [[C12]](s16), [[C13]](s16), [[C15]](s16), [[C17]](s16), [[C18]](s16), [[C19]](s16)
; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C10]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C10]](s16)
; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C11]](s16), [[C11]](s16), [[C14]](s16), [[C16]](s16), [[C11]](s16), [[C8]](s16), [[C20]](s16)
; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]]
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]]
; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR2]]
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]]
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR3]](<8 x s16>)
; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]]
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR]]
; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<8 x s16>) = COPY $q0
%2:_(s16) = G_CONSTANT i16 1
%3:_(s16) = G_CONSTANT i16 119
%4:_(s16) = G_CONSTANT i16 73
%5:_(s16) = G_CONSTANT i16 -111
%6:_(s16) = G_CONSTANT i16 -3
%7:_(s16) = G_CONSTANT i16 118
%8:_(s16) = G_CONSTANT i16 32
%9:_(s16) = G_CONSTANT i16 31
%1:_(<8 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16), %6(s16), %7(s16), %8(s16), %9(s16)
%10:_(<8 x s16>) = G_UDIV %0, %1
$q0 = COPY %10(<8 x s16>)
RET_ReallyLR implicit $q0
...


@@ -222,117 +222,21 @@ define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
; CHECK-LABEL: v_udiv_i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_movk_i32 s6, 0x1000
; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x45800000
; CHECK-NEXT: v_mov_b32_e32 v2, 0xfffff000
; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1
; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v1
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; CHECK-NEXT: v_subrev_i32_e64 v2, s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v1
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; CHECK-NEXT: v_mov_b32_e32 v1, 0x100000
; CHECK-NEXT: v_mul_hi_u32 v0, v0, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv i32 %num, 4096
ret i32 %result
}
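
The new checks above collapse the divide by 4096 to a v_mul_hi_u32 with the constant 0x100000; whatever intermediate shifts the combine builds fold away, since the high half of x * 2^20 is exactly x >> 12. A quick standalone check (not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // umulh(x, 1 << 20) == x >> 12 == x / 4096 for any 32-bit x.
  for (uint64_t X = 0; X <= 0xFFFFFFFFull; X += 7919) {
    uint32_t Q = (uint32_t)(((uint64_t)(uint32_t)X * 0x100000u) >> 32);
    assert(Q == (uint32_t)X / 4096);
  }
  return 0;
}
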
define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
; GISEL-LABEL: v_udiv_v2i32_pow2k_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s4, 0x1000
; GISEL-NEXT: v_mov_b32_e32 v2, 0x1000
; GISEL-NEXT: v_mov_b32_e32 v3, 0xfffff000
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s4
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4
; GISEL-NEXT: v_mul_lo_u32 v3, v3, v5
; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v3, v5, v3
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3
; GISEL-NEXT: v_lshlrev_b32_e32 v5, 12, v4
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
; GISEL-NEXT: v_lshlrev_b32_e32 v7, 12, v3
; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v3
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GISEL-NEXT: v_subrev_i32_e64 v5, s[4:5], s4, v0
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v4
; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i32_pow2k_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: s_movk_i32 s8, 0x1000
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000
; CGP-NEXT: s_movk_i32 s4, 0xf000
; CGP-NEXT: v_mov_b32_e32 v3, 0xfffff000
; CGP-NEXT: v_mov_b32_e32 v4, 0x1000
; CGP-NEXT: v_rcp_iflag_f32_e32 v5, 0x45800000
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
; CGP-NEXT: v_mul_lo_u32 v6, s4, v2
; CGP-NEXT: v_mul_lo_u32 v3, v3, v5
; CGP-NEXT: v_mul_hi_u32 v6, v2, v6
; CGP-NEXT: v_mul_hi_u32 v3, v5, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
; CGP-NEXT: v_mul_hi_u32 v3, v1, v3
; CGP-NEXT: v_lshlrev_b32_e32 v5, 12, v2
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2
; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v3
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; CGP-NEXT: v_subrev_i32_e64 v5, s[4:5], s8, v0
; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v1
; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v4
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v3
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 0x100000
; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4
; CHECK-NEXT: v_mul_hi_u32 v1, v1, s4
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv <2 x i32> %num, <i32 4096, i32 4096>
ret <2 x i32> %result
}
@@ -341,25 +245,12 @@ define i32 @v_udiv_i32_oddk_denom(i32 %num) {
; CHECK-LABEL: v_udiv_i32_oddk_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s6, 0x12d8fb
; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, 0x4996c7d8
; CHECK-NEXT: v_mov_b32_e32 v2, 0xffed2705
; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1
; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_mov_b32_e32 v1, 0xb2a50881
; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
; CHECK-NEXT: v_mul_lo_u32 v2, v1, s6
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; CHECK-NEXT: v_subrev_i32_e64 v2, s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v1
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = udiv i32 %num, 1235195
ret i32 %result
@@ -369,87 +260,34 @@ define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
; GISEL-LABEL: v_udiv_v2i32_oddk_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb
; GISEL-NEXT: v_mov_b32_e32 v2, 0x12d8fb
; GISEL-NEXT: v_mov_b32_e32 v3, 0xffed2705
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, s8
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4
; GISEL-NEXT: v_mul_lo_u32 v3, v3, v5
; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v3, v5, v3
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3
; GISEL-NEXT: v_mul_lo_u32 v5, v4, s8
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
; GISEL-NEXT: v_mul_lo_u32 v7, v3, v2
; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v3
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GISEL-NEXT: v_subrev_i32_e64 v5, s[4:5], s8, v0
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v4
; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v3
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
; GISEL-NEXT: s_mov_b32 s4, 0xb2a50881
; GISEL-NEXT: s_brev_b32 s5, 1
; GISEL-NEXT: v_mul_hi_u32 v2, v0, s4
; GISEL-NEXT: v_mul_hi_u32 v3, v1, s4
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_mul_hi_u32 v0, v0, s5
; GISEL-NEXT: v_mul_hi_u32 v1, v1, s5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; GISEL-NEXT: v_lshrrev_b32_e32 v1, 20, v1
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i32_oddk_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: s_mov_b32 s8, 0x12d8fb
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8
; CGP-NEXT: s_mov_b32 s4, 0xffed2705
; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, 0x4996c7d8
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
; CGP-NEXT: v_mul_lo_u32 v5, s4, v2
; CGP-NEXT: v_mul_lo_u32 v6, s4, v4
; CGP-NEXT: v_mul_hi_u32 v5, v2, v5
; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
; CGP-NEXT: v_mul_lo_u32 v5, v2, s8
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2
; CGP-NEXT: v_mul_lo_u32 v7, v4, s8
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; CGP-NEXT: v_subrev_i32_e64 v5, s[4:5], s8, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e64 v6, s[6:7], v1, v3
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc
; CGP-NEXT: s_mov_b32 s4, 0xb2a50881
; CGP-NEXT: v_mul_hi_u32 v2, v0, s4
; CGP-NEXT: v_mul_hi_u32 v3, v1, s4
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; CGP-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; CGP-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; CGP-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; CGP-NEXT: v_lshrrev_b32_e32 v1, 20, v1
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = udiv <2 x i32> %num, <i32 1235195, i32 1235195>
ret <2 x i32> %result

File diff suppressed because it is too large.