[GlobalISel] Add sdiv exact (X, constant) -> mul combine.

This port of the SDAG optimization covers only the exact sdiv case.

Differential Revision: https://reviews.llvm.org/D130517
This commit is contained in:
Amara Emerson 2022-07-25 14:21:40 -07:00
parent ebd0249fcf
commit 4cf3db41da
5 changed files with 251 additions and 3 deletions

View File

@@ -647,6 +647,13 @@ public:
bool matchUDivByConst(MachineInstr &MI);
void applyUDivByConst(MachineInstr &MI);
/// Given a G_SDIV \p MI expressing a signed divide by constant, return an
/// expression that implements it by multiplying by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
MachineInstr *buildSDivUsingMul(MachineInstr &MI);
/// Return true if \p MI is a G_SDIV by constant that we can profitably
/// rewrite as a multiply (currently only the 'exact' sdiv case).
bool matchSDivByConst(MachineInstr &MI);
/// Rewrite the matched G_SDIV using buildSDivUsingMul and replace its uses.
void applySDivByConst(MachineInstr &MI);
// G_UMULH x, (1 << c)) -> x >> (bitwidth - c)
bool matchUMulHToLShr(MachineInstr &MI);
void applyUMulHToLShr(MachineInstr &MI);

View File

@@ -764,7 +764,13 @@ def udiv_by_const : GICombineRule<
[{ return Helper.matchUDivByConst(*${root}); }]),
(apply [{ Helper.applyUDivByConst(*${root}); }])>;
// Transform a signed divide by a non-zero constant into a multiply,
// matched/applied by CombinerHelper::{match,apply}SDivByConst.
def sdiv_by_const : GICombineRule<
  (defs root:$root),
  (match (wip_match_opcode G_SDIV):$root,
   [{ return Helper.matchSDivByConst(*${root}); }]),
  (apply [{ Helper.applySDivByConst(*${root}); }])>;

// Group of all integer-division-by-constant combines. Note: only ONE
// definition of this group may exist; the previous udiv-only group is
// superseded by this one (a duplicate 'def intdiv_combines' would be a
// TableGen record redefinition error).
def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const]>;
def reassoc_ptradd : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),

View File

@@ -4935,6 +4935,108 @@ void CombinerHelper::applyUDivByConst(MachineInstr &MI) {
replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
}
bool CombinerHelper::matchSDivByConst(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");

  MachineFunction &MF = *MI.getMF();
  const Function &F = MF.getFunction();

  // Don't do this for minsize because the instruction sequence is usually
  // larger than a single divide.
  if (F.hasMinSize())
    return false;

  // If the target says integer division is cheap, leave the G_SDIV alone.
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLVMContext &Ctx = F.getContext();
  const DataLayout &DL = MF.getDataLayout();
  if (getTargetLowering().isIntDivCheap(getApproximateEVTForLLT(DstTy, DL, Ctx),
                                        F.getAttributes()))
    return false;

  // Only the 'exact' sdiv has a lowering for now; it needs every divisor
  // element to be a non-zero constant. The general (non-exact) case is not
  // supported yet.
  if (!MI.getFlag(MachineInstr::MIFlag::IsExact))
    return false;

  Register Divisor = MI.getOperand(2).getReg();
  return matchUnaryPredicate(
      MRI, Divisor, [](const Constant *C) { return C && !C->isZeroValue(); });
}
void CombinerHelper::applySDivByConst(MachineInstr &MI) {
  // Emit the mul-based expansion, then redirect all users of the original
  // G_SDIV result to the expansion's result register.
  MachineInstr *Expanded = buildSDivUsingMul(MI);
  replaceSingleDefInstWithReg(MI, Expanded->getOperand(0).getReg());
}
/// Expand an exact G_SDIV by constant into shift + multiply:
///   x sdiv (d * 2^k)  -->  (x ashr exact k) * (multiplicative inverse of d)
/// Handles scalars and vectors of per-lane constants; only called after
/// matchSDivByConst verified every divisor element is a non-zero constant.
MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
  auto &SDiv = cast<GenericMachineInstr>(MI);
  Register Dst = SDiv.getReg(0);
  Register LHS = SDiv.getReg(1);
  Register RHS = SDiv.getReg(2);
  LLT Ty = MRI.getType(Dst);
  LLT ScalarTy = Ty.getScalarType();
  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
  LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
  auto &MIB = Builder;
  MIB.setInstrAndDebugLoc(MI);

  // Whether any lane had a power-of-two factor, forcing an initial ashr.
  bool UseSRA = false;
  // Per-lane shift amounts and multiplicative-inverse factors.
  SmallVector<Register, 16> Shifts, Factors;

  auto *RHSDef = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
  bool IsSplat = getIConstantSplatVal(*RHSDef, MRI).hasValue();

  auto BuildSDIVPattern = [&](const Constant *C) {
    // Don't recompute inverses for each splat element.
    if (IsSplat && !Factors.empty()) {
      Shifts.push_back(Shifts[0]);
      Factors.push_back(Factors[0]);
      return true;
    }

    auto *CI = cast<ConstantInt>(C);
    APInt Divisor = CI->getValue();
    // Split off the power-of-two factor: divisor = d * 2^Shift, d odd.
    unsigned Shift = Divisor.countTrailingZeros();
    if (Shift) {
      Divisor.ashrInPlace(Shift);
      UseSRA = true;
    }

    // Calculate the multiplicative inverse modulo BW.
    // 2^W requires W + 1 bits, so we have to extend and then truncate.
    unsigned W = Divisor.getBitWidth();
    APInt Factor = Divisor.zext(W + 1)
                       .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
                       .trunc(W);
    Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
    Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
    return true;
  };

  // Collect all magic values from the build vector.
  bool Matched = matchUnaryPredicate(MRI, RHS, BuildSDIVPattern);
  (void)Matched;
  assert(Matched && "Expected unary predicate match to succeed");

  // Assemble the per-lane constants back into vectors if needed.
  Register Shift, Factor;
  if (Ty.isVector()) {
    Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
    Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
  } else {
    Shift = Shifts[0];
    Factor = Factors[0];
  }

  Register Res = LHS;
  // The shift is exact because the original sdiv was exact.
  if (UseSRA)
    Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);

  return MIB.buildMul(Ty, Res, Factor);
}
bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) {
assert(MI.getOpcode() == TargetOpcode::G_UMULH);
Register RHS = MI.getOperand(2).getReg();

View File

@@ -1077,7 +1077,7 @@ Optional<APInt> llvm::getIConstantSplatVal(const Register Reg,
return None;
}
/// MachineInstr overload: forwards to the Register-based overload using the
/// instruction's def register. Must be defined with the llvm:: qualifier so
/// it defines the declared llvm-namespace function rather than introducing a
/// new file-local overload (the stale unqualified signature line is removed).
Optional<APInt> llvm::getIConstantSplatVal(const MachineInstr &MI,
                                           const MachineRegisterInfo &MRI) {
  return getIConstantSplatVal(MI.getOperand(0).getReg(), MRI);
}

View File

@@ -0,0 +1,133 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s
--- |
  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
  define void @sdiv_exact() { ret void }
  define void @sdiv_noexact() { ret void }
  define void @sdiv_exact_minsize() #0 { ret void }
  define void @div_v4s32() { ret void }
  define void @div_v4s32_splat() { ret void }
  attributes #0 = { minsize }
...
---
# Scalar exact sdiv by 104 (= 8 * 13): combined to an exact ashr by 3
# followed by a multiply with the inverse of 13 mod 2^32 (-991146299).
name:            sdiv_exact
body:             |
  bb.1:
    liveins: $w0
    ; CHECK-LABEL: name: sdiv_exact
    ; CHECK: liveins: $w0
    ; CHECK-NEXT: {{  $}}
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = exact G_ASHR [[COPY]], [[C]](s32)
    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[ASHR]], [[C1]]
    ; CHECK-NEXT: $w0 = COPY [[MUL]](s32)
    ; CHECK-NEXT: RET_ReallyLR implicit $w0
    %0:_(s32) = COPY $w0
    %1:_(s32) = G_CONSTANT i32 104
    %2:_(s32) = exact G_SDIV %0, %1
    $w0 = COPY %2(s32)
    RET_ReallyLR implicit $w0
...
---
# Without the 'exact' flag the combine must not fire: G_SDIV is kept.
name:            sdiv_noexact
body:             |
  bb.1:
    liveins: $w0
    ; CHECK-LABEL: name: sdiv_noexact
    ; CHECK: liveins: $w0
    ; CHECK-NEXT: {{  $}}
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 104
    ; CHECK-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[COPY]], [[C]]
    ; CHECK-NEXT: $w0 = COPY [[SDIV]](s32)
    ; CHECK-NEXT: RET_ReallyLR implicit $w0
    %0:_(s32) = COPY $w0
    %1:_(s32) = G_CONSTANT i32 104
    %2:_(s32) = G_SDIV %0, %1
    $w0 = COPY %2(s32)
    RET_ReallyLR implicit $w0
...
---
# With the minsize attribute (#0 above) the combine must not fire even for
# an exact sdiv, since the expansion is usually larger.
name:            sdiv_exact_minsize
body:             |
  bb.1:
    liveins: $w0
    ; CHECK-LABEL: name: sdiv_exact_minsize
    ; CHECK: liveins: $w0
    ; CHECK-NEXT: {{  $}}
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 104
    ; CHECK-NEXT: [[SDIV:%[0-9]+]]:_(s32) = exact G_SDIV [[COPY]], [[C]]
    ; CHECK-NEXT: $w0 = COPY [[SDIV]](s32)
    ; CHECK-NEXT: RET_ReallyLR implicit $w0
    %0:_(s32) = COPY $w0
    %1:_(s32) = G_CONSTANT i32 104
    %2:_(s32) = exact G_SDIV %0, %1
    $w0 = COPY %2(s32)
    RET_ReallyLR implicit $w0
...
---
# Vector divisor with two distinct constants (104 and 72): each lane gets its
# own inverse factor (-991146299 for 13, 954437177 for 9); both lanes share
# the trailing-zero count of 3.
name:            div_v4s32
body:             |
  bb.1:
    liveins: $q0
    ; CHECK-LABEL: name: div_v4s32
    ; CHECK: liveins: $q0
    ; CHECK-NEXT: {{  $}}
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 954437177
    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32), [[C1]](s32), [[C2]](s32)
    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<4 x s32>) = exact G_ASHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[ASHR]], [[BUILD_VECTOR1]]
    ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>)
    ; CHECK-NEXT: RET_ReallyLR implicit $q0
    %0:_(<4 x s32>) = COPY $q0
    %c1:_(s32) = G_CONSTANT i32 104
    %c2:_(s32) = G_CONSTANT i32 72
    %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c2(s32), %c1(s32), %c2(s32)
    %3:_(<4 x s32>) = exact G_SDIV %0, %1
    $q0 = COPY %3(<4 x s32>)
    RET_ReallyLR implicit $q0
...
---
# Splat vector divisor: the inverse is computed once and reused for every
# lane (single factor constant broadcast in the build_vector).
name:            div_v4s32_splat
body:             |
  bb.1:
    liveins: $q0
    ; CHECK-LABEL: name: div_v4s32_splat
    ; CHECK: liveins: $q0
    ; CHECK-NEXT: {{  $}}
    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
    ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(<4 x s32>) = exact G_ASHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[ASHR]], [[BUILD_VECTOR1]]
    ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>)
    ; CHECK-NEXT: RET_ReallyLR implicit $q0
    %0:_(<4 x s32>) = COPY $q0
    %c1:_(s32) = G_CONSTANT i32 104
    %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c1(s32), %c1(s32), %c1(s32)
    %3:_(<4 x s32>) = exact G_SDIV %0, %1
    $q0 = COPY %3(<4 x s32>)
    RET_ReallyLR implicit $q0
...