[RISCV] Use MULHU for more division by constant cases.

D113805 improved handling of i32 divu/remu on RV64. The basic idea
from that can be extended to (mul (and X, C2), C1) where C2 is any
mask constant.

We can replace the and with an SLLI by shifting by the number of
leading zeros in C2 if we also shift C1 left by XLen - lzcnt(C2)
bits. This will give the full product XLen additional trailing zeros,
putting the result in the output of MULHU. If we can't use ANDI,
ZEXT.H, or ZEXT.W, this will avoid materializing C2 in a register.
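
As a quick sanity check of that identity, here is a minimal C++ sketch
using the udiv-by-5 constants that show up in the tests below. The
sample values and the __uint128_t widening are illustrative only, not
part of the patch:

    // Verify ((x & C2) * C1) == MULHU(x << lz(C2), C1 << (64 - lz(C2)))
    // when the full product fits in 64 bits (XLen = 64 here).
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t C2 = 0xFFFF;      // mask constant
      const uint64_t C1 = 0xCCCD;      // multiplier (udiv-by-5 magic)
      const unsigned Lz = 48;          // leading zeros of C2 in 64 bits
      for (uint64_t X : {uint64_t(0), uint64_t(1), uint64_t(12345),
                         uint64_t(0xFFFFFFFF)}) {
        uint64_t Ref = (X & C2) * C1;  // original and+mul
        __uint128_t P = (__uint128_t)(X << Lz) * (C1 << (64 - Lz));
        assert((uint64_t)(P >> 64) == Ref);  // what MULHU returns
      }
    }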

The downside is that it may take 1 additional instruction to
materialize C1. But since that's not on the critical path, it can
hopefully be interleaved with other operations.

The previous tablegen pattern is replaced by custom isel code.

Reviewed By: asb

Differential Revision: https://reviews.llvm.org/D115310
Craig Topper 2021-12-09 08:53:46 -08:00
parent 04b2f6ea8a
commit 6f7de819b9
6 changed files with 83 additions and 38 deletions


@@ -718,6 +718,71 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
    break;
  }
  case ISD::MUL: {
    // Special case for calculating (mul (and X, C2), C1) where the full
    // product fits in XLen bits. We can shift X left by the number of leading
    // zeros in C2 and shift C1 left by XLen-lzcnt(C2). This will ensure the
    // final product has XLen trailing zeros, putting it in the output of
    // MULHU. This can avoid materializing a constant in a register for C2.

    // RHS should be a constant.
    auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
    if (!N1C || !N1C->hasOneUse())
      break;

    // LHS should be an AND with constant.
    SDValue N0 = Node->getOperand(0);
    if (N0.getOpcode() != ISD::AND || !isa<ConstantSDNode>(N0.getOperand(1)))
      break;

    uint64_t C2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();

    // Constant should be a mask.
    if (!isMask_64(C2))
      break;

    // This should be the only use of the AND unless we will use
    // (SRLI (SLLI X, 32), 32). We don't use a shift pair for other AND
    // constants.
    if (!N0.hasOneUse() && C2 != UINT64_C(0xFFFFFFFF))
      break;

    // If this can be an ANDI, ZEXT.H or ZEXT.W we don't need to do this
    // optimization.
    if (isInt<12>(C2) ||
        (C2 == UINT64_C(0xFFFF) &&
         (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())) ||
        (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba()))
      break;

    // We need to shift left the AND input and C1 by a total of XLen bits.

    // How far left do we need to shift the AND input?
    unsigned XLen = Subtarget->getXLen();
    unsigned LeadingZeros = XLen - (64 - countLeadingZeros(C2));

    // The constant gets shifted by the remaining amount unless that would
    // shift bits out.
    uint64_t C1 = N1C->getZExtValue();
    unsigned ConstantShift = XLen - LeadingZeros;
    if (ConstantShift > (XLen - (64 - countLeadingZeros(C1))))
      break;

    uint64_t ShiftedC1 = C1 << ConstantShift;
    // If this is RV32, we need to sign extend the constant.
    if (XLen == 32)
      ShiftedC1 = SignExtend64(ShiftedC1, 32);

    // Create (mulhu (slli X, lzcnt(C2)), C1 << (XLen - lzcnt(C2))).
    SDNode *Imm = selectImm(CurDAG, DL, ShiftedC1, *Subtarget);
    SDNode *SLLI =
        CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0),
                               CurDAG->getTargetConstant(LeadingZeros, DL, VT));
    SDNode *MULHU = CurDAG->getMachineNode(RISCV::MULHU, DL, VT,
                                           SDValue(SLLI, 0), SDValue(Imm, 0));
    ReplaceNode(Node, MULHU);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntNo = Node->getConstantOperandVal(0);
    switch (IntNo) {
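
Outside of the LLVM APIs, the shift-amount arithmetic above reduces to
the following sketch. clz64 is a hypothetical stand-in for
countLeadingZeros on a nonzero value; only the math is modeled here:

    #include <cassert>
    #include <cstdint>

    // Leading zeros of a nonzero 64-bit value.
    static unsigned clz64(uint64_t V) { return __builtin_clzll(V); }

    // Compute the SLLI amount for X and the shift applied to C1; the
    // transform is rejected if shifting C1 would drop set bits.
    static bool computeShifts(unsigned XLen, uint64_t C2, uint64_t C1,
                              unsigned &LeadingZeros,
                              unsigned &ConstantShift) {
      LeadingZeros = XLen - (64 - clz64(C2)); // lzcnt of C2 within XLen
      ConstantShift = XLen - LeadingZeros;    // shifts must sum to XLen
      return ConstantShift <= XLen - (64 - clz64(C1));
    }

    int main() {
      unsigned Lz, Cs;
      // C2 = 0xFFFF, C1 = 0xCCCD on RV64: X shifts by 48, C1 by 16.
      assert(computeShifts(64, 0xFFFF, 0xCCCD, Lz, Cs));
      assert(Lz == 48 && Cs == 16);
    }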


@@ -96,14 +96,6 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))),
          (REMW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtM, IsRV64]

// Pattern to detect constants with no more than 32 active bits that can't
// be materialized with lui+addiw.
def uimm32_not_simm32 : PatLeaf<(XLenVT GPR:$a), [{
  auto *C = dyn_cast<ConstantSDNode>(N);
  return C && C->hasOneUse() && isUInt<32>(C->getZExtValue()) &&
         !isInt<32>(C->getSExtValue());
}]>;

let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in {
// Special case for calculating the full 64-bit product of a 32x32 unsigned
// multiply where the inputs aren't known to be zero extended. We can shift the
@@ -111,9 +103,4 @@ let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in {
// zeroing the upper 32 bits.
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))),
          (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;

// The RHS could also be a constant that is hard to materialize. By shifting
// left we can allow constant materialization to use LUI+ADDIW via
// hasAllWUsers.
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), uimm32_not_simm32:$rs2)),
          (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
} // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba]
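
To see why the pattern kept above is sound, here is a small C++ sketch
with MULHU modeled through __uint128_t; the sample inputs are
arbitrary:

    #include <cassert>
    #include <cstdint>

    // MULHU: upper 64 bits of the full 128-bit unsigned product.
    static uint64_t mulhu(uint64_t A, uint64_t B) {
      return (uint64_t)(((__uint128_t)A * B) >> 64);
    }

    int main() {
      uint64_t Rs1 = 0xDEADBEEFCAFEF00D, Rs2 = 0x123456789ABCDEF0;
      // (a << 32) * (b << 32) == (a mod 2^32) * (b mod 2^32) << 64, so
      // MULHU recovers the zero-extended 32x32 product directly.
      uint64_t Ref = (Rs1 & 0xFFFFFFFF) * (Rs2 & 0xFFFFFFFF);
      assert(mulhu(Rs1 << 32, Rs2 << 32) == Ref);
    }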


@@ -498,12 +498,9 @@ define i16 @udiv16_constant(i16 %a) nounwind {
;
; RV32IM-LABEL: udiv16_constant:
; RV32IM: # %bb.0:
; RV32IM-NEXT: lui a1, 16
; RV32IM-NEXT: addi a1, a1, -1
; RV32IM-NEXT: and a0, a0, a1
; RV32IM-NEXT: lui a1, 13
; RV32IM-NEXT: addi a1, a1, -819
; RV32IM-NEXT: mul a0, a0, a1
; RV32IM-NEXT: slli a0, a0, 16
; RV32IM-NEXT: lui a1, 838864
; RV32IM-NEXT: mulhu a0, a0, a1
; RV32IM-NEXT: srli a0, a0, 18
; RV32IM-NEXT: ret
;
@@ -522,12 +519,10 @@ define i16 @udiv16_constant(i16 %a) nounwind {
;
; RV64IM-LABEL: udiv16_constant:
; RV64IM: # %bb.0:
; RV64IM-NEXT: lui a1, 16
; RV64IM-NEXT: addiw a1, a1, -1
; RV64IM-NEXT: and a0, a0, a1
; RV64IM-NEXT: lui a1, 13
; RV64IM-NEXT: addiw a1, a1, -819
; RV64IM-NEXT: mul a0, a0, a1
; RV64IM-NEXT: lui a1, 52429
; RV64IM-NEXT: slli a1, a1, 4
; RV64IM-NEXT: slli a0, a0, 48
; RV64IM-NEXT: mulhu a0, a0, a1
; RV64IM-NEXT: srli a0, a0, 18
; RV64IM-NEXT: ret
  %1 = udiv i16 %a, 5
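
The multiplier here is 0xCCCD = ceil(2^18 / 5), so the sequence
computes (x * 0xCCCD) >> 18. A brute-force C++ check over every i16
value (a sketch written for this note, not from the test file):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Magic = 0xCCCD;            // ceil(2^18 / 5)
      for (uint32_t X = 0; X <= 0xFFFF; ++X)
        assert(((X * Magic) >> 18) == X / 5);   // matches the srli by 18
    }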


@@ -21,12 +21,10 @@ define signext i32 @wobble() nounwind {
; CHECK-NEXT: lui a2, %hi(global.1)
; CHECK-NEXT: sw a0, %lo(global.1)(a2)
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: lui a1, 16
; CHECK-NEXT: addiw a1, a1, -1
; CHECK-NEXT: and a1, a0, a1
; CHECK-NEXT: lui a2, 13
; CHECK-NEXT: addiw a2, a2, -819
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: slli a1, a0, 48
; CHECK-NEXT: lui a2, 52429
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: mulhu a1, a1, a2
; CHECK-NEXT: srli a1, a1, 18
; CHECK-NEXT: lui a2, %hi(global.3)
; CHECK-NEXT: li a3, 5


@@ -48,10 +48,10 @@ define i32 @fold_urem_positive_odd(i32 %x) nounwind {
; RV64IM-LABEL: fold_urem_positive_odd:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
; RV64IM-NEXT: srli a1, a1, 32
; RV64IM-NEXT: lui a2, 364242
; RV64IM-NEXT: addiw a2, a2, 777
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 32
; RV64IM-NEXT: subw a2, a0, a1
; RV64IM-NEXT: srliw a2, a2, 1
@@ -179,10 +179,10 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
; RV64IM-LABEL: combine_urem_udiv:
; RV64IM: # %bb.0:
; RV64IM-NEXT: slli a1, a0, 32
; RV64IM-NEXT: srli a1, a1, 32
; RV64IM-NEXT: lui a2, 364242
; RV64IM-NEXT: addiw a2, a2, 777
; RV64IM-NEXT: mul a1, a1, a2
; RV64IM-NEXT: slli a2, a2, 32
; RV64IM-NEXT: mulhu a1, a1, a2
; RV64IM-NEXT: srli a1, a1, 32
; RV64IM-NEXT: subw a2, a0, a1
; RV64IM-NEXT: srliw a2, a2, 1
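
The tail of this sequence is cut off by the hunk; what follows
recombines quotient and remainder as x - (x / d) * d. A minimal C++
sketch of that identity, with a divisor of 95 assumed purely for
illustration:

    #include <cassert>
    #include <cstdint>

    // Once the quotient exists (here via the MULHU sequence above),
    // the remainder falls out of one mul and one sub.
    static uint32_t urem_via_udiv(uint32_t X, uint32_t D) {
      uint32_t Q = X / D;   // stands in for the mulhu-based quotient
      return X - Q * D;
    }

    int main() {
      for (uint32_t X : {0u, 1u, 94u, 95u, 12345u, 0xFFFFFFFFu})
        assert(urem_via_udiv(X, 95) == X % 95);
    }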


@@ -1201,10 +1201,10 @@ define zeroext i1 @umulo2.i32(i32 %v1, i32* %res) {
;
; RV64-LABEL: umulo2.i32:
; RV64: # %bb.0: # %entry
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: li a2, 13
; RV64-NEXT: mul a2, a0, a2
; RV64-NEXT: slli a2, a2, 32
; RV64-NEXT: slli a0, a0, 32
; RV64-NEXT: mulhu a2, a0, a2
; RV64-NEXT: srli a0, a2, 32
; RV64-NEXT: snez a0, a0
; RV64-NEXT: sw a2, 0(a1)
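
The overflow flag in this last test comes straight from the upper half
of the 64-bit product. A small C++ model of the lowering, with the
constant 13 taken from the test; the function name and shape are an
assumed rendering of llvm.umul.with.overflow.i32, not patch code:

    #include <cassert>
    #include <cstdint>

    // Product formed in 64 bits (via the shifted MULHU above); overflow
    // is "any bit set in the upper 32", i.e. snez on the srli by 32.
    static bool umulo2_i32(uint32_t V1, uint32_t *Res) {
      uint64_t P = (uint64_t)V1 * 13;
      *Res = (uint32_t)P;
      return (P >> 32) != 0;
    }

    int main() {
      uint32_t R;
      assert(!umulo2_i32(1, &R) && R == 13);
      assert(umulo2_i32(0xFFFFFFFF, &R));  // 13 * (2^32 - 1) overflows
    }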