forked from OSchip/llvm-project
R600: Match 24-bit arithmetic patterns in a Target DAGCombine
Moving these patterns from TableGen files to PerformDAGCombine() should allow us to generate better code by eliminating unnecessary shifts and extensions earlier. This also fixes a bug where the MAD pattern was calling SimplifyDemandedBits with a 24-bit mask on the first operand even when the full pattern wasn't being matched. This occasionally resulted in some instructions being incorrectly deleted from the program. v2: - Fix bug with 64-bit mul llvm-svn: 205731
This commit is contained in:
parent
3cbe014027
commit
50122a5890
|
@ -59,9 +59,6 @@ private:
|
|||
bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
|
||||
bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
|
||||
bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
|
||||
SDValue SimplifyI24(SDValue &Op);
|
||||
bool SelectI24(SDValue Addr, SDValue &Op);
|
||||
bool SelectU24(SDValue Addr, SDValue &Op);
|
||||
|
||||
static bool checkType(const Value *ptr, unsigned int addrspace);
|
||||
|
||||
|
@ -600,49 +597,6 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
|
|||
return true;
|
||||
}
|
||||
|
||||
/// Repeatedly simplify \p Op under the assumption that only its low 24 bits
/// (of a 32-bit value) are demanded, and return the final simplified value.
///
/// Each successful simplification replaces all uses of the old value with the
/// new one, repositions the new node, and then retries on the replacement.
///
/// NOTE(review): per the commit description this helper mutates the DAG while
/// a pattern is still being matched, so it can rewrite an operand even when
/// the enclosing 24-bit pattern is ultimately not selected.
SDValue AMDGPUDAGToDAGISel::SimplifyI24(SDValue &Op) {
  const APInt Low24Mask(32, 0x00FFFFFF);
  APInt KnownZero, KnownOne;
  TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true);
  const TargetLowering *Lowering = getTargetLowering();

  // Nothing left to simplify: hand the value back unchanged.
  if (!Lowering->SimplifyDemandedBits(Op, Low24Mask, KnownZero, KnownOne, TLO))
    return Op;

  CurDAG->ReplaceAllUsesWith(Op, TLO.New);
  CurDAG->RepositionNode(Op.getNode(), TLO.New.getNode());

  // The replacement may itself be simplifiable, so try again on it.
  return SimplifyI24(TLO.New);
}
|
||||
|
||||
/// Complex-pattern selector for a signed 24-bit operand.
///
/// \param Op  the i32 candidate operand.
/// \param I24 receives the simplified operand when the match succeeds.
/// \returns true when \p Op is reported to have exactly 9 sign bits.
bool AMDGPUDAGToDAGISel::SelectI24(SDValue Op, SDValue &I24) {
  assert(Op.getValueType() == MVT::i32);

  if (CurDAG->ComputeNumSignBits(Op) != 9)
    return false;

  I24 = SimplifyI24(Op);
  return true;
}
|
||||
|
||||
/// Complex-pattern selector for an unsigned 24-bit operand.
///
/// \param Op  the i32 candidate operand.
/// \param U24 receives the simplified operand when the match succeeds.
/// \returns true when the top 8 bits of \p Op are known to be zero, or when
///          \p Op is an ANY_EXTEND or an extending load.
bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) {
  APInt KnownZero, KnownOne;
  CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne);

  assert (Op.getValueType() == MVT::i32);

  const bool TopByteKnownZero =
      (KnownZero & APInt(KnownZero.getBitWidth(), 0xFF000000)) == 0xFF000000;

  // ANY_EXTEND and EXTLOAD can only originate from types narrower than i32,
  // and those narrower types are legal inputs to the 24-bit instructions.
  if (!TopByteKnownZero &&
      Op.getOpcode() != ISD::ANY_EXTEND &&
      !ISD::isEXTLoad(Op.getNode()))
    return false;

  U24 = SimplifyI24(Op);
  return true;
}
|
||||
|
||||
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
|
||||
const AMDGPUTargetLowering& Lowering =
|
||||
(*(const AMDGPUTargetLowering*)getTargetLowering());
|
||||
|
|
|
@ -227,6 +227,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
|
|||
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
|
||||
|
||||
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
|
||||
|
||||
setTargetDAGCombine(ISD::MUL);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -1107,6 +1109,86 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
|
|||
return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Custom DAG optimizations
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// Return true when \p Op fits in an unsigned 24-bit value, i.e. when every
/// bit above the low 24 is known to be zero.
static bool isU24(SDValue Op, SelectionDAG &DAG) {
  APInt KnownZero, KnownOne;
  DAG.ComputeMaskedBits(Op, KnownZero, KnownOne);

  // Bits that are not part of the run of known-zero leading bits.
  const unsigned TypeBits = Op.getValueType().getSizeInBits();
  const unsigned PossiblyNonZeroBits = TypeBits - KnownZero.countLeadingOnes();
  return PossiblyNonZeroBits <= 24;
}
|
||||
|
||||
/// Return true when \p Op fits in a signed 24-bit value, i.e. when bit 23 is
/// already a sign bit.
static bool isI24(SDValue Op, SelectionDAG &DAG) {
  const unsigned TypeBits = Op.getValueType().getSizeInBits();

  // Types narrower than 24 bits are rejected here so that they are treated
  // as unsigned 24-bit values instead.
  if (TypeBits < 24)
    return false;

  // Whatever is left after removing the known sign bits must fit below
  // bit 24.
  return (TypeBits - DAG.ComputeNumSignBits(Op)) < 24;
}
|
||||
|
||||
/// Simplify \p Op given that only its low 24 bits are demanded, committing
/// the result through \p DCI when a simplification is found.
static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();

  // Demand the low 24 bits at the operand's own width.
  const unsigned TypeBits = Op.getValueType().getSizeInBits();
  const APInt Low24Mask = APInt::getLowBitsSet(TypeBits, 24);

  APInt KnownZero, KnownOne;
  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
  if (!Lowering.SimplifyDemandedBits(Op, Low24Mask, KnownZero, KnownOne, TLO))
    return;

  DCI.CommitTargetLoweringOpt(TLO);
}
|
||||
|
||||
/// Target DAG combines shared by the AMDGPU targets.
///
/// - ISD::MUL on a scalar type of at most 32 bits becomes MUL_U24 / MUL_I24
///   when the subtarget has the instruction and both operands pass the
///   matching 24-bit check. The operands are widened or truncated to i32 and
///   the product is converted back to the original type.
/// - MUL_I24 / MUL_U24 operands are simplified knowing that only their low
///   24 bits are demanded.
///
/// \returns the replacement value, or an empty SDValue when nothing changed.
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  const unsigned Opcode = N->getOpcode();

  if (Opcode == AMDGPUISD::MUL_I24 || Opcode == AMDGPUISD::MUL_U24) {
    // Read both operands before simplifying either one.
    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);
    simplifyI24(LHS, DCI);
    simplifyI24(RHS, DCI);
    return SDValue();
  }

  if (Opcode != ISD::MUL)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // FIXME: Add support for 24-bit multiply with 64-bit output on SI.
  if (VT.isVector() || VT.getSizeInBits() > 32)
    return SDValue();

  SDValue Mul;
  if (Subtarget->hasMulU24() && isU24(LHS, DAG) && isU24(RHS, DAG)) {
    LHS = DAG.getZExtOrTrunc(LHS, DL, MVT::i32);
    RHS = DAG.getZExtOrTrunc(RHS, DL, MVT::i32);
    Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, LHS, RHS);
  } else if (Subtarget->hasMulI24() && isI24(LHS, DAG) && isI24(RHS, DAG)) {
    LHS = DAG.getSExtOrTrunc(LHS, DL, MVT::i32);
    RHS = DAG.getSExtOrTrunc(RHS, DL, MVT::i32);
    Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, LHS, RHS);
  } else {
    // Neither 24-bit form applies; leave the multiply alone.
    return SDValue();
  }

  // The 24-bit multiply always produces i32; convert back to the type of
  // the original node.
  return DAG.getSExtOrTrunc(Mul, DL, VT);
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Helper functions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -1203,6 +1285,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
NODE_NAME_CASE(BFE_I32)
|
||||
NODE_NAME_CASE(BFI)
|
||||
NODE_NAME_CASE(BFM)
|
||||
NODE_NAME_CASE(MUL_U24)
|
||||
NODE_NAME_CASE(MUL_I24)
|
||||
NODE_NAME_CASE(URECIP)
|
||||
NODE_NAME_CASE(DOT4)
|
||||
NODE_NAME_CASE(EXPORT)
|
||||
|
|
|
@ -140,6 +140,8 @@ public:
|
|||
/// We don't want to shrink f64/f32 constants.
|
||||
bool ShouldShrinkFPConstant(EVT VT) const;
|
||||
|
||||
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
|
||||
private:
|
||||
void InitAMDILLowering();
|
||||
SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
@ -188,6 +190,8 @@ enum {
|
|||
BFE_I32, // Extract range of bits with sign extension to 32-bits.
|
||||
BFI, // (src0 & src1) | (~src0 & src2)
|
||||
BFM, // Insert a range of bits into a 32-bit word.
|
||||
MUL_U24,
|
||||
MUL_I24,
|
||||
TEXTURE_FETCH,
|
||||
EXPORT,
|
||||
CONST_ADDRESS,
|
||||
|
|
|
@ -92,3 +92,11 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
|
|||
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
|
||||
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
|
||||
|
||||
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored when
|
||||
// performing the multiply. The result is a 32-bit value.
|
||||
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
|
||||
[SDNPCommutative]
|
||||
>;
|
||||
def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
|
||||
[SDNPCommutative]
|
||||
>;
|
||||
|
|
|
@ -253,9 +253,6 @@ def FP_ONE : PatLeaf <
|
|||
[{return N->isExactlyValue(1.0);}]
|
||||
>;
|
||||
|
||||
def U24 : ComplexPattern<i32, 1, "SelectU24", [], []>;
|
||||
def I24 : ComplexPattern<i32, 1, "SelectI24", [], []>;
|
||||
|
||||
let isCodeGenOnly = 1, isPseudo = 1 in {
|
||||
|
||||
let usesCustomInserter = 1 in {
|
||||
|
|
|
@ -77,6 +77,15 @@ public:
|
|||
return hasBFE();
|
||||
}
|
||||
|
||||
bool hasMulU24() const {
|
||||
return (getGeneration() >= EVERGREEN);
|
||||
}
|
||||
|
||||
bool hasMulI24() const {
|
||||
return (getGeneration() >= SOUTHERN_ISLANDS ||
|
||||
hasCaymanISA());
|
||||
}
|
||||
|
||||
bool IsIRStructurizerEnabled() const;
|
||||
bool isIfCvtEnabled() const;
|
||||
unsigned getWavefrontSize() const;
|
||||
|
|
|
@ -21,10 +21,10 @@ def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
|
|||
let Predicates = [isCayman] in {
|
||||
|
||||
def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
|
||||
[(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU
|
||||
[(set i32:$dst, (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2))], VecALU
|
||||
>;
|
||||
def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
|
||||
[(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU
|
||||
[(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU
|
||||
>;
|
||||
|
||||
let isVector = 1 in {
|
||||
|
|
|
@ -294,7 +294,7 @@ def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
|
|||
>;
|
||||
|
||||
def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
|
||||
[(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU
|
||||
[(set i32:$dst, (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2))], VecALU
|
||||
>;
|
||||
def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
|
||||
def : ROTRPattern <BIT_ALIGN_INT_eg>;
|
||||
|
@ -309,7 +309,7 @@ def CNDGE_eg : CNDGE_Common<0x1B>;
|
|||
def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
|
||||
def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
|
||||
def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
|
||||
[(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU
|
||||
[(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU
|
||||
>;
|
||||
def DOT4_eg : DOT4_Common<0xBE>;
|
||||
defm CUBE_eg : CUBE_Common<0xC0>;
|
||||
|
|
|
@ -1526,6 +1526,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
|
|||
SelectionDAG &DAG = DCI.DAG;
|
||||
|
||||
switch (N->getOpcode()) {
|
||||
default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
|
||||
// (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
|
||||
case ISD::FP_ROUND: {
|
||||
SDValue Arg = N->getOperand(0);
|
||||
|
|
|
@ -963,7 +963,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
|
|||
EVT VT = N->getValueType(0);
|
||||
|
||||
switch (N->getOpcode()) {
|
||||
default: break;
|
||||
default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
|
||||
case ISD::SELECT_CC: {
|
||||
ConstantSDNode *True, *False;
|
||||
// i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
|
||||
|
|
|
@ -946,11 +946,11 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
|
|||
|
||||
|
||||
defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24",
|
||||
[(set i32:$dst, (mul I24:$src0, I24:$src1))]
|
||||
[(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))]
|
||||
>;
|
||||
//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
|
||||
defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24",
|
||||
[(set i32:$dst, (mul U24:$src0, U24:$src1))]
|
||||
[(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))]
|
||||
>;
|
||||
//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
|
||||
|
||||
|
@ -1046,10 +1046,10 @@ let neverHasSideEffects = 1 in {
|
|||
def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
|
||||
def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
|
||||
def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
|
||||
[(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))]
|
||||
[(set i32:$dst, (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2))]
|
||||
>;
|
||||
def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
|
||||
[(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))]
|
||||
[(set i32:$dst, (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2))]
|
||||
>;
|
||||
|
||||
} // End neverHasSideEffects
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
|
||||
|
||||
; EG-CHECK: @i32_mad24
|
||||
; FUNC-LABEL: @i32_mad24
|
||||
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
|
||||
; EG-CHECK: MULLO_INT
|
||||
; CM-CHECK: MULADD_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
|
||||
; SI-CHECK: V_MAD_I32_I24
|
||||
; EG: MULLO_INT
|
||||
; Make sure we aren't masking the inputs.
|
||||
; CM-NOT: AND
|
||||
; CM: MULADD_INT24
|
||||
; SI-NOT: AND
|
||||
; SI: V_MAD_I32_I24
|
||||
define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
|
||||
entry:
|
||||
%0 = shl i32 %a, 8
|
||||
|
|
|
@ -1,11 +1,10 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
|
||||
|
||||
; EG-CHECK-LABEL: @u32_mad24
|
||||
; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
|
||||
; SI-CHECK-LABEL: @u32_mad24
|
||||
; SI-CHECK: V_MAD_U32_U24
|
||||
; FUNC-LABEL: @u32_mad24
|
||||
; EG: MULADD_UINT24
|
||||
; SI: V_MAD_U32_U24
|
||||
|
||||
define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
|
||||
entry:
|
||||
|
@ -19,18 +18,14 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; EG-CHECK-LABEL: @i16_mad24
|
||||
; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
|
||||
; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
|
||||
; EG-CHECK-DAG: VTX_READ_16 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
|
||||
; FUNC-LABEL: @i16_mad24
|
||||
; The order of A and B does not matter.
|
||||
; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
|
||||
; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
|
||||
; The result must be sign-extended
|
||||
; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
|
||||
; EG-CHECK: 16
|
||||
; SI-CHECK-LABEL: @i16_mad24
|
||||
; SI-CHECK: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
|
||||
; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
|
||||
; EG: 16
|
||||
; SI: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; SI: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16
|
||||
|
||||
define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
|
||||
entry:
|
||||
|
@ -41,18 +36,13 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; EG-CHECK-LABEL: @i8_mad24
|
||||
; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
|
||||
; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
|
||||
; EG-CHECK-DAG: VTX_READ_8 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
|
||||
; The order of A and B does not matter.
|
||||
; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]]
|
||||
; FUNC-LABEL: @i8_mad24
|
||||
; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]]
|
||||
; The result must be sign-extended
|
||||
; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
|
||||
; EG-CHECK: 8
|
||||
; SI-CHECK-LABEL: @i8_mad24
|
||||
; SI-CHECK: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
|
||||
; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
|
||||
; EG: 8
|
||||
; SI: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8
|
||||
|
||||
define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
|
||||
entry:
|
||||
|
@ -62,3 +52,24 @@ entry:
|
|||
store i32 %2, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; This tests for a bug where the mad_u24 pattern matcher would call
|
||||
; SimplifyDemandedBits on the first operand of the mul instruction
|
||||
; assuming that the pattern would be matched to a 24-bit mad. This
|
||||
; led to some instructions being incorrectly erased when the entire
|
||||
; 24-bit mad pattern wasn't being matched.
|
||||
|
||||
; Check that the select instruction is not deleted.
|
||||
; FUNC-LABEL: @i24_i32_i32_mad
|
||||
; EG: CNDE_INT
|
||||
; SI: V_CNDMASK
|
||||
define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
|
||||
entry:
|
||||
%0 = ashr i32 %a, 8
|
||||
%1 = icmp ne i32 %c, 0
|
||||
%2 = select i1 %1, i32 %0, i32 34
|
||||
%3 = mul i32 %2, %c
|
||||
%4 = add i32 %3, %d
|
||||
store i32 %4, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
|
||||
|
||||
; EG-CHECK: @i32_mul24
|
||||
; FUNC-LABEL: @i32_mul24
|
||||
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
|
||||
; EG-CHECK: MULLO_INT
|
||||
; CM-CHECK: MUL_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W
|
||||
; SI-CHECK: V_MUL_I32_I24
|
||||
; EG: MULLO_INT
|
||||
; Make sure we are not masking the inputs
|
||||
; CM-NOT: AND
|
||||
; CM: MUL_INT24
|
||||
; SI-NOT: AND
|
||||
; SI: V_MUL_I32_I24
|
||||
define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||
entry:
|
||||
%0 = shl i32 %a, 8
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
|
||||
|
||||
; FIXME: Move this test into mul_uint24.ll once i64 mul is supported.
|
||||
; XFAIL: *
|
||||
|
||||
; Multiply with 24-bit inputs and 64-bit output
|
||||
; FUNC-LABEL: @mul24_i64
|
||||
; EG: MUL_UINT24
|
||||
; EG: MULHI
|
||||
; SI: V_MUL_U32_U24
|
||||
; FIXME: SI support 24-bit mulhi
|
||||
; SI: V_MUL_HI_U32
|
||||
define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
|
||||
entry:
|
||||
%0 = shl i64 %a, 40
|
||||
%a_24 = lshr i64 %0, 40
|
||||
%1 = shl i64 %b, 40
|
||||
%b_24 = lshr i64 %1, 40
|
||||
%2 = mul i64 %a_24, %b_24
|
||||
store i64 %2, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
|
@ -1,11 +1,10 @@
|
|||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
|
||||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
|
||||
; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
|
||||
|
||||
; EG-CHECK-LABEL: @u32_mul24
|
||||
; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
|
||||
; SI-CHECK-LABEL: @u32_mul24
|
||||
; SI-CHECK: V_MUL_U32_U24
|
||||
; FUNC-LABEL: @u32_mul24
|
||||
; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
|
||||
; SI: V_MUL_U32_U24
|
||||
|
||||
define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||
entry:
|
||||
|
@ -18,17 +17,13 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; EG-CHECK-LABEL: @i16_mul24
|
||||
; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
|
||||
; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
|
||||
; The order of A and B does not matter.
|
||||
; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
|
||||
; FUNC-LABEL: @i16_mul24
|
||||
; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
|
||||
; The result must be sign-extended
|
||||
; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
|
||||
; EG-CHECK: 16
|
||||
; SI-CHECK-LABEL: @i16_mul24
|
||||
; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
|
||||
; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
|
||||
; EG: 16
|
||||
; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16,
|
||||
define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
|
||||
entry:
|
||||
%0 = mul i16 %a, %b
|
||||
|
@ -37,16 +32,12 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; EG-CHECK-LABEL: @i8_mul24
|
||||
; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
|
||||
; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
|
||||
; The order of A and B does not matter.
|
||||
; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]]
|
||||
; FUNC-LABEL: @i8_mul24
|
||||
; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
|
||||
; The result must be sign-extended
|
||||
; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
|
||||
; SI-CHECK-LABEL: @i8_mul24
|
||||
; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
|
||||
; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
|
||||
; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||
; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8,
|
||||
|
||||
define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
|
||||
entry:
|
||||
|
|
Loading…
Reference in New Issue