From 50122a58906e36184fd2a0f95944fbca6c6b08f0 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 7 Apr 2014 19:45:41 +0000 Subject: [PATCH] R600: Match 24-bit arithmetic patterns in a Target DAGCombine Moving these patterns from TableGen files to PerformDAGCombine() should allow us to generate better code by eliminating unnecessary shifts and extensions earlier. This also fixes a bug where the MAD pattern was calling SimplifyDemandedBits with a 24-bit mask on the first operand even when the full pattern wasn't being matched. This occasionally resulted in some instructions being incorrectly deleted from the program. v2: - Fix bug with 64-bit mul llvm-svn: 205731 --- llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 46 ---------- llvm/lib/Target/R600/AMDGPUISelLowering.cpp | 84 +++++++++++++++++++ llvm/lib/Target/R600/AMDGPUISelLowering.h | 4 + llvm/lib/Target/R600/AMDGPUInstrInfo.td | 8 ++ llvm/lib/Target/R600/AMDGPUInstructions.td | 3 - llvm/lib/Target/R600/AMDGPUSubtarget.h | 9 ++ llvm/lib/Target/R600/CaymanInstructions.td | 4 +- llvm/lib/Target/R600/EvergreenInstructions.td | 4 +- llvm/lib/Target/R600/R600ISelLowering.cpp | 1 + llvm/lib/Target/R600/SIISelLowering.cpp | 2 +- llvm/lib/Target/R600/SIInstructions.td | 8 +- llvm/test/CodeGen/R600/mad_int24.ll | 17 ++-- llvm/test/CodeGen/R600/mad_uint24.ll | 67 ++++++++------- llvm/test/CodeGen/R600/mul_int24.ll | 17 ++-- llvm/test/CodeGen/R600/mul_uint24-i64.ll | 24 ++++++ llvm/test/CodeGen/R600/mul_uint24.ll | 43 ++++------ 16 files changed, 215 insertions(+), 126 deletions(-) create mode 100644 llvm/test/CodeGen/R600/mul_uint24-i64.ll diff --git a/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index 9a15d4d34faa..f0ff0f9e146f 100644 --- a/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -59,9 +59,6 @@ private: bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); 
bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); - SDValue SimplifyI24(SDValue &Op); - bool SelectI24(SDValue Addr, SDValue &Op); - bool SelectU24(SDValue Addr, SDValue &Op); static bool checkType(const Value *ptr, unsigned int addrspace); @@ -600,49 +597,6 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } -SDValue AMDGPUDAGToDAGISel::SimplifyI24(SDValue &Op) { - APInt Demanded = APInt(32, 0x00FFFFFF); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true); - const TargetLowering *TLI = getTargetLowering(); - if (TLI->SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) { - CurDAG->ReplaceAllUsesWith(Op, TLO.New); - CurDAG->RepositionNode(Op.getNode(), TLO.New.getNode()); - return SimplifyI24(TLO.New); - } else { - return Op; - } -} - -bool AMDGPUDAGToDAGISel::SelectI24(SDValue Op, SDValue &I24) { - - assert(Op.getValueType() == MVT::i32); - - if (CurDAG->ComputeNumSignBits(Op) == 9) { - I24 = SimplifyI24(Op); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) { - APInt KnownZero; - APInt KnownOne; - CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne); - - assert (Op.getValueType() == MVT::i32); - - // ANY_EXTEND and EXTLOAD operations can only be done on types smaller than - // i32. These smaller types are legal to use with the i24 instructions. 
- if ((KnownZero & APInt(KnownZero.getBitWidth(), 0xFF000000)) == 0xFF000000 || - Op.getOpcode() == ISD::ANY_EXTEND || - ISD::isEXTLoad(Op.getNode())) { - U24 = SimplifyI24(Op); - return true; - } - return false; -} - void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = (*(const AMDGPUTargetLowering*)getTargetLowering()); diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp index 0c8850030eb5..1fed068c9851 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp @@ -227,6 +227,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); + + setTargetDAGCombine(ISD::MUL); } //===----------------------------------------------------------------------===// @@ -1107,6 +1109,86 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG); } +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +static bool isU24(SDValue Op, SelectionDAG &DAG) { + APInt KnownZero, KnownOne; + EVT VT = Op.getValueType(); + DAG.ComputeMaskedBits(Op, KnownZero, KnownOne); + + return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; +} + +static bool isI24(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. + return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated + // as unsigned 24-bit values. 
+ (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; +} + +static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { + + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = Op.getValueType(); + + APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, true, true); + if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); +} + +SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + switch(N->getOpcode()) { + default: break; + case ISD::MUL: { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue Mul; + + // FIXME: Add support for 24-bit multiply with 64-bit output on SI. + if (VT.isVector() || VT.getSizeInBits() > 32) + break; + + if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { + N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); + } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { + N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); + N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); + Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); + } else { + break; + } + + SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT); + + return Reg; + } + case AMDGPUISD::MUL_I24: + case AMDGPUISD::MUL_U24: { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + simplifyI24(N0, DCI); + simplifyI24(N1, DCI); + return SDValue(); + } + } + return SDValue(); +} + //===----------------------------------------------------------------------===// // Helper functions //===----------------------------------------------------------------------===// @@ -1203,6 +1285,8 @@ const char* 
AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) + NODE_NAME_CASE(MUL_U24) + NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h index a01961601de9..61b5fe4ad9a2 100644 --- a/llvm/lib/Target/R600/AMDGPUISelLowering.h +++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h @@ -140,6 +140,8 @@ public: /// We don't want to shrink f64/f32 constants. bool ShouldShrinkFPConstant(EVT VT) const; + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + private: void InitAMDILLowering(); SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; @@ -188,6 +190,8 @@ enum { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. + MUL_U24, + MUL_I24, TEXTURE_FETCH, EXPORT, CONST_ADDRESS, diff --git a/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/llvm/lib/Target/R600/AMDGPUInstrInfo.td index 69d80592cf86..258d5a60ef37 100644 --- a/llvm/lib/Target/R600/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/R600/AMDGPUInstrInfo.td @@ -92,3 +92,11 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; +// Signed and unsigned 24-bit multiply. The highest 8-bits are ignored when +// performing the multiply. The result is a 32-bit value. 
+def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, + [SDNPCommutative] +>; +def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp, + [SDNPCommutative] +>; diff --git a/llvm/lib/Target/R600/AMDGPUInstructions.td b/llvm/lib/Target/R600/AMDGPUInstructions.td index 505fc81347b0..cea7a90e6579 100644 --- a/llvm/lib/Target/R600/AMDGPUInstructions.td +++ b/llvm/lib/Target/R600/AMDGPUInstructions.td @@ -253,9 +253,6 @@ def FP_ONE : PatLeaf < [{return N->isExactlyValue(1.0);}] >; -def U24 : ComplexPattern; -def I24 : ComplexPattern; - let isCodeGenOnly = 1, isPseudo = 1 in { let usesCustomInserter = 1 in { diff --git a/llvm/lib/Target/R600/AMDGPUSubtarget.h b/llvm/lib/Target/R600/AMDGPUSubtarget.h index 8874d14c18cb..7cf102cff668 100644 --- a/llvm/lib/Target/R600/AMDGPUSubtarget.h +++ b/llvm/lib/Target/R600/AMDGPUSubtarget.h @@ -77,6 +77,15 @@ public: return hasBFE(); } + bool hasMulU24() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasMulI24() const { + return (getGeneration() >= SOUTHERN_ISLANDS || + hasCaymanISA()); + } + bool IsIRStructurizerEnabled() const; bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; diff --git a/llvm/lib/Target/R600/CaymanInstructions.td b/llvm/lib/Target/R600/CaymanInstructions.td index acd7bdecdcbe..837d6025339f 100644 --- a/llvm/lib/Target/R600/CaymanInstructions.td +++ b/llvm/lib/Target/R600/CaymanInstructions.td @@ -21,10 +21,10 @@ def isCayman : Predicate<"Subtarget.hasCaymanISA()">; let Predicates = [isCayman] in { def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", - [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU + [(set i32:$dst, (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2))], VecALU >; def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", - [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU >; let isVector = 1 in { diff --git a/llvm/lib/Target/R600/EvergreenInstructions.td 
b/llvm/lib/Target/R600/EvergreenInstructions.td index dec6da6a5132..7153b70b5312 100644 --- a/llvm/lib/Target/R600/EvergreenInstructions.td +++ b/llvm/lib/Target/R600/EvergreenInstructions.td @@ -294,7 +294,7 @@ def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT", >; def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24", - [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU + [(set i32:$dst, (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2))], VecALU >; def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>; def : ROTRPattern ; @@ -309,7 +309,7 @@ def CNDGE_eg : CNDGE_Common<0x1B>; def MUL_LIT_eg : MUL_LIT_Common<0x1F>; def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24", - [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU >; def DOT4_eg : DOT4_Common<0xBE>; defm CUBE_eg : CUBE_Common<0xC0>; diff --git a/llvm/lib/Target/R600/R600ISelLowering.cpp b/llvm/lib/Target/R600/R600ISelLowering.cpp index 349146e1b6dd..1e6582296ac7 100644 --- a/llvm/lib/Target/R600/R600ISelLowering.cpp +++ b/llvm/lib/Target/R600/R600ISelLowering.cpp @@ -1526,6 +1526,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a) case ISD::FP_ROUND: { SDValue Arg = N->getOperand(0); diff --git a/llvm/lib/Target/R600/SIISelLowering.cpp b/llvm/lib/Target/R600/SIISelLowering.cpp index 58526d27a996..b9295ff466ce 100644 --- a/llvm/lib/Target/R600/SIISelLowering.cpp +++ b/llvm/lib/Target/R600/SIISelLowering.cpp @@ -963,7 +963,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, EVT VT = N->getValueType(0); switch (N->getOpcode()) { - default: break; + default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SELECT_CC: { ConstantSDNode *True, *False; // i1 
selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td index 5e8ef4f4a4e7..2afe828be7b3 100644 --- a/llvm/lib/Target/R600/SIInstructions.td +++ b/llvm/lib/Target/R600/SIInstructions.td @@ -946,11 +946,11 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", - [(set i32:$dst, (mul I24:$src0, I24:$src1))] + [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))] >; //defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", - [(set i32:$dst, (mul U24:$src0, U24:$src1))] + [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))] >; //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; @@ -1046,10 +1046,10 @@ let neverHasSideEffects = 1 in { def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>; def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", - [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))] + [(set i32:$dst, (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2))] >; def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", - [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))] + [(set i32:$dst, (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2))] >; } // End neverHasSideEffects diff --git a/llvm/test/CodeGen/R600/mad_int24.ll b/llvm/test/CodeGen/R600/mad_int24.ll index df063ece35ae..abb52907b9b8 100644 --- a/llvm/test/CodeGen/R600/mad_int24.ll +++ b/llvm/test/CodeGen/R600/mad_int24.ll @@ -1,12 +1,15 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG 
--check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; EG-CHECK: @i32_mad24 +; FUNC-LABEL: @i32_mad24 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs. -; EG-CHECK: MULLO_INT -; CM-CHECK: MULADD_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X -; SI-CHECK: V_MAD_I32_I24 +; EG: MULLO_INT +; Make sure we aren't masking the inputs. +; CM-NOT: AND +; CM: MULADD_INT24 +; SI-NOT: AND +; SI: V_MAD_I32_I24 define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 diff --git a/llvm/test/CodeGen/R600/mad_uint24.ll b/llvm/test/CodeGen/R600/mad_uint24.ll index 3dcadc93d286..0f0893bd53c4 100644 --- a/llvm/test/CodeGen/R600/mad_uint24.ll +++ b/llvm/test/CodeGen/R600/mad_uint24.ll @@ -1,11 +1,10 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; EG-CHECK-LABEL: @u32_mad24 -; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X -; SI-CHECK-LABEL: @u32_mad24 -; SI-CHECK: V_MAD_U32_U24 +; FUNC-LABEL: @u32_mad24 +; EG: MULADD_UINT24 +; SI: V_MAD_U32_U24 define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: @@ -19,18 +18,14 @@ entry: ret void } -; EG-CHECK-LABEL: @i16_mad24 -; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40 -; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], 
T{{[0-9]}}.X, 44 -; EG-CHECK-DAG: VTX_READ_16 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48 +; FUNC-LABEL: @i16_mad24 ; The order of A and B does not matter. -; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]] +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] ; The result must be sign-extended -; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x -; EG-CHECK: 16 -; SI-CHECK-LABEL: @i16_mad24 -; SI-CHECK: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16 +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 16 +; SI: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: V_BFE_I32 v{{[0-9]}}, [[MAD]], 0, 16 define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { entry: @@ -41,18 +36,13 @@ entry: ret void } -; EG-CHECK-LABEL: @i8_mad24 -; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40 -; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44 -; EG-CHECK-DAG: VTX_READ_8 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48 -; The order of A and B does not matter. 
-; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]], [[A]], [[B]], [[C]] +; FUNC-LABEL: @i8_mad24 +; EG: MULADD_UINT24 {{[* ]*}}T{{[0-9]}}.[[MAD_CHAN:[XYZW]]] ; The result must be sign-extended -; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x -; EG-CHECK: 8 -; SI-CHECK-LABEL: @i8_mad24 -; SI-CHECK: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8 +; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x +; EG: 8 +; SI: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8 define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { entry: @@ -62,3 +52,24 @@ entry: store i32 %2, i32 addrspace(1)* %out ret void } + +; This tests for a bug where the mad_u24 pattern matcher would call +; SimplifyDemandedBits on the first operand of the mul instruction +; assuming that the pattern would be matched to a 24-bit mad. This +; led to some instructions being incorrectly erased when the entire +; 24-bit mad pattern wasn't being matched. + +; Check that the select instruction is not deleted. 
+; FUNC-LABEL: @i24_i32_i32_mad +; EG: CNDE_INT +; SI: V_CNDMASK +define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +entry: + %0 = ashr i32 %a, 8 + %1 = icmp ne i32 %c, 0 + %2 = select i1 %1, i32 %0, i32 34 + %3 = mul i32 %2, %c + %4 = add i32 %3, %d + store i32 %4, i32 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/R600/mul_int24.ll b/llvm/test/CodeGen/R600/mul_int24.ll index 66a1a9e5bd99..046911ba147d 100644 --- a/llvm/test/CodeGen/R600/mul_int24.ll +++ b/llvm/test/CodeGen/R600/mul_int24.ll @@ -1,12 +1,15 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; EG-CHECK: @i32_mul24 +; FUNC-LABEL: @i32_mul24 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs. 
-; EG-CHECK: MULLO_INT -; CM-CHECK: MUL_INT24 {{[ *]*}}T{{[0-9].[XYZW]}}, KC0[2].Z, KC0[2].W -; SI-CHECK: V_MUL_I32_I24 +; EG: MULLO_INT +; Make sure we are not masking the inputs +; CM-NOT: AND +; CM: MUL_INT24 +; SI-NOT: AND +; SI: V_MUL_I32_I24 define void @i32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %0 = shl i32 %a, 8 diff --git a/llvm/test/CodeGen/R600/mul_uint24-i64.ll b/llvm/test/CodeGen/R600/mul_uint24-i64.ll new file mode 100644 index 000000000000..95b3bcbf8e86 --- /dev/null +++ b/llvm/test/CodeGen/R600/mul_uint24-i64.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC + +; FIXME: Move this test into mul_uint24.ll once i64 mul is supported. +; XFAIL: * + +; Multiply with 24-bit inputs and 64-bit output +; FUNC-LABEL: @mul24_i64 +; EG: MUL_UINT24 +; EG: MULHI +; SI: V_MUL_U32_U24 +; FIXME: SI support 24-bit mulhi +; SI: V_MUL_HI_U32 +define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +entry: + %0 = shl i64 %a, 40 + %a_24 = lshr i64 %0, 40 + %1 = shl i64 %b, 40 + %b_24 = lshr i64 %1, 40 + %2 = mul i64 %a_24, %b_24 + store i64 %2, i64 addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/R600/mul_uint24.ll b/llvm/test/CodeGen/R600/mul_uint24.ll index a4139619bfae..27b3717f6b75 100644 --- a/llvm/test/CodeGen/R600/mul_uint24.ll +++ b/llvm/test/CodeGen/R600/mul_uint24.ll @@ -1,11 +1,10 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s 
-march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC -; EG-CHECK-LABEL: @u32_mul24 -; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -; SI-CHECK-LABEL: @u32_mul24 -; SI-CHECK: V_MUL_U32_U24 +; FUNC-LABEL: @u32_mul24 +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W +; SI: V_MUL_U32_U24 define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: @@ -18,17 +17,13 @@ entry: ret void } -; EG-CHECK-LABEL: @i16_mul24 -; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40 -; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44 -; The order of A and B does not matter. -; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]] +; FUNC-LABEL: @i16_mul24 +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] ; The result must be sign-extended -; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -; EG-CHECK: 16 -; SI-CHECK-LABEL: @i16_mul24 -; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16, +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +; EG: 16 +; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 16, define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %0 = mul i16 %a, %b @@ -37,16 +32,12 @@ entry: ret void } -; EG-CHECK-LABEL: @i8_mul24 -; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40 -; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44 -; The order of A and B does not matter. 
-; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]], [[A]], [[B]] +; FUNC-LABEL: @i8_mul24 +; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] ; The result must be sign-extended -; EG-CHECK: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -; SI-CHECK-LABEL: @i8_mul24 -; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; SI-CHECK: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8, +; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x +; SI: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: V_BFE_I32 v{{[0-9]}}, [[MUL]], 0, 8, define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) { entry: