From eb522e68bc8ee92d9ee38aced7719e3a1789b631 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 27 Feb 2017 22:15:25 +0000 Subject: [PATCH] AMDGPU: Support v2i16/v2f16 packed operations llvm-svn: 296396 --- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 18 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 69 ++- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 11 +- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 10 + llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 75 ++- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 106 ++++- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 25 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 20 + llvm/lib/Target/AMDGPU/SIInstructions.td | 63 +++ llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 30 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 283 +++++++++++ llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll | 33 +- llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll | 152 ++++++ .../CodeGen/AMDGPU/extract_vector_elt-i16.ll | 59 ++- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 92 +++- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 58 ++- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 29 +- llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll | 107 +++++ llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 117 +++-- llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 106 ++++- llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 44 +- llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 31 +- llvm/test/CodeGen/AMDGPU/fsub.f16.ll | 60 ++- llvm/test/CodeGen/AMDGPU/immv216.ll | 447 ++++++++++++++++++ .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 152 +++++- .../CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll | 1 + llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 150 ++++++ llvm/test/CodeGen/AMDGPU/max.i16.ll | 78 ++- llvm/test/CodeGen/AMDGPU/min.ll | 303 +++++++++--- llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 229 +++++++++ llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 188 ++++++++ llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 13 +- llvm/test/CodeGen/AMDGPU/sext-in-reg.ll | 140 ++++-- 
llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 150 ++++++ llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 216 +++++++++ llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 278 +++++++++++ 37 files changed, 3591 insertions(+), 366 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/add.v2i16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/immv216.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/pack.v2f16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/pack.v2i16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 23f124b637ff..0652dacd9b0a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -181,12 +181,20 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { } bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { - if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 && - T->getIntegerBitWidth() <= 16) + const IntegerType *IntTy = dyn_cast<IntegerType>(T); + if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) return true; - if (!T->isVectorTy()) - return false; - return needsPromotionToI32(cast<VectorType>(T)->getElementType()); + + if (const VectorType *VT = dyn_cast<VectorType>(T)) { + // TODO: The set of packed operations is more limited, so may want to + // promote some anyway. + if (ST->hasVOP3PInsts()) + return false; + + return needsPromotionToI32(VT->getElementType()); + } + + return false; } // Return true if the op promoted to i32 should have nsw set. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index e02ced04f089..fddf94339a1e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -159,6 +159,10 @@ private: SDValue &Clamp, SDValue &Omod) const; + bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp) const; + void SelectADD_SUB_I64(SDNode *N); void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); @@ -305,6 +309,20 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { llvm_unreachable("invalid vector size"); } +static bool getConstantValue(SDValue N, uint32_t &Out) { + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) { + Out = C->getAPIntValue().getZExtValue(); + return true; + } + + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) { + Out = C->getValueAPF().bitcastToAPInt().getZExtValue(); + return true; + } + + return false; +} + void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -356,7 +374,24 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); + + if (VT == MVT::v2i16 || VT == MVT::v2f16) { + if (Opc == ISD::BUILD_VECTOR) { + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + uint32_t K = LHSVal | (RHSVal << 16); + CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT, + CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32)); + return; + } + } + + break; + } + assert(EltVT.bitsEq(MVT::i32)); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { RegClassID = selectSGPRVectorRegClassID(NumVectorElts); } else { @@ -1565,7 +1600,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { bool 
AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; - Src = In; if (Src.getOpcode() == ISD::FNEG) { @@ -1579,7 +1613,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, } SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); - return true; } @@ -1633,6 +1666,38 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + Src = In; + + // FIXME: Look for on separate components + if (Src.getOpcode() == ISD::FNEG) { + Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI); + Src = Src.getOperand(0); + } + + // Packed instructions do not have abs modifiers. + + // FIXME: Handle abs/neg of individual components. + // FIXME: Handle swizzling with op_sel + Mods |= SISrcMods::OP_SEL_1; + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp) const { + SDLoc SL(In); + + // FIXME: Handle clamp and op_sel + Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); + + return SelectVOP3PMods(In, Src, SrcMods); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering *>(getTargetLowering()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f28afa89bd27..edaab0063daa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -644,12 +644,17 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); - return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() && - VT == MVT::f16); + + // Packed operations do not have a fabs modifier. 
+ return VT == MVT::f32 || VT == MVT::f64 || + (Subtarget->has16BitInsts() && VT == MVT::f16); } bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { - return isFAbsFree(VT); + assert(VT.isFloatingPoint()); + return VT == MVT::f32 || VT == MVT::f64 || + (Subtarget->has16BitInsts() && VT == MVT::f16) || + (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16); } bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index d0c628775246..ba2aed68fb82 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -75,6 +75,12 @@ def brtarget : Operand<OtherVT>; // Misc. PatFrags //===----------------------------------------------------------------------===// +class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag< + (ops node:$src0), + (op $src0), + [{ return N->hasOneUse(); }] +>; + class HasOneUseBinOp<SDPatternOperator op> : PatFrag< (ops node:$src0, node:$src1), (op $src0, $src1), @@ -87,6 +93,7 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag< [{ return N->hasOneUse(); }] >; +def trunc_oneuse : HasOneUseUnaryOp<trunc>; let Properties = [SDNPCommutative, SDNPAssociative] in { def smax_oneuse : HasOneUseBinOp<smax>; @@ -101,6 +108,8 @@ def xor_oneuse : HasOneUseBinOp<xor>; } // Properties = [SDNPCommutative, SDNPAssociative] def sub_oneuse : HasOneUseBinOp<sub>; + +def srl_oneuse : HasOneUseBinOp<srl>; def shl_oneuse : HasOneUseBinOp<shl>; def select_oneuse : HasOneUseTernaryOp