diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index db7a4ab521a9..84768e632d74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -799,6 +799,9 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
 def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
   AssemblerPredicate<"FeatureVOP3P">;
 
+def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">,
+  AssemblerPredicate<"!FeatureVOP3P">;
+
 def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
   AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e8720e7a4c56..1b3b9d1e851c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3007,7 +3007,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDValue X = LHS->getOperand(0);
 
   if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
-      isTypeLegal(MVT::v2i16)) {
+      isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
     // Prefer build_vector as the canonical form if packed types are legal.
     // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
     SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
@@ -3818,12 +3818,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     // TODO: Generalize and move to DAGCombiner
     SDValue Src = N->getOperand(0);
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
-      assert(Src.getValueType() == MVT::i64);
-      SDLoc SL(N);
-      uint64_t CVal = C->getZExtValue();
-      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
-                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
-                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+      if (Src.getValueType() == MVT::i64) {
+        SDLoc SL(N);
+        uint64_t CVal = C->getZExtValue();
+        return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+                           DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                           DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+      }
     }
 
     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 999d5534050f..038a4aa076ac 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1060,14 +1060,14 @@ defm : MUBUF_LoadIntrinsicPat;
 
 let SubtargetPredicate = HasUnpackedD16VMem in {
-  defm : MUBUF_LoadIntrinsicPat;
+  defm : MUBUF_LoadIntrinsicPat;
   defm : MUBUF_LoadIntrinsicPat;
   defm : MUBUF_LoadIntrinsicPat;
 } // End HasUnpackedD16VMem.
 
 let SubtargetPredicate = HasPackedD16VMem in {
-  defm : MUBUF_LoadIntrinsicPat;
-  defm : MUBUF_LoadIntrinsicPat;
+  defm : MUBUF_LoadIntrinsicPat;
+  defm : MUBUF_LoadIntrinsicPat;
   defm : MUBUF_LoadIntrinsicPat;
   defm : MUBUF_LoadIntrinsicPat;
 } // End HasPackedD16VMem.
@@ -1547,14 +1547,14 @@ defm : MTBUF_LoadIntrinsicPat;
 defm : MTBUF_LoadIntrinsicPat;
 
 let SubtargetPredicate = HasUnpackedD16VMem in {
-  defm : MTBUF_LoadIntrinsicPat;
+  defm : MTBUF_LoadIntrinsicPat;
   defm : MTBUF_LoadIntrinsicPat;
   defm : MTBUF_LoadIntrinsicPat;
 } // End HasUnpackedD16VMem.
 
 let SubtargetPredicate = HasPackedD16VMem in {
-  defm : MTBUF_LoadIntrinsicPat;
-  defm : MTBUF_LoadIntrinsicPat;
+  defm : MTBUF_LoadIntrinsicPat;
+  defm : MTBUF_LoadIntrinsicPat;
   defm : MTBUF_LoadIntrinsicPat;
   defm : MTBUF_LoadIntrinsicPat;
 } // End HasPackedD16VMem.
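Note on the AMDGPUISelLowering.cpp hunk above: once v2i16 becomes a legal type on subtargets without VOP3P, isTypeLegal alone no longer implies that packing is cheap, so the combine now keys on whether BUILD_VECTOR itself is legal for v2i16. For intuition, a standalone C++ sketch (not LLVM code) of why the two forms the combine trades between are bit-identical on a little-endian target:

#include <cstdint>
#include <cassert>

// The combine rewrites (shl ([asz]ext i16:x), 16) on i32 into
// build_vector(0, x) : v2i16. Little-endian packing puts element 0 in the
// low 16 bits, so the packed register equals the shifted value.
static uint32_t shlForm(uint16_t X) { return static_cast<uint32_t>(X) << 16; }

static uint32_t buildVectorForm(uint16_t X) {
  uint16_t Elts[2] = {0, X}; // build_vector 0, x
  return static_cast<uint32_t>(Elts[0]) |
         (static_cast<uint32_t>(Elts[1]) << 16);
}

int main() {
  for (uint32_t V = 0; V <= 0xffff; ++V) {
    uint16_t X = static_cast<uint16_t>(V);
    assert(shlForm(X) == buildVectorForm(X)); // identical for all inputs
  }
}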
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index fb89e7c8d0e6..3779e751ec78 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -610,10 +610,7 @@ multiclass ImageDimD16Helper;
-    // used on gfx810
-    def _packed_v2 : ImageDimPattern;
-    // used on gfx900
-    def _packed_v2_gfx9 : ImageDimPattern;
+    def _packed_v2 : ImageDimPattern;
     def _packed_v4 : ImageDimPattern;
   } // End HasPackedD16VMem.
 }
@@ -717,7 +714,7 @@ multiclass ImageSampleAltPatterns {
   } // End HasUnpackedD16VMem.
   let SubtargetPredicate = HasPackedD16VMem in {
-    defm : ImageSampleDataPatterns(opcode # _V1), i32, "_D16">;
+    defm : ImageSampleDataPatterns(opcode # _V1), f16, "_D16">;
     defm : ImageSampleDataPatterns(opcode # _V2), v2i32, "_D16">;
   } // End HasPackedD16VMem.
 }
@@ -780,7 +777,7 @@ multiclass ImageLoadAltPatterns {
   } // End HasUnPackedD16VMem.
   let SubtargetPredicate = HasPackedD16VMem in {
-    defm : ImageLoadDataPatterns(opcode # _V1), i32, "_D16">;
+    defm : ImageLoadDataPatterns(opcode # _V1), f16, "_D16">;
     defm : ImageLoadDataPatterns(opcode # _V2), v2i32, "_D16">;
   } // End HasPackedD16VMem.
 }
@@ -865,8 +862,8 @@ defm : ImageLoadAltPatterns;
 defm : ImageLoadAltPatterns;
 
 // Image store.
-defm : ImageStorePatterns;
-defm : ImageStorePatterns;
+defm : ImageStorePatterns;
+defm : ImageStorePatterns;
 defm : ImageStoreAltPatterns;
 defm : ImageStoreAltPatterns;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 41103970b09f..a074e557f245 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -139,9 +139,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->has16BitInsts()) {
     addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
-  }
 
-  if (Subtarget->hasVOP3PInsts()) {
+    // Unless there are also VOP3P operations, not all operations are really legal.
    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
   }
 
@@ -174,7 +173,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
 
   setOperationAction(ISD::SELECT, MVT::i1, Promote);
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
@@ -423,9 +421,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FMA, MVT::f16, Legal);
     if (!Subtarget->hasFP16Denormals())
       setOperationAction(ISD::FMAD, MVT::f16, Legal);
-  }
 
-  if (Subtarget->hasVOP3PInsts()) {
     for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
         switch (Op) {
@@ -472,11 +468,34 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
-    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
-    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
-    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
-    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
 
+    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
+    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
+
+    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
+    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+
+    if (!Subtarget->hasVOP3PInsts()) {
+      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+    }
+
+    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+    // This isn't really legal, but this avoids the legalizer unrolling it (and
+    // allows matching fneg (fabs x) patterns)
+    setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+  }
+
+  if (Subtarget->hasVOP3PInsts()) {
     setOperationAction(ISD::ADD, MVT::v2i16, Legal);
     setOperationAction(ISD::SUB, MVT::v2i16, Legal);
     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
@@ -489,25 +508,23 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
 
     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
-    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
     setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
     setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
 
-    // This isn't really legal, but this avoids the legalizer unrolling it (and
-    // allows matching fneg (fabs x) patterns)
-    setOperationAction(ISD::FABS, MVT::v2f16, Legal);
-
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+  }
 
-    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
-    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+  if (Subtarget->has16BitInsts()) {
+    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
   } else {
+    // Legalization hack.
     setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
     setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
   }
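The SELECT promotion above relies on a v2i16/v2f16 value occupying a single 32-bit register, so a scalar-condition select can be promoted to one i32 select. A standalone C++ sketch (not LLVM code) of why that is sound:

#include <cstdint>
#include <cassert>

// Promoting (select c, v2i16:a, v2i16:b) to i32 is valid because the
// condition is a single scalar: choosing the packed 32-bit register as a
// whole picks the same lanes as selecting each half independently.
static uint32_t selectPromoted(bool C, uint32_t A, uint32_t B) {
  return C ? A : B; // one 32-bit select
}

static uint32_t selectPerLane(bool C, uint32_t A, uint32_t B) {
  uint16_t Lo = C ? uint16_t(A) : uint16_t(B);
  uint16_t Hi = C ? uint16_t(A >> 16) : uint16_t(B >> 16);
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}

int main() {
  for (bool C : {false, true})
    assert(selectPromoted(C, 0x12345678u, 0x9abcdef0u) ==
           selectPerLane(C, 0x12345678u, 0x9abcdef0u));
}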
@@ -3514,205 +3531,72 @@ static unsigned getImageOpcode(unsigned IID) {
   return 0;
 }
 
-static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL,
-                                   SelectionDAG &DAG, bool Unpacked) {
+static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
+                                       const SDLoc &DL,
+                                       SelectionDAG &DAG, bool Unpacked) {
+  if (!LoadVT.isVector())
+    return Result;
+
   if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
     // Truncate to v2i16/v4i16.
     EVT IntLoadVT = LoadVT.changeTypeToInteger();
-    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntLoadVT, Result);
+
+    // Workaround legalizer not scalarizing truncate after vector op
+    // legalization by not creating intermediate vector trunc.
+    SmallVector Elts;
+    DAG.ExtractVectorElements(Result, Elts);
+    for (SDValue &Elt : Elts)
+      Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
+
+    Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
 
     // Bitcast to original type (v2f16/v4f16).
-    return DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
+    return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
   }
+
   // Cast back to the original packed type.
   return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
 }
 
-// This is to lower INTRINSIC_W_CHAIN with illegal result types.
-SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op,
-    SDValue &Chain, SelectionDAG &DAG) const {
-  EVT LoadVT = Op.getValueType();
-  // TODO: handle v3f16.
-  if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16)
-    return SDValue();
+SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
+                                              MemSDNode *M,
+                                              SelectionDAG &DAG,
+                                              bool IsIntrinsic) const {
+  SDLoc DL(M);
+  SmallVector Ops;
+  Ops.reserve(M->getNumOperands());
+
+  Ops.push_back(M->getOperand(0));
+  if (IsIntrinsic)
+    Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
+
+  // Skip 1, as it is the intrinsic ID.
+  for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
+    Ops.push_back(M->getOperand(I));
 
   bool Unpacked = Subtarget->hasUnpackedD16VMem();
-  EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
-  EVT EquivLoadVT = Unpacked ? UnpackedLoadVT :
-                    getEquivalentMemType(*DAG.getContext(), LoadVT);
+  EVT LoadVT = M->getValueType(0);
+
+  EVT UnpackedLoadVT = LoadVT.isVector() ?
+    EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                     LoadVT.getVectorNumElements()) : LoadVT;
+  EVT EquivLoadVT = LoadVT;
+  if (LoadVT.isVector()) {
+    EquivLoadVT = Unpacked ? UnpackedLoadVT :
+                  getEquivalentMemType(*DAG.getContext(), LoadVT);
+  }
 
   // Change from v4f16/v2f16 to EquivLoadVT.
   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
 
-  SDValue Res;
-  SDLoc DL(Op);
-  MemSDNode *M = cast<MemSDNode>(Op);
-  unsigned IID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  switch (IID) {
-  case Intrinsic::amdgcn_tbuffer_load: {
-    SDValue Ops[] = {
-      Op.getOperand(0),  // Chain
-      Op.getOperand(2),  // rsrc
-      Op.getOperand(3),  // vindex
-      Op.getOperand(4),  // voffset
-      Op.getOperand(5),  // soffset
-      Op.getOperand(6),  // offset
-      Op.getOperand(7),  // dfmt
-      Op.getOperand(8),  // nfmt
-      Op.getOperand(9),  // glc
-      Op.getOperand(10)  // slc
-    };
-    Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL,
-                                  VTList, Ops, M->getMemoryVT(),
-                                  M->getMemOperand());
-    Chain = Res.getValue(1);
-    return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-  }
-  case Intrinsic::amdgcn_buffer_load_format: {
-    SDValue Ops[] = {
-      Op.getOperand(0), // Chain
-      Op.getOperand(2), // rsrc
-      Op.getOperand(3), // vindex
-      Op.getOperand(4), // offset
-      Op.getOperand(5), // glc
-      Op.getOperand(6)  // slc
-    };
-    Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
-                                  DL, VTList, Ops, M->getMemoryVT(),
-                                  M->getMemOperand());
-    Chain = Res.getValue(1);
-    return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-  }
-  case Intrinsic::amdgcn_image_load:
-  case Intrinsic::amdgcn_image_load_mip: {
-    SDValue Ops[] = {
-      Op.getOperand(0),  // Chain
-      Op.getOperand(2),  // vaddr
-      Op.getOperand(3),  // rsrc
-      Op.getOperand(4),  // dmask
-      Op.getOperand(5),  // glc
-      Op.getOperand(6),  // slc
-      Op.getOperand(7),  // lwe
-      Op.getOperand(8)   // da
-    };
-    unsigned Opc = getImageOpcode(IID);
-    Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
-                                  M->getMemOperand());
-    Chain = Res.getValue(1);
-    return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-  }
-  // Basic sample.
-  case Intrinsic::amdgcn_image_sample:
-  case Intrinsic::amdgcn_image_sample_cl:
-  case Intrinsic::amdgcn_image_sample_d:
-  case Intrinsic::amdgcn_image_sample_d_cl:
-  case Intrinsic::amdgcn_image_sample_l:
-  case Intrinsic::amdgcn_image_sample_b:
-  case Intrinsic::amdgcn_image_sample_b_cl:
-  case Intrinsic::amdgcn_image_sample_lz:
-  case Intrinsic::amdgcn_image_sample_cd:
-  case Intrinsic::amdgcn_image_sample_cd_cl:
+  SDValue Load
+    = DAG.getMemIntrinsicNode(IsIntrinsic ? ISD::INTRINSIC_W_CHAIN : Opcode, DL,
+                              VTList, Ops, M->getMemoryVT(),
+                              M->getMemOperand());
 
-  // Sample with comparison.
-  case Intrinsic::amdgcn_image_sample_c:
-  case Intrinsic::amdgcn_image_sample_c_cl:
-  case Intrinsic::amdgcn_image_sample_c_d:
-  case Intrinsic::amdgcn_image_sample_c_d_cl:
-  case Intrinsic::amdgcn_image_sample_c_l:
-  case Intrinsic::amdgcn_image_sample_c_b:
-  case Intrinsic::amdgcn_image_sample_c_b_cl:
-  case Intrinsic::amdgcn_image_sample_c_lz:
-  case Intrinsic::amdgcn_image_sample_c_cd:
-  case Intrinsic::amdgcn_image_sample_c_cd_cl:
+  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
 
-  // Sample with offsets.
-  case Intrinsic::amdgcn_image_sample_o:
-  case Intrinsic::amdgcn_image_sample_cl_o:
-  case Intrinsic::amdgcn_image_sample_d_o:
-  case Intrinsic::amdgcn_image_sample_d_cl_o:
-  case Intrinsic::amdgcn_image_sample_l_o:
-  case Intrinsic::amdgcn_image_sample_b_o:
-  case Intrinsic::amdgcn_image_sample_b_cl_o:
-  case Intrinsic::amdgcn_image_sample_lz_o:
-  case Intrinsic::amdgcn_image_sample_cd_o:
-  case Intrinsic::amdgcn_image_sample_cd_cl_o:
-
-  // Sample with comparison and offsets.
-  case Intrinsic::amdgcn_image_sample_c_o:
-  case Intrinsic::amdgcn_image_sample_c_cl_o:
-  case Intrinsic::amdgcn_image_sample_c_d_o:
-  case Intrinsic::amdgcn_image_sample_c_d_cl_o:
-  case Intrinsic::amdgcn_image_sample_c_l_o:
-  case Intrinsic::amdgcn_image_sample_c_b_o:
-  case Intrinsic::amdgcn_image_sample_c_b_cl_o:
-  case Intrinsic::amdgcn_image_sample_c_lz_o:
-  case Intrinsic::amdgcn_image_sample_c_cd_o:
-  case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
-
-  // Basic gather4
-  case Intrinsic::amdgcn_image_gather4:
-  case Intrinsic::amdgcn_image_gather4_cl:
-  case Intrinsic::amdgcn_image_gather4_l:
-  case Intrinsic::amdgcn_image_gather4_b:
-  case Intrinsic::amdgcn_image_gather4_b_cl:
-  case Intrinsic::amdgcn_image_gather4_lz:
-
-  // Gather4 with comparison
-  case Intrinsic::amdgcn_image_gather4_c:
-  case Intrinsic::amdgcn_image_gather4_c_cl:
-  case Intrinsic::amdgcn_image_gather4_c_l:
-  case Intrinsic::amdgcn_image_gather4_c_b:
-  case Intrinsic::amdgcn_image_gather4_c_b_cl:
-  case Intrinsic::amdgcn_image_gather4_c_lz:
-
-  // Gather4 with offsets
-  case Intrinsic::amdgcn_image_gather4_o:
-  case Intrinsic::amdgcn_image_gather4_cl_o:
-  case Intrinsic::amdgcn_image_gather4_l_o:
-  case Intrinsic::amdgcn_image_gather4_b_o:
-  case Intrinsic::amdgcn_image_gather4_b_cl_o:
-  case Intrinsic::amdgcn_image_gather4_lz_o:
-
-  // Gather4 with comparison and offsets
-  case Intrinsic::amdgcn_image_gather4_c_o:
-  case Intrinsic::amdgcn_image_gather4_c_cl_o:
-  case Intrinsic::amdgcn_image_gather4_c_l_o:
-  case Intrinsic::amdgcn_image_gather4_c_b_o:
-  case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
-  case Intrinsic::amdgcn_image_gather4_c_lz_o: {
-    SDValue Ops[] = {
-      Op.getOperand(0),  // Chain
-      Op.getOperand(2),  // vaddr
-      Op.getOperand(3),  // rsrc
-      Op.getOperand(4),  // sampler
-      Op.getOperand(5),  // dmask
-      Op.getOperand(6),  // unorm
-      Op.getOperand(7),  // glc
-      Op.getOperand(8),  // slc
-      Op.getOperand(9),  // lwe
-      Op.getOperand(10)  // da
-    };
-    unsigned Opc = getImageOpcode(IID);
-    Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
-                                  M->getMemOperand());
-    Chain = Res.getValue(1);
-    return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-  }
-  default: {
-    const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
-        AMDGPU::lookupD16ImageDimIntrinsicByIntr(IID);
-    if (D16ImageDimIntr) {
-      SmallVector Ops;
-      for (auto Value : Op.getNode()->op_values())
-        Ops.push_back(Value);
-      Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
-      Res = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops,
-                                    M->getMemoryVT(), M->getMemOperand());
-      Chain = Res.getValue(1);
-      return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-    }
-
-    return SDValue();
-  }
-  }
+  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
 }
 
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
@@ -3767,13 +3651,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
     break;
   }
   case ISD::INTRINSIC_W_CHAIN: {
-    SDValue Chain;
-    if (SDValue Res = lowerIntrinsicWChain_IllegalReturnType(SDValue(N, 0),
-                                                             Chain, DAG)) {
+    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
       Results.push_back(Res);
-      Results.push_back(Chain);
+      Results.push_back(Res.getValue(1));
       return;
     }
+
    break;
  }
  case ISD::SELECT: {
@@ -4279,22 +4162,24 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();
-  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
-
-  EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+
+  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
 
-  // Turn into pair of packed build_vectors.
-  // TODO: Special case for constants that can be materialized with s_mov_b64.
-  SDValue Lo = DAG.getBuildVector(HalfVT, SL,
-                                  { Op.getOperand(0), Op.getOperand(1) });
-  SDValue Hi = DAG.getBuildVector(HalfVT, SL,
-                                  { Op.getOperand(2), Op.getOperand(3) });
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
 
-  SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
-  SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
 
-  SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
-  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
+
+  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
+                              DAG.getConstant(16, SL, MVT::i32));
+
+  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
+
+  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
 }
 
 bool
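A standalone C++ model (not LLVM code) of what the new lowerBUILD_VECTOR emits for (build_vector i16:lo, i16:hi) when S_PACK_* is unavailable: zero-extend both halves, shift the high half, and OR them into one 32-bit register. The constant 0x1c8007b checked in add.v2i16.ll below is exactly this packing of 0x7b and 0x1c8:

#include <cstdint>
#include <cassert>

static uint32_t lowerBuildVector(uint16_t Lo, uint16_t Hi) {
  uint32_t ExtLo = Lo;                 // zero_extend to i32
  uint32_t ShlHi = uint32_t(Hi) << 16; // zero_extend + shl 16
  return ExtLo | ShlHi;                // or, then bitcast to v2i16/v2f16
}

int main() {
  assert(lowerBuildVector(0x007b, 0x01c8) == 0x01c8007bu);
}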
@@ -4829,13 +4714,23 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
     EVT VT = Op.getValueType();
     EVT IntVT = VT.changeTypeToInteger();
-
     auto *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+    bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+    if (IsD16)
+      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
   case Intrinsic::amdgcn_tbuffer_load: {
     MemSDNode *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+    bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+    if (IsD16) {
+      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
+    }
+
     SDValue Ops[] = {
       Op.getOperand(0),  // Chain
       Op.getOperand(2),  // rsrc
@@ -4849,10 +4744,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       Op.getOperand(10)  // slc
     };
 
-    EVT VT = Op.getValueType();
-
     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
-                                   Op->getVTList(), Ops, VT, M->getMemOperand());
+                                   Op->getVTList(), Ops, LoadVT,
+                                   M->getMemOperand());
   }
   case Intrinsic::amdgcn_buffer_atomic_swap:
   case Intrinsic::amdgcn_buffer_atomic_add:
@@ -4933,6 +4827,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
   }
+  case Intrinsic::amdgcn_image_load:
+  case Intrinsic::amdgcn_image_load_mip: {
+    EVT LoadVT = Op.getValueType();
+    if ((Subtarget->hasUnpackedD16VMem() && LoadVT == MVT::v2f16) ||
+        LoadVT == MVT::v4f16) {
+      MemSDNode *M = cast<MemSDNode>(Op);
+      return adjustLoadValueType(getImageOpcode(IntrID), M, DAG);
+    }
+
+    return SDValue();
+  }
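The D16 load cases above all route through adjustLoadValueType/adjustLoadValueTypeImpl. On subtargets with unpacked D16 vmem the load returns one 16-bit value per 32-bit lane, and the impl truncates each lane before re-packing. A standalone C++ model (not the LLVM implementation) of that repacking:

#include <array>
#include <cstdint>
#include <cassert>

// Each 32-bit lane of an unpacked D16 result holds one half value in its
// low 16 bits; per-element truncation recovers the v4f16 bit pattern.
static std::array<uint16_t, 4> repack(const std::array<uint32_t, 4> &Lanes) {
  std::array<uint16_t, 4> Halves;
  for (int I = 0; I != 4; ++I)
    Halves[I] = static_cast<uint16_t>(Lanes[I]); // per-element truncate
  return Halves;
}

int main() {
  std::array<uint32_t, 4> Lanes = {0x3c00, 0x4000, 0xbc00, 0x0000};
  assert(repack(Lanes)[1] == 0x4000); // 2.0 in IEEE half bits
}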
+
+  // Basic sample.
   case Intrinsic::amdgcn_image_sample:
   case Intrinsic::amdgcn_image_sample_cl:
@@ -4979,7 +4885,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
   case Intrinsic::amdgcn_image_sample_c_lz_o:
   case Intrinsic::amdgcn_image_sample_c_cd_o:
-  case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
+
+  // Basic gather4
+  case Intrinsic::amdgcn_image_gather4:
+  case Intrinsic::amdgcn_image_gather4_cl:
+  case Intrinsic::amdgcn_image_gather4_l:
+  case Intrinsic::amdgcn_image_gather4_b:
+  case Intrinsic::amdgcn_image_gather4_b_cl:
+  case Intrinsic::amdgcn_image_gather4_lz:
+
+  // Gather4 with comparison
+  case Intrinsic::amdgcn_image_gather4_c:
+  case Intrinsic::amdgcn_image_gather4_c_cl:
+  case Intrinsic::amdgcn_image_gather4_c_l:
+  case Intrinsic::amdgcn_image_gather4_c_b:
+  case Intrinsic::amdgcn_image_gather4_c_b_cl:
+  case Intrinsic::amdgcn_image_gather4_c_lz:
+
+  // Gather4 with offsets
+  case Intrinsic::amdgcn_image_gather4_o:
+  case Intrinsic::amdgcn_image_gather4_cl_o:
+  case Intrinsic::amdgcn_image_gather4_l_o:
+  case Intrinsic::amdgcn_image_gather4_b_o:
+  case Intrinsic::amdgcn_image_gather4_b_cl_o:
+  case Intrinsic::amdgcn_image_gather4_lz_o:
+
+  // Gather4 with comparison and offsets
+  case Intrinsic::amdgcn_image_gather4_c_o:
+  case Intrinsic::amdgcn_image_gather4_c_cl_o:
+  case Intrinsic::amdgcn_image_gather4_c_l_o:
+  case Intrinsic::amdgcn_image_gather4_c_b_o:
+  case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
+  case Intrinsic::amdgcn_image_gather4_c_lz_o: {
     // Replace dmask with everything disabled with undef.
     const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
     if (!DMask || DMask->isNullValue()) {
@@ -4987,9 +4925,32 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
     }
 
+    if ((Subtarget->hasUnpackedD16VMem() && Op.getValueType() == MVT::v2f16) ||
+        Op.getValueType() == MVT::v4f16) {
+      return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
+                                 DAG);
+    }
+
     return SDValue();
   }
   default:
+    EVT LoadVT = Op.getValueType();
+    if (LoadVT.getScalarSizeInBits() != 16)
+      return SDValue();
+
+    const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
+        AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID);
+    if (D16ImageDimIntr) {
+      bool Unpacked = Subtarget->hasUnpackedD16VMem();
+      MemSDNode *M = cast<MemSDNode>(Op);
+
+      if (isTypeLegal(LoadVT) && (!Unpacked || LoadVT == MVT::f16))
+        return SDValue();
+
+      return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr,
+                                 M, DAG, true);
+    }
+
     return SDValue();
   }
 }
 
@@ -4997,26 +4958,32 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
 SDValue SITargetLowering::handleD16VData(SDValue VData,
                                          SelectionDAG &DAG) const {
   EVT StoreVT = VData.getValueType();
-  SDLoc DL(VData);
-  if (StoreVT.isVector()) {
-    assert ((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
-    if (!Subtarget->hasUnpackedD16VMem()) {
-      if (!isTypeLegal(StoreVT)) {
-        // If Target supports packed vmem, we just need to workaround
-        // the illegal type by casting to an equivalent one.
-        EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
-        return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
-      }
-    } else { // We need to unpack the packed data to store.
-      EVT IntStoreVT = StoreVT.changeTypeToInteger();
-      SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
-      EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
-      return DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
-    }
-  }
-
-  // No change for f16 and legal vector D16 types.
-  return VData;
+  if (!StoreVT.isVector())
+    return VData;
+
+  SDLoc DL(VData);
+  assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
+
+  if (Subtarget->hasUnpackedD16VMem()) {
+    // We need to unpack the packed data to store.
+    EVT IntStoreVT = StoreVT.changeTypeToInteger();
+    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+    EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                        StoreVT.getVectorNumElements());
+    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
+    return DAG.UnrollVectorOp(ZExt.getNode());
+  }
+
+  if (isTypeLegal(StoreVT))
+    return VData;
+
+  // If target supports packed vmem, we just need to workaround
+  // the illegal type by casting to an equivalent one.
+  EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
+  return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
 }
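handleD16VData is the store-side mirror of the load adjustment: with unpacked D16 vmem, each packed half must be widened into the low 16 bits of its own 32-bit lane. A standalone C++ model (not LLVM code):

#include <array>
#include <cstdint>
#include <cassert>

// Split a packed v2f16 register so each half occupies the low 16 bits of
// its own 32-bit lane (the zero_extend of each element before the store).
static std::array<uint32_t, 2> unpackForStore(uint32_t PackedV2F16) {
  return { PackedV2F16 & 0xffffu, PackedV2F16 >> 16 };
}

int main() {
  // 0x40003c00 packs {1.0h, 2.0h}; the low element goes into lane 0.
  std::array<uint32_t, 2> Lanes = unpackForStore(0x40003c00u);
  assert(Lanes[0] == 0x3c00u && Lanes[1] == 0x4000u);
}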
 
 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
@@ -5207,46 +5174,48 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
-
   case Intrinsic::amdgcn_image_store:
   case Intrinsic::amdgcn_image_store_mip: {
     SDValue VData = Op.getOperand(2);
-    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
-    if (IsD16)
-      VData = handleD16VData(VData, DAG);
-    SDValue Ops[] = {
-      Chain, // Chain
-      VData, // vdata
-      Op.getOperand(3), // vaddr
-      Op.getOperand(4), // rsrc
-      Op.getOperand(5), // dmask
-      Op.getOperand(6), // glc
-      Op.getOperand(7), // slc
-      Op.getOperand(8), // lwe
-      Op.getOperand(9)  // da
-    };
-    unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ?
-                   AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
-    MemSDNode *M = cast<MemSDNode>(Op);
-    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
-                                   M->getMemoryVT(), M->getMemOperand());
-  }
+    if ((Subtarget->hasUnpackedD16VMem() &&
+         VData.getValueType() == MVT::v2f16) ||
+        VData.getValueType() == MVT::v4f16) {
+      SDValue Chain = Op.getOperand(0);
 
+      VData = handleD16VData(VData, DAG);
+      SDValue Ops[] = {
+        Chain, // Chain
+        VData, // vdata
+        Op.getOperand(3), // vaddr
+        Op.getOperand(4), // rsrc
+        Op.getOperand(5), // dmask
+        Op.getOperand(6), // glc
+        Op.getOperand(7), // slc
+        Op.getOperand(8), // lwe
+        Op.getOperand(9)  // da
+      };
+      unsigned Opc = (IntrinsicID == Intrinsic::amdgcn_image_store) ?
+        AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
+      MemSDNode *M = cast<MemSDNode>(Op);
+      return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                     M->getMemoryVT(), M->getMemOperand());
+    }
+
+    return SDValue();
+  }
   default: {
     const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
         AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID);
     if (D16ImageDimIntr) {
       SDValue VData = Op.getOperand(2);
       EVT StoreVT = VData.getValueType();
-      if ((StoreVT == MVT::v2f16 && !isTypeLegal(StoreVT)) ||
-          StoreVT == MVT::v4f16) {
-        VData = handleD16VData(VData, DAG);
+      if (((StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) &&
+           Subtarget->hasUnpackedD16VMem()) ||
+          !isTypeLegal(StoreVT)) {
+        SmallVector Ops(Op.getNode()->op_values());
 
-        SmallVector Ops;
-        for (auto Value : Op.getNode()->op_values())
-          Ops.push_back(Value);
         Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
-        Ops[2] = VData;
+        Ops[2] = handleD16VData(VData, DAG);
 
         MemSDNode *M = cast<MemSDNode>(Op);
         return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(),
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index ae8b19a46fc3..ffb644fafbee 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,8 +60,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
-  SDValue lowerIntrinsicWChain_IllegalReturnType(SDValue Op, SDValue &Chain,
-                                                 SelectionDAG &DAG) const;
+  SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
+                              SelectionDAG &DAG,
+                              bool IsIntrinsic = false) const;
+
   SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
 
   /// Converts \p Op, which must be of floating point type, to the
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8080151d6d9b..363b0e712237 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -871,11 +871,13 @@ def : ClampPat;
 def : ClampPat;
 def : ClampPat;
 
+let SubtargetPredicate = HasVOP3PInsts in {
 def : GCNPat <
   (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
   (V_PK_MAX_F16 $src0_modifiers, $src0, $src0_modifiers, $src0, DSTCLAMP.ENABLE)
 >;
+}
 
 /********** ================================ **********/
 /********** Floating point absolute/negative **********/
@@ -1333,11 +1335,13 @@ def : GCNPat<
   (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
 >;
 
+let SubtargetPredicate = HasVOP3PInsts in {
 def : GCNPat<
   (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
   (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
 >;
 }
+}
 
 let OtherPredicates = [NoFP32Denormals] in {
 def : GCNPat<
@@ -1387,11 +1391,6 @@ class ExpPattern : GCNPat
 def : ExpPattern;
 def : ExpPattern;
 
-def : GCNPat <
-  (v2i16 (build_vector i16:$src0, i16:$src1)),
-  (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
->;
-
 // COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs
 // from S_LSHL_B32's multiple outputs from implicit scc def.
 def : GCNPat <
@@ -1399,6 +1398,13 @@ def : GCNPat <
   (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
 >;
 
+
+let SubtargetPredicate = HasVOP3PInsts in {
+def : GCNPat <
+  (v2i16 (build_vector i16:$src0, i16:$src1)),
+  (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
 // With multiple uses of the shift, this will duplicate the shift and
 // increase register pressure.
 def : GCNPat <
@@ -1406,6 +1412,7 @@ def : GCNPat <
   (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
 >;
 
+
 def : GCNPat <
   (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
                        (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
   (v2i16 (S_PACK_HH_B32_B16 $src0, $src1))
 >;
@@ -1418,6 +1425,9 @@ def : GCNPat <
   (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
 >;
 
+} // End SubtargetPredicate = HasVOP3PInsts
+
+
 // def : GCNPat <
 //   (v2f16 (scalar_to_vector f16:$src0)),
 //   (COPY $src0)
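Many of the updated VI checks in the tests below use SDWA operand selectors. A standalone C++ model (not LLVM code, and a simplification of the hardware) of the v_add_u16_sdwa shape with src0_sel:WORD_1/src1_sel:WORD_1 and dst_unused:UNUSED_PAD, and why it pairs with a following v_or_b32:

#include <cstdint>
#include <cassert>

// src0_sel:WORD_1 / src1_sel:WORD_1 pick the high 16-bit words of both
// sources; dst_sel:WORD_1 with dst_unused:UNUSED_PAD writes the sum into
// the high word and zeroes the low word, so a following v_or_b32 can
// merge in the independently computed low half.
static uint32_t addU16SdwaHiPad(uint32_t Src0, uint32_t Src1) {
  uint16_t Sum = uint16_t((Src0 >> 16) + (Src1 >> 16));
  return uint32_t(Sum) << 16;
}

int main() {
  uint32_t Hi = addU16SdwaHiPad(0x00020000u, 0x00030000u);
  assert(Hi == 0x00050000u);
  uint32_t LoSum = 0x00000007u;   // low halves added with v_add_u16_e32
  assert((Hi | LoSum) == 0x00050007u); // v_or_b32 recombines the vector
}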
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index c0bf7ba70a12..abf703017b32 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -1,12 +1,14 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_add_v2i16:
 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
-; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; FIXME: or should be unnecessary
 ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32
 define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -52,21 +54,26 @@ define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <
 ; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 
-; VI: v_add_u32
-; VI: v_add_u32_sdwa
+; VI: s_add_i32
+; VI: s_add_i32
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; VI: s_and_b32
+; VI: s_or_b32
 define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
   %add = add <2 x i16> %a, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
   ret void
 }
 
+; FIXME: Eliminate or with sdwa
 ; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
 
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
 ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -84,7 +91,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
 ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -99,10 +106,9 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)*
 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
 
 ; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
-; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
-; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
+; VI: flat_load_dword [[LOAD:v[0-9]+]]
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]]
 ; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -117,10 +123,11 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)*
 
 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
 
+; VI: flat_load_dword
 ; VI-NOT: v_add_u16
+; VI: v_and_b32_e32 v{{[0-9]+}}, 0xffff0000,
 ; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
 ; VI-NOT: v_add_u16
-; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -139,9 +146,9 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
 
 ; VI-NOT: v_add_u16
 ; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
-; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NOT: v_add_u16
-; VI: v_or_b32_e32
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -162,15 +169,13 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(
 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
 
-; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
-; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_dword v[[A:[0-9]+]]
+; VI: flat_load_dword v[[B:[0-9]+]]
 
-; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
 ; VI-NOT: and
 ; VI-NOT: shl
-; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
+; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
+; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NOT: and
 ; VI-NOT: shl
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
@@ -198,13 +203,11 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX9: buffer_store_dwordx4
 
 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
-; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_dword v[[A:[0-9]+]]
+; VI: flat_load_dword v[[B:[0-9]+]]
 
 ; VI-DAG: v_add_u16_e32
-; VI-DAG: v_add_u16_e32
+; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 
 ; VI: buffer_store_dwordx4
 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
@@ -230,8 +233,9 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
 
+; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI: v_add_u16_e32
-; VI: v_add_u16_e32
+
 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; VI: buffer_store_dwordx2
diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
index f7b25a6fc653..6d3878c8ab26 100644
--- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -8,8 +8,17 @@
 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
 ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
 
-; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: s_load_dword [[LHS:s[0-9]+]]
+; VI: s_load_dword [[RHS:s[0-9]+]]
+; VI: s_ashr_i32
+; VI: s_ashr_i32
+; VI: s_sext_i32_i16
+; VI: s_sext_i32_i16
+; VI: s_ashr_i32
+; VI: s_ashr_i32
+; VI: s_lshl_b32
+; VI: s_and_b32
+; VI: s_or_b32
 
 ; CI-DAG: v_ashrrev_i32_e32
 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index 817430ac4349..9c8f04fdfd98 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -71,10 +71,15 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_store_short
-; SICIVI: buffer_store_short
+; SICI: buffer_load_ushort
+; SICI: buffer_load_ushort
+; SICI: buffer_store_short
+; SICI: buffer_store_short
+
+; VI: s_load_dword s
+; VI: s_load_dword s
+; VI: buffer_store_short
+; VI: buffer_store_short
 
 ; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c
 ; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30
@@ -92,9 +97,16 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
 }
 
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_load_ushort
-; SICIVI: buffer_load_ushort
+; SICI: buffer_load_ushort
+; SICI: buffer_load_ushort
+; SICI: buffer_load_ushort
+
+; SICI: buffer_store_short
+; SICI: buffer_store_short
+; SICI: buffer_store_short
+
+; SICI: buffer_load_ushort
+; SICI: buffer_store_short
 
 ; GFX9-DAG: global_load_short_d16_hi v
 ; GFX9-DAG: global_load_short_d16 v
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index dafabdff39a4..d3e4afc8e830 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
 
 ; DAGCombiner will transform:
 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
@@ -36,16 +36,8 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
 ; CI: v_or_b32_e32
 
-; VI: flat_load_ushort [[HI:v[0-9]+]]
-; VI: flat_load_ushort [[LO:v[0-9]+]]
-; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[HI]], [[MASK]]
-; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]]
-; VI: flat_store_dword
-
-; GFX9: s_load_dword [[VAL:s[0-9]+]]
-; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
+; GFX89: s_load_dword [[VAL:s[0-9]+]]
+; GFX89: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
 define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -59,13 +51,12 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
 ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
 
-; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
-; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; GFX89: s_load_dword s
+; GFX89: s_load_dword s
+; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
+; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
 
 ; GCN: {{flat|global}}_store_dwordx2
 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
@@ -147,9 +138,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
 ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; CI-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 
-; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
+; GFX89-DAG: v_mul_f16_e64 v{{[0-9]+}}, |[[VAL]]|, 4.0
 ; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
-; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX89-DAG: v_add_f16_sdwa v{{[0-9]+}}, |[[VAL]]|, [[CONST2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
@@ -167,11 +158,12 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}v_extract_fabs_no_fold_v2f16:
 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-
-; FIXME: Extra bfe on VI
-; GFX9-NOT: v_bfe_u32
-; VI: v_bfe_u32
 ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff7fff, [[VAL]]
+
+
+; VI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 15
+; VI: flat_store_short
+
 ; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[AND]], off
 define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index be8d52fa9589..7cc556ce168d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -222,12 +222,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
   ret void
 }
 
-; FIXME: Fold modifier
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
-; VI-DAG: v_bfe_u32
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
-; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_max_f16_e64 [[REG1:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}|
 ; VI-NOT: 0xffff
 ; VI: v_or_b32
 
@@ -245,9 +242,8 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
-; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
-; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
 ; VI: v_or_b32
 
 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
@@ -265,9 +261,8 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
-; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
-; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]]
+; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e64 [[REG0:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
 ; VI-NOT: 0xffff
 
 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}}
 
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 937bd74a0fe4..39455acad484 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -94,12 +94,13 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half ad
 ; SI-NEXT: v_max3_f32
 ; SI-NEXT: v_max3_f32
 
-; VI: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
+; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_max_f16_e32 v0, v0, v1
+; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_max_f16_e32 v0, v2, v0
+; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI: v_max_f16_e32 v0, v0, v3
+; VI: v_or_b32_e32 v0, v0, v1
 
 ; GFX9: v_pk_max_f16
 ; GFX9-NEXT: v_pk_max_f16
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index d22333384dcc..06befaa64b5c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -92,12 +92,13 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half ad
 ; SI-NEXT: v_min3_f32
 ; SI-NEXT: v_min3_f32
 
-; VI: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
+; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_min_f16_e32 v0, v0, v1
+; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_min_f16_e32 v0, v2, v0
+; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI: v_min_f16_e32 v0, v0, v3
+; VI: v_or_b32_e32 v0, v0, v1
 
 ; GFX9: v_pk_min_f16
 ; GFX9: v_pk_min_f16
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index b43271c1bd01..a4722876d3f3 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -73,12 +73,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspa
 ; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
 ; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
 ; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
-; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; CIVI: flat_store_dword
+
+; FIXME: Random commute
+; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 
 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
 define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
@@ -95,14 +92,13 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x
 ; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
 ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
-; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
 
-; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; FIXME: Random commute
+; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+
+; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+
 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
 
@@ -120,7 +116,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x h
 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 
-; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|, 4.0
 ; VI: v_mul_f16_sdwa v{{[0-9]+}}, -|v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 
 ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 5649ddfc6e39..b4f8bb98cd78 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -60,7 +60,8 @@ define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspa
   ret void
 }
 
-; FIXME: Terrible code with VI and even worse with SI/CI
+; FIXME: Terrible code with SI/CI.
+; FIXME: scalar for VI, vector for gfx9
 ; GCN-LABEL: {{^}}s_fneg_v2f16:
 ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
 ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
@@ -68,12 +69,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspa
 ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
 ; CI: v_or_b32_e32
 
-; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}}
-; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
+; VI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 
 ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
 define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
   store <2 x half> %fneg, <2 x half> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 0df58519a621..a042700edf85 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; half args should be promoted to float for SI and lower.
@@ -13,13 +13,17 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
   ret void
 }
 
+; FIXME: Should always be the same
 ; GCN-LABEL: {{^}}load_v2f16_arg:
-; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
-; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN: s_endpgm
+; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
+; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
+; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
+; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
+; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; VI: s_load_dword [[ARG:s[0-9]+]]
+; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
+; VI: buffer_store_dword [[V_ARG]]
 define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
   store <2 x half> %arg, <2 x half> addrspace(1)* %out
   ret void
@@ -40,12 +44,18 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
 }
 
 ; GCN-LABEL: {{^}}load_v4f16_arg:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_store_dwordx2
-; GCN: s_endpgm
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_store_dwordx2
+
+; FIXME: Why not one load?
+; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
+; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
+; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
+; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
 define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
   store <4 x half> %arg, <4 x half> addrspace(1)* %out
   ret void
@@ -104,14 +114,20 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+; SI: buffer_load_ushort
+
+
+; VI: s_load_dword s
+; VI: s_load_dword s
+; VI: s_load_dword s
+; VI: s_load_dword s
 
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
@@ -145,8 +161,12 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, hal
 }
 
 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
-; GCN-DAG: buffer_load_ushort v
-; GCN-DAG: buffer_load_ushort v
+; SI-DAG: buffer_load_ushort v
+; SI-DAG: buffer_load_ushort v
+
+; VI-DAG: s_load_dword s
+; VI: s_lshr_b32
+
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f64_f32_e32
@@ -176,10 +196,14 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
-; GCN-DAG: buffer_load_ushort v
-; GCN-DAG: buffer_load_ushort v
-; GCN-DAG: buffer_load_ushort v
-; GCN-DAG: buffer_load_ushort v
+; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v + +; VI: s_load_dword s +; VI: s_load_dword s + ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -196,15 +220,23 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v + +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v + + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s + -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index 4844060feba4..3227633496ad 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; FIXME: Merge into imm.ll ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: @@ -120,11 +120,14 @@ define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -138,11 +141,14 @@ define 
amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -156,11 +162,14 @@ define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -174,11 +183,14 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -192,11 +204,15 @@ define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 
0xbc00 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0xbc00 +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -210,11 +226,14 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -228,11 +247,14 @@ define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -246,11 +268,14 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 
[[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -264,11 +289,14 @@ define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -321,11 +349,14 @@ define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %o ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1{{$}} +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1{{$}} ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -339,11 +370,15 @@ define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2{{$}} +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2{{$}} ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void 
@add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -357,11 +392,15 @@ define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16{{$}} +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16{{$}} ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -375,10 +414,9 @@ define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI: v_or_b32_e32 [[REG:v[0-9]+]] -; VI: v_add_u32_e32 [[REG]], vcc, -1, [[REG]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]] ; VI: buffer_store_dword [[REG]] define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %xbc = bitcast <2 x half> %x to i32 @@ -393,10 +431,9 @@ define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* % ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI: v_or_b32_e32 [[REG:v[0-9]+]] -; VI: v_add_u32_e32 [[REG]], vcc, 0xfffefffe, [[REG]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]] ; VI: buffer_store_dword [[REG]] define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %xbc = bitcast <2 x half> %x to i32 @@ -411,10 +448,10 @@ define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* % ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI: v_or_b32_e32 [[REG:v[0-9]+]] -; VI: v_add_u32_e32 [[REG]], vcc, 0xfff0fff0, [[REG]] + +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]] ; VI: buffer_store_dword [[REG]] define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %xbc = bitcast <2 x half> %x to i32 @@ -429,11 +466,14 @@ define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63 ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]] +; FIXME: Shouldn't need 
right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -447,11 +487,14 @@ define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64 ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll index c1d67ba614c6..ae736f533787 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll @@ -42,16 +42,19 @@ define amdgpu_kernel void @s_input_output_f16() { ret void } -; GCN: error: couldn't allocate output register for constraint 's' -; GCN: error: couldn't allocate input reg for constraint 's' +; CI: error: couldn't allocate output register for constraint 's' +; CI: error: couldn't allocate input reg for constraint 's' + +; VI-NOT: error define amdgpu_kernel void @s_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x half> %v) ret void } -; GCN: error: couldn't allocate output register for constraint 'v' -; GCN: error: couldn't allocate input reg for constraint 'v' +; CI: error: couldn't allocate output register for constraint 'v' +; CI: error: couldn't allocate input reg for constraint 'v' +; VI-NOT: error define amdgpu_kernel void @v_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"() tail call void asm sideeffect "; use $0", "v"(<2 x half> %v) @@ -67,8 +70,12 @@ define amdgpu_kernel void @s_input_output_i16() { ret void } -; GCN: error: couldn't allocate output register for constraint 's' -; GCN: error: couldn't allocate input reg for constraint 's' +; FIXME: Should work on all targets? 
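+; With <2 x i16> legal on VI, the packed value fits in a single 32-bit
+; register, so the 's' constraint below becomes allocatable and only CI
+; should still diagnose. A minimal sketch of the same pattern with the 'v'
+; constraint (illustrative only, not part of this patch):
+;   %v = tail call <2 x i16> asm sideeffect "v_mov_b32 $0, -1", "=v"()
+;   tail call void asm sideeffect "; use $0", "v"(<2 x i16> %v)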
+ +; CI: error: couldn't allocate output register for constraint 's' +; CI: error: couldn't allocate input reg for constraint 's' + +; VI-NOT: error define amdgpu_kernel void @s_input_output_v2i16() { %v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 337cac2fc229..4f075f502585 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-NO-TONGA %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-TONGA %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s ; FIXME: Broken on evergreen ; FIXME: For some reason the 8 and 16 vectors are being stored as diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 8bf55a4544c0..cb2753bcc083 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s ; GCN-LABEL: {{^}}s_insertelement_v2i16_0: ; GCN: s_load_dword [[VEC:s[0-9]+]] @@ -39,11 +39,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* % ; GCN: s_load_dword [[ELT0:s[0-9]+]] ; GCN: s_load_dword [[VEC:s[0-9]+]] -; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} -; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 -; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 -; 
CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] -; CIVI-DAG: ; use [[SHR]] +; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 +; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; CI-DAG: ; use [[SHR]] + + +; FIXME: Should be able to avoid mask of upper bits +; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}} +; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]] +; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 + +; VI-DAG: ; use [[SHR]] + ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] @@ -103,10 +113,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] ; GCN: s_load_dword [[VEC:s[0-9]+]], -; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 -; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 -; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16 -; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] +; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; CI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; CI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16 +; CI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] + + +; VI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; VI-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16 +; VI: s_and_b32 [[MASK_HI:s[0-9]+]], [[VEC]], 0xffff0000 +; VI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[MASK_HI]] ; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 ; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index c307df8a4441..8e9abb9de8b6 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -1,8 +1,8 @@ -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC -; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s ; FUNC-LABEL: {{^}}i8_arg: ; HSA-VI: kernarg_segment_alignment = 4 @@ -162,10 +162,11 @@ entry: ; HSA-VI: kernarg_segment_alignment = 4 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort + +; 
SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(1)* %out @@ -285,14 +286,14 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-GCN: flat_load_ushort -; HSA-GCN: flat_load_ushort -; HSA-GCN: flat_load_ushort -; HSA-GCN: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s +; VI: s_load_dword s define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out @@ -305,6 +306,7 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X + ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 @@ -370,22 +372,20 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out @@ -502,38 +502,32 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: 
buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll index 76dd2fe6e532..2d793c0bd84f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s @@ -13,9 +13,12 @@ main_body: ; GCN-LABEL: {{^}}buffer_store_format_d16_xy: -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { @@ -26,17 +29,27 @@ main_body: ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; +; UNPACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; UNPACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] + ; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] + +; PACKED-DAG: s_load_dword 
[[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; PACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38 + +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]] ; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll index acc7f14f5fa8..b14430e4659c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}image_load_f16 ; GCN: image_load v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 @@ -58,11 +58,17 @@ main_body: ret void } -; GCN-LABEL: {{^}}image_store_v2f16 +; FIXME: Eliminate and to get low bits +; GCN-LABEL: {{^}}image_store_v2f16: +; UNPACKED: s_load_dword [[DATA:s[0-9]+]] +; UNPACKED-DAG: s_lshr_b32 [[UNPACK_1:s[0-9]+]], [[DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[UNPACK_0:s[0-9]+]], [[DATA]], 0xffff +; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_0:[0-9]+]], [[UNPACK_0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_1:[0-9]+]], [[UNPACK_1]] -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 + + +; UNPACKED: image_store v{{\[}}[[V_UNPACK_0]]:[[V_UNPACK_1]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 ; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 define amdgpu_kernel void @image_store_v2f16(<2 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { @@ -72,20 +78,19 @@ main_body: } ; GCN-LABEL: {{^}}image_store_v4f16 +; UNPACKED: s_load_dword s +; UNPACKED: s_load_dword s +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_and_b32 +; UNPACKED: s_and_b32 +; UNPACKED: image_store v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] 
dmask:0xf unorm d16 - -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] - -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] - -; PACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; PACKED: s_load_dword [[DATA0:s[0-9]+]] +; PACKED: s_load_dword [[DATA1:s[0-9]+]] +; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]] +; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]] +; PACKED: image_store v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: call void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } @@ -93,20 +98,19 @@ main_body: } ; GCN-LABEL: {{^}}image_store_mip_v4f16 +; UNPACKED: s_load_dword s +; UNPACKED: s_load_dword s +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_and_b32 +; UNPACKED: s_and_b32 +; UNPACKED: image_store_mip v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 - -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] - -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] - -; PACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; PACKED: s_load_dword [[DATA0:s[0-9]+]] +; PACKED: s_load_dword [[DATA1:s[0-9]+]] +; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]] +; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]] +; PACKED: image_store_mip v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: call void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index 8234e2c3993d..671a5a6f05a7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}load_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll index bbe21ea34a78..517c0a90650e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck 
-enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s @@ -12,12 +12,13 @@ main_body: ret void } - ; GCN-LABEL: {{^}}tbuffer_store_d16_xy: - -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen ; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { @@ -26,21 +27,23 @@ main_body: ret void } - ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: +; GCN-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38 -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] ; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] - -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]] ; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll index bfcce66ac1d0..f59741426ba8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -145,8 +145,12 @@ define amdgpu_kernel void @fma_v2f16( } ; GCN-LABEL: {{^}}fma_v2f16_imm_a: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: buffer_load_dword 
v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] + +; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] + ; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} @@ -185,8 +189,8 @@ define amdgpu_kernel void @fma_v2f16_imm_a( ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} @@ -228,8 +232,8 @@ define amdgpu_kernel void @fma_v2f16_imm_b( ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 931303a7c9a6..350ecedb80d6 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s ; GCN-LABEL: {{^}}s_lshr_v2i16: ; GFX9: s_load_dword [[LHS:s[0-9]+]] @@ -8,11 +8,20 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; VI: s_load_dword [[LHS:s[0-9]+]] +; VI: s_load_dword [[RHS:s[0-9]+]] +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 +; VI-DAG: s_lshl_b32 +; VI: v_or_b32_e32 + ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 -; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 0c1276135806..b692b2226c66 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ 
-117,8 +117,10 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 ; SI: v_min_i32 ; SI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 +; VI: s_sext_i32_i16 +; VI: s_sext_i32_i16 +; VI: s_min_i32 +; VI: s_min_i32 ; GFX9: v_pk_min_i16 @@ -131,17 +133,16 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, < ret void } -; FIXME: VI use s_min_i32 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 ; GFX9: v_pk_min_i16 ; GFX9: v_pk_min_i16 @@ -461,14 +462,14 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, < ; SI: v_min_u32 ; SI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 ; EG: MIN_UINT ; EG: MIN_UINT diff --git a/llvm/test/CodeGen/AMDGPU/reduction.ll b/llvm/test/CodeGen/AMDGPU/reduction.ll index 445526ec89d0..621d83b731e4 100644 --- a/llvm/test/CodeGen/AMDGPU/reduction.ll +++ b/llvm/test/CodeGen/AMDGPU/reduction.ll @@ -5,7 +5,7 @@ ; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_f16_e32 +; VI: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 define half @reduction_half4(<4 x half> %vec4) { @@ -22,7 +22,7 @@ entry: ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_u16_e32 +; VI: v_add_u16_sdwa ; VI-NEXT: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 define i16 @reduction_v4i16(<4 x i16> %vec4) { @@ -41,8 +41,8 @@ entry: ; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 +; VI: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 @@ -67,8 +67,8 @@ entry: ; GFX9-NEXT: v_pk_add_u16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_u16_e32 -; VI-NEXT: v_add_u16_e32 +; VI: v_add_u16_sdwa +; VI-NEXT: v_add_u16_sdwa ; VI-NEXT: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 @@ -97,10 +97,10 @@ entry: ; GFX9-NEXT: v_pk_add_f16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 +; VI: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 @@ -131,7 +131,7 @@ entry: ; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: 
v_min_u16_e32 +; VI: v_min_u16_sdwa ; VI-NEXT: v_min_u16_e32 ; VI-NEXT: v_min_u16_e32 define i16 @reduction_min_v4i16(<4 x i16> %vec4) { @@ -152,8 +152,8 @@ entry: ; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}} ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_u16_e32 -; VI-NEXT: v_min_u16_e32 +; VI: v_min_u16_sdwa +; VI-NEXT: v_min_u16_sdwa ; VI-NEXT: v_min_u16_e32 ; VI-NEXT: v_min_u16_e32 ; VI-NEXT: v_min_u16_e32 @@ -224,10 +224,10 @@ entry: ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_i16_e32 -; VI-NEXT: v_min_i16_e32 -; VI-NEXT: v_min_i16_e32 -; VI-NEXT: v_min_i16_e32 +; VI: v_min_i16_sdwa +; VI-NEXT: v_min_i16_sdwa +; VI-NEXT: v_min_i16_sdwa +; VI-NEXT: v_min_i16_sdwa ; VI-NEXT: v_min_i16_e32 ; VI-NEXT: v_min_i16_e32 ; VI-NEXT: v_min_i16_e32 @@ -339,7 +339,7 @@ entry: ; GFX9: v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_u16_e32 +; VI: v_max_u16_sdwa ; VI-NEXT: v_max_u16_e32 ; VI-NEXT: v_max_u16_e32 define i16 @reduction_umax_v4i16(<4 x i16> %vec4) { @@ -358,7 +358,7 @@ entry: ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_i16_e32 +; VI: v_max_i16_sdwa ; VI-NEXT: v_max_i16_e32 ; VI-NEXT: v_max_i16_e32 define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 { @@ -377,7 +377,7 @@ entry: ; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_f16_e32 +; VI: v_max_f16_sdwa ; VI-NEXT: v_max_f16_e32 ; VI-NEXT: v_max_f16_e32 define half @reduction_fmax_v4half(<4 x half> %vec4) { @@ -396,7 +396,7 @@ entry: ; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_f16_e32 +; VI: v_min_f16_sdwa ; VI-NEXT: v_min_f16_e32 ; VI-NEXT: v_min_f16_e32 define half @reduction_fmin_v4half(<4 x half> %vec4) { @@ -409,4 +409,4 @@ entry: %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0 ret half %res -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 47e6545d0a07..3c92e8e5cba3 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals 
-verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDWA,GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll index e5c8191b24b9..1e34036e6bd3 100644 --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; Test expansion of scalar selects on vectors. ; Evergreen not enabled since it seems to be having problems with doubles. 
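; With packed 16-bit vectors legal on VI and GFX9 (GFX89), a <2 x i16> select
; is performed on the 32-bit bitcast value, so a single v_cndmask_b32 covers
; both elements. Conceptually (a sketch, not part of this patch):
;   %a32 = bitcast <2 x i16> %a to i32
;   %b32 = bitcast <2 x i16> %b to i32
;   %r32 = select i1 %cmp, i32 %a32, i32 %b32   ; one v_cndmask_b32
;   %r = bitcast i32 %r32 to <2 x i16>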
@@ -76,8 +76,14 @@ define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a,
 }
 
 ; GCN-LABEL: {{^}}select_v2i16:
-; GCN: v_cndmask_b32_e32
-; GCN-NOT: v_cndmask_b32
+; GFX89: s_load_dword
+; GFX89: s_load_dword
+; GFX89: s_load_dword
+; GFX89: v_cndmask_b32
+; GFX89-NOT: v_cndmask_b32
+
+; SI: v_cndmask_b32_e32
+; SI-NOT: v_cndmask_b32
 define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
@@ -86,7 +92,9 @@ define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
 }
 
 ; GCN-LABEL: {{^}}v_select_v2i16:
-; GCN: v_cndmask_b32_e32
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
+; GCN: v_cndmask_b32
 ; GCN-NOT: cndmask
 define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
   %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
@@ -330,7 +338,7 @@ define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x do
 }
 
 ; GCN-LABEL: {{^}}v_select_v2f16:
-; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32
 ; GCN-NOT: cndmask
 define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
   %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index c3e71e27d2b3..260aac8d159d 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
 
 ; GCN-LABEL: {{^}}s_shl_v2i16:
 ; GFX9: s_load_dword [[LHS:s[0-9]+]]
@@ -8,9 +8,14 @@
 ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
 ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
 
-; VI: v_lshlrev_b32_e32
-; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: s_load_dword s
+; VI: s_load_dword s
+; VI: s_lshr_b32
+; VI: s_lshr_b32
+; VI: s_and_b32
+; VI: s_and_b32
+; VI: s_and_b32
+; VI: s_or_b32
 
 ; CI-DAG: v_lshlrev_b32_e32
 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index c80945f390be..429493c85fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; GCN-LABEL: {{^}}s_sext_i1_to_i32:
 ; GCN: v_cndmask_b32_e64
@@ -177,10 +177,15 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
   ret void
 }
 
-; FIXME: s_bfe_i64
+; FIXME: s_bfe_i64, same on SI and VI
 ; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
-; GCN-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
-; GCN-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; SI-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
+; SI-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+
+; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+
+
 ; GCN-DAG: s_sext_i32_i16
 ; GCN-DAG: s_sext_i32_i16
 ; GCN: s_endpgm
@@ -199,8 +204,6 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
 }
 
 ; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
-; SI-DAG: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 48
-; VI-DAG: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 48, v{{\[[0-9]+:[0-9]+\]}}
 ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 44f3cb19dc94..eb02084d8eb8 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,GCN %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI,CIVI,GCN %s
 
 ; GCN-LABEL: {{^}}s_abs_v2i16:
 ; GFX9: s_load_dword [[VAL:s[0-9]+]]
@@ -8,13 +8,15 @@
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
 
-; VI: v_sub_u32_e32
-; VI-DAG: v_sub_u32_e32
-; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI: v_add_u32_e32
-; VI: v_add_u32_e32
-; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; VI: s_sub_i32
+; VI: s_sub_i32
+; VI: s_max_i32
+; VI: s_max_i32
+; VI: s_add_i32
+; VI: s_add_i32
+; VI: s_and_b32
+; VI: s_or_b32
 
 ; CI: v_sub_i32_e32
 ; CI-DAG: v_sub_i32_e32
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index a608ef715c53..cee8d3eb6159 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -1,12 +1,15 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,GCN %s
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_v2i16:
+; GFX89: {{flat|global}}_load_dword
+; GFX89: {{flat|global}}_load_dword
+
 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -47,10 +50,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <
 }
 
 ; FIXME: VI should not scalarize arg access.
 ; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
+; GCN: s_load_dword s
+; GCN: s_load_dword s
+
 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_subrev_u32_e32
-; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: s_sub_i32
+; VI: s_sub_i32
+; VI: s_lshl_b32
+; VI: s_and_b32
 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
   %add = sub <2 x i16> %a, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -58,12 +66,15 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out
 }
 
 ; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
-; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
+; GFX89-DAG: {{flat|global}}_load_dword
+
+; GFX9-DAG: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
 
 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_or_b32
 define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -95,11 +106,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
 
 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
-; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
-; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
-; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
-; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
+; VI-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD]]
 ; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -114,11 +124,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
 
 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
-; VI-NOT: v_subrev_i16
-; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
-; VI-NOT: v_subrev_i16
-; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; VI: v_or_b32_e32
+; VI: flat_load_dword [[LOAD:v[0-9]+]]
+; VI-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, [[LOAD]]
+; VI-DAG: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffe0, [[LOAD]]
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[ADD]], [[AND]]
 define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -136,9 +145,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
 
 ; VI-NOT: v_subrev_i16
 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
-; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: flat_load_dword
+; VI: v_add_u16_sdwa [[ADD:v[0-9]+]], v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NOT: v_subrev_i16
-; VI: v_or_b32_e32
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -159,19 +169,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
 
 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
 
-; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_dword v[[A:[0-9]+]]
+; VI: flat_load_dword v[[B:[0-9]+]]
 
-; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
-
-; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
-; VI-NOT: and
-; VI-NOT: shl
-; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
-; VI-NOT: and
-; VI-NOT: shl
-; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
+; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
+; VI-NEXT: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
@@ -196,14 +199,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 
 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
 ; GFX9: buffer_store_dwordx4
 
-; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
-; VI: flat_load_ushort v[[B_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
-
-; VI: v_sub_u16_e32
-; VI: v_sub_u16_e32
-
+; VI: flat_load_dword [[A:v[0-9]+]]
+; VI: flat_load_dword [[B:v[0-9]+]]
+; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], [[A]], [[B]]
+; VI: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], [[A]], [[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI: buffer_store_dwordx4
 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -228,8 +227,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
 
 ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
 ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
 
-; VI: v_sub_u16_e32
-; VI: v_sub_u16_e32
+; VI: flat_load_dword
+; VI: flat_load_dword
+; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI: buffer_store_dwordx2
 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
index 53e306270ac5..14e2cde0e295 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
@@ -1,18 +1,15 @@
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s
 
 ; FIXME: Should still like to vectorize the memory operations for VI
 
 ; Simple 3-pair chain with loads and stores
 ; GCN-LABEL: @test1_as_3_3_3_v2f16(
-; GFX9: load <2 x half>, <2 x half> addrspace(3)*
-; GFX9: load <2 x half>, <2 x half> addrspace(3)*
-; GFX9: fmul <2 x half>
-; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
-; GFX9: ret
-
-; VI: load half
-; VI: load half
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX89: ret
 define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
   %i0 = load half, half addrspace(3)* %a, align 2
   %i1 = load half, half addrspace(3)* %b, align 2
@@ -29,14 +26,11 @@ define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addr
 }
 
 ; GCN-LABEL: @test1_as_3_0_0(
-; GFX9: load <2 x half>, <2 x half> addrspace(3)*
-; GFX9: load <2 x half>, <2 x half>*
-; GFX9: fmul <2 x half>
-; GFX9: store <2 x half> %{{.*}}, <2 x half>* %
-; GFX9: ret
-
-; VI: load half
-; VI: load half
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
+; GFX89: ret
 define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
   %i0 = load half, half addrspace(3)* %a, align 2
   %i1 = load half, half* %b, align 2
@@ -53,14 +47,11 @@ define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half*
 }
 
 ; GCN-LABEL: @test1_as_0_0_3_v2f16(
-; GFX9: load <2 x half>, <2 x half>*
-; GFX9: load <2 x half>, <2 x half>*
-; GFX9: fmul <2 x half>
-; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
-; GFX9: ret
-
-; VI: load half
-; VI: load half
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX89: ret
 define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
   %i0 = load half, half* %a, align 2
   %i1 = load half, half* %b, align 2