diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index e1f273ff71db..55d6d365fbb4 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -306,18 +306,6 @@ m_GAShr(const LHS &L, const RHS &R) {
   return BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>(L, R);
 }
 
-template <typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>
-m_GSMax(const LHS &L, const RHS &R) {
-  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>(L, R);
-}
-
-template <typename LHS, typename RHS>
-inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>
-m_GSMin(const LHS &L, const RHS &R) {
-  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>(L, R);
-}
-
 // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
 template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
   SrcTy L;
@@ -480,13 +468,6 @@ m_GInsertVecElt(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
       TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
 }
 
-template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
-inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>
-m_GISelect(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
-  return TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>(
-      Src0, Src1, Src2);
-}
-
 /// Matches a register negated by a G_SUB.
 /// G_SUB 0, %negated_reg
 template <typename SrcTy>
@@ -503,7 +484,7 @@ m_Not(const SrcTy &&Src) {
   return m_GXor(Src, m_AllOnesInt());
 }
 
-} // namespace MIPatternMatch
+} // namespace GMIPatternMatch
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b6a6fb3e77db..a8399176bb4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,21 +37,13 @@ def cvt_f32_ubyteN : GICombineRule<
     [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
   (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
-def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
-
-def clamp_i64_to_i16 : GICombineRule<
-  (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
-  (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
-      [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
-  (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
-
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
+
 def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
-  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
+  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
   let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
-  let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
 }
 
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 76406f318490..bba03736d01a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -174,9 +174,6 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>;
 def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
 def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
 
-def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUcvt_pk_i16_i32>;
-def : GINodeEquiv<G_AMDGPU_MED3, AMDGPUsmed3>;
-
 def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
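Context for the removals above: m_GSMin and m_GSMax were thin wrappers over BinaryOp_match, and the pre-legalizer combiner (removed further below) composed them to recognize a signed clamp written as smin(smax(x, lo), hi). A minimal sketch of that composition against the pre-revert headers; matchSignedClamp is an illustrative name, not an upstream helper:

    #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    using namespace llvm;
    using namespace MIPatternMatch;

    // True iff Reg is defined by G_SMIN(G_SMAX(Origin, Lo), Hi); binds
    // Origin and both constant bounds on success.
    static bool matchSignedClamp(Register Reg, const MachineRegisterInfo &MRI,
                                 Register &Origin, int64_t &Lo, int64_t &Hi) {
      Register Base;
      return mi_match(Reg, MRI, m_GSMin(m_Reg(Base), m_ICst(Hi))) &&
             mi_match(Base, MRI, m_GSMax(m_Reg(Origin), m_ICst(Lo)));
    }
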
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c0cb1781abe3..894677ec68b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -213,8 +213,6 @@ def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
                            SDTIntToFPOp, []>;
 def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
                            SDTIntToFPOp, []>;
 
-def AMDGPUcvt_pk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32",
-                           AMDGPUIntPackOp, []>;
 // urecip - This operation is a helper for integer division, it returns the
 // result of 1 / a as a fractional unsigned integer.
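The deleted SDNode models the VALU instruction v_cvt_pk_i16_i32, which converts two i32 operands to i16 with signed saturation and packs the results into one 32-bit register. That saturating-pack behavior is the property the clamp combine relied on. A scalar model in plain C++ (helper names are illustrative; the saturation semantics stated here are an assumption about the hardware op, not quoted from the patch):

    #include <algorithm>
    #include <cstdint>

    // Saturate a 32-bit value to the signed 16-bit range.
    static uint16_t satI16(int32_t V) {
      return (uint16_t)(int16_t)std::min(32767, std::max(-32768, V));
    }

    // Pack two saturated halves; the first operand fills the low 16 bits.
    static uint32_t cvtPkI16I32(int32_t Lo, int32_t Hi) {
      return satI16(Lo) | ((uint32_t)satI16(Hi) << 16);
    }
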
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index e018628ae8cc..e4b628bf6b23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -12,9 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "AMDGPULegalizerInfo.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -29,134 +26,6 @@
 using namespace llvm;
 using namespace MIPatternMatch;
 
-class AMDGPUPreLegalizerCombinerHelper {
-protected:
-  MachineIRBuilder &B;
-  MachineFunction &MF;
-  MachineRegisterInfo &MRI;
-  CombinerHelper &Helper;
-
-public:
-  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
-      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
-
-  struct ClampI64ToI16MatchInfo {
-    int64_t Cmp1;
-    int64_t Cmp2;
-    Register Origin;
-  };
-
-  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
-                          MachineFunction &MF,
-                          ClampI64ToI16MatchInfo &MatchInfo);
-
-  void applyClampI64ToI16(MachineInstr &MI,
-                          const ClampI64ToI16MatchInfo &MatchInfo);
-};
-
-bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
-    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
-    ClampI64ToI16MatchInfo &MatchInfo) {
-  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
-
-  // Try to find a pattern where an i64 value should get clamped to short.
-  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
-  if (SrcType != LLT::scalar(64))
-    return false;
-
-  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
-  if (DstType != LLT::scalar(16))
-    return false;
-
-  Register Base;
-
-  // Try to match a combination of min / max MIR opcodes.
-  if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
-    if (!mi_match(Base, MRI, m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
-      return false;
-    }
-  }
-
-  if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
-    if (!mi_match(Base, MRI, m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
-      return false;
-    }
-  }
-
-  const auto Cmp1 = MatchInfo.Cmp1;
-  const auto Cmp2 = MatchInfo.Cmp2;
-  const auto Diff = std::abs(Cmp2 - Cmp1);
-
-  // If the difference between both comparison values is 0 or 1, there is no
-  // need to clamp.
-  if (Diff == 0 || Diff == 1)
-    return false;
-
-  const int64_t Min = std::numeric_limits<int16_t>::min();
-  const int64_t Max = std::numeric_limits<int16_t>::max();
-
-  // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
-  return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
-          (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
-}
-
-// We want to find a combination of instructions that
-// gets generated when an i64 gets clamped to i16.
-// The corresponding pattern is:
-// G_MAX / G_MAX for i16 <= G_TRUNC i64.
-// This can be efficiently written as following:
-// v_cvt_pk_i16_i32 v0, v0, v1
-// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
-void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
-    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
-
-  Register Src = MatchInfo.Origin;
-  assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
-         LLT::scalar(64));
-  const LLT S32 = LLT::scalar(32);
-
-  B.setMBB(*MI.getParent());
-  B.setInstrAndDebugLoc(MI);
-
-  auto Unmerge = B.buildUnmerge(S32, Src);
-
-  assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
-
-  const LLT V2S16 = LLT::vector(2, 16);
-  auto CvtPk = B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32,
-                            {V2S16},
-                            {Unmerge.getReg(0), Unmerge.getReg(1)},
-                            MI.getFlags());
-
-  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
-  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
-  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
-  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
-
-  auto Bitcast = B.buildBitcast({S32}, CvtPk);
-
-  auto Med3 = B.buildInstr(AMDGPU::G_AMDGPU_MED3,
-                           {S32},
-                           {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
-                           MI.getFlags());
-
-  B.buildTrunc(MI.getOperand(0).getReg(), Med3);
-
-  MI.eraseFromParent();
-}
-
-class AMDGPUPreLegalizerCombinerHelperState {
-protected:
-  CombinerHelper &Helper;
-  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
-
-public:
-  AMDGPUPreLegalizerCombinerHelperState(
-      CombinerHelper &Helper,
-      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
-      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
-};
-
 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AMDGPUGenPreLegalizeGICombiner.inc"
 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
@@ -190,9 +59,7 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
   CombinerHelper Helper(Observer, B, KB, MDT);
-  AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
-  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
-                                                PreLegalizerHelper);
+  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
 
   if (Generated.tryCombineAll(Observer, MI, B, Helper))
     return true;
@@ -258,7 +125,6 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
   const Function &F = MF.getFunction();
   bool EnableOpt =
       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
-  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
   MachineDominatorTree *MDT =
       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
 
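The guard at the end of the removed matchClampI64ToI16 is a pure value-range test: the combine only fired when both comparison constants fit in i16 and describe a range wider than one value. Restated as a standalone predicate (a sketch mirroring the return expression above; isShortClamp is an illustrative name):

    #include <cstdint>
    #include <cstdlib>
    #include <limits>

    static bool isShortClamp(int64_t Cmp1, int64_t Cmp2) {
      const int64_t Min = std::numeric_limits<int16_t>::min();
      const int64_t Max = std::numeric_limits<int16_t>::max();
      if (std::abs(Cmp2 - Cmp1) <= 1) // a range of 0 or 1 needs no clamp
        return false;
      return (Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
             (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min);
    }
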
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c9cca1e1beb8..502356d4f9a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3621,8 +3621,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
-  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
-  case AMDGPU::G_AMDGPU_MED3:
     return getDefaultMappingVOP(MI);
   case AMDGPU::G_UMULH:
   case AMDGPU::G_SMULH: {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 92c0d196de22..ecb875debefd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2575,18 +2575,6 @@ def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
   }
 }
 
-def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
-  let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src0, type0:$src1);
-  let hasSideEffects = 0;
-}
-
-def G_AMDGPU_MED3 : AMDGPUGenericInstruction {
-  let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
-  let hasSideEffects = 0;
-}
-
 // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
 // operand Expects a MachineMemOperand in addition to explicit
 // operands.
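G_AMDGPU_MED3, whose definition is deleted just above, models v_med3_i32: the median of three operands. For lo <= hi, med3(lo, x, hi) equals clamp(x, lo, hi), which is the identity that let a single VALU instruction replace the 64-bit min/max pair. A scalar sketch of the identity:

    #include <algorithm>
    #include <cstdint>

    // Median of three; with A <= C this is exactly clamp(B, A, C).
    static int32_t med3I32(int32_t A, int32_t B, int32_t C) {
      return std::max(std::min(A, B), std::min(std::max(A, B), C));
    }
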
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
deleted file mode 100644
index 7d74c60a9e49..000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
+++ /dev/null
@@ -1,112 +0,0 @@
-; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX678,GFX6789 %s
-; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX6789 %s
-; RUN: llc -global-isel -mcpu=gfx1010 -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-
-declare i64 @llvm.smax.i64(i64, i64)
-declare i64 @llvm.smin.i64(i64, i64)
-
-; GFX10-LABEL: {{^}}v_clamp_i64_i16
-; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
-; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
-; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
-; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX10: v_mov_b32_e32 [[B]], 0x7fff
-; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
-define i16 @v_clamp_i64_i16(i64 %in) #0 {
-entry:
-  %max = call i64 @llvm.smax.i64(i64 %in, i64 -32768)
-  %min = call i64 @llvm.smin.i64(i64 %max, i64 32767)
-  %result = trunc i64 %min to i16
-  ret i16 %result
-}
-
-; GFX10-LABEL: {{^}}v_clamp_i64_i16_reverse
-; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
-; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
-; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
-; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX10: v_mov_b32_e32 [[B]], 0x7fff
-; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
-define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 {
-entry:
-  %min = call i64 @llvm.smin.i64(i64 %in, i64 32767)
-  %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
-  %result = trunc i64 %max to i16
-  ret i16 %result
-}
-
-; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower
-; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
-; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
-; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
-
-; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
-; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
-define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
-entry:
-  %min = call i64 @llvm.smin.i64(i64 %in, i64 32769)
-  %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
-  %result = trunc i64 %max to i16
-  ret i16 %result
-}
-
-; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher
-; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
-; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
-; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
-define i16 @v_clamp_i64_i16_invalid_lower_and_higher(i64 %in) #0 {
-entry:
-  %max = call i64 @llvm.smax.i64(i64 %in, i64 -32769)
-  %min = call i64 @llvm.smin.i64(i64 %max, i64 32768)
-  %result = trunc i64 %min to i16
-  ret i16 %result
-}
-
-; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short
-; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
-; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
-; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
-; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX10: v_mov_b32_e32 [[B]], 0x100
-; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
-define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 {
-entry:
-  %min = call i64 @llvm.smin.i64(i64 %in, i64 256)
-  %max = call i64 @llvm.smax.i64(i64 %min, i64 -255)
-  %result = trunc i64 %max to i16
-  ret i16 %result
-}
-
-; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse
-; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
-; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
-; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
-; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX10: v_mov_b32_e32 [[B]], 0x100
-; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
-define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 {
-entry:
-  %max = call i64 @llvm.smax.i64(i64 %in, i64 -255)
-  %min = call i64 @llvm.smin.i64(i64 %max, i64 256)
-  %result = trunc i64 %min to i16
-  ret i16 %result
-}
-
-; GFX10-LABEL: {{^}}v_clamp_i64_i16_zero
-; GFX6789: v_mov_b32_e32 v0, 0
-; GFX10: v_mov_b32_e32 v0, 0
-define i16 @v_clamp_i64_i16_zero(i64 %in) #0 {
-entry:
-  %max = call i64 @llvm.smax.i64(i64 %in, i64 0)
-  %min = call i64 @llvm.smin.i64(i64 %max, i64 0)
-  %result = trunc i64 %min to i16
-  ret i16 %result
-}
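The first two tests above pin down the invariant the combine had to preserve: packing the unmerged i64 halves with saturation and taking med3 against the bounds must agree with the original smax/smin pair. Under the scalar models sketched earlier, restated here so the check is self-contained, that can be verified directly (an assumption-level illustration, not part of the LLVM tree):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static uint16_t satI16(int32_t V) {
      return (uint16_t)(int16_t)std::min(32767, std::max(-32768, V));
    }
    static uint32_t cvtPkI16I32(int32_t Lo, int32_t Hi) {
      return satI16(Lo) | ((uint32_t)satI16(Hi) << 16);
    }
    static int32_t med3I32(int32_t A, int32_t B, int32_t C) {
      return std::max(std::min(A, B), std::min(std::max(A, B), C));
    }

    int main() {
      // @v_clamp_i64_i16 with %in = 100000: clamp to [-32768, 32767] -> 32767.
      int64_t In = 100000;
      uint32_t Pk = cvtPkI16I32((int32_t)In, (int32_t)(In >> 32));
      assert((int16_t)med3I32(-32768, (int32_t)Pk, 32767) == 32767);
      // A negative input saturates to the lower bound instead.
      In = -100000;
      Pk = cvtPkI16I32((int32_t)In, (int32_t)(In >> 32));
      assert((int16_t)med3I32(-32768, (int32_t)Pk, 32767) == -32768);
      return 0;
    }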