forked from OSchip/llvm-project
Added clamp i64 to i16 global isel pattern.
This commit is contained in:
parent
e7f9a83499
commit
62af0305b7
|
@ -468,6 +468,14 @@ m_GInsertVecElt(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
|
|||
TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
|
||||
}
|
||||
|
||||
template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
|
||||
inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty,
|
||||
TargetOpcode::G_SELECT>
|
||||
m_GISelect(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
|
||||
return TernaryOp_match<Src0Ty, Src1Ty, Src2Ty,
|
||||
TargetOpcode::G_SELECT>(Src0, Src1, Src2);
|
||||
}
|
||||
|
||||
/// Matches a register negated by a G_SUB.
|
||||
/// G_SUB 0, %negated_reg
|
||||
template <typename SrcTy>
|
||||
|
@ -484,6 +492,50 @@ m_Not(const SrcTy &&Src) {
|
|||
return m_GXor(Src, m_AllOnesInt());
|
||||
}
|
||||
|
||||
// class that allows to match one of the following patterns:
|
||||
// select (pred, x, value1) -> cmp slt -> select (pred, origin, value2) ->
|
||||
// cmp sgt OR select (pred, x, value1) -> cmp sgt -> select (pred, origin,
|
||||
// value2) -> cmp slt
|
||||
// also binds the boundary values and the origin.
|
||||
template <typename Boundary1,
|
||||
typename Boundary2, typename Origin>
|
||||
struct MaxMin_match_helper {
|
||||
Boundary1 B1;
|
||||
Boundary2 B2;
|
||||
Origin O;
|
||||
|
||||
MaxMin_match_helper(const Boundary1 &FirstBoundary,
|
||||
const Boundary2 &SecondBoundary, const Origin &Or)
|
||||
: B1(FirstBoundary), B2(SecondBoundary), O(Or) {}
|
||||
|
||||
template <typename OpTy>
|
||||
bool match(const MachineRegisterInfo &MRI, OpTy &&Op) {
|
||||
CmpInst::Predicate Predicate1;
|
||||
Register Base;
|
||||
|
||||
if (mi_match(Op, MRI,
|
||||
m_GISelect(m_GICmp(m_Pred(Predicate1), m_Reg(), m_Reg()),
|
||||
m_Reg(Base), B1))) {
|
||||
CmpInst::Predicate Predicate2;
|
||||
|
||||
if (mi_match(Base, MRI, m_GISelect(m_GICmp(m_Pred(Predicate2), m_Reg(), m_Reg()), O, B2))) {
|
||||
if ((Predicate1 == CmpInst::ICMP_SLT && Predicate2 == CmpInst::ICMP_SGT) ||
|
||||
(Predicate1 == CmpInst::ICMP_SGT && Predicate2 == CmpInst::ICMP_SLT)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Boundary1, typename Boundary2, typename Origin>
|
||||
inline MaxMin_match_helper<Boundary1, Boundary2, Origin>
|
||||
m_MaxMin(const Boundary1 &B1, const Boundary2 &B2, const Origin &O) {
|
||||
return MaxMin_match_helper<Boundary1, Boundary2, Origin>(B1, B2, O);
|
||||
}
|
||||
|
||||
} // namespace GMIPatternMatch
|
||||
} // namespace llvm
|
||||
|
||||
|
|
|
@ -37,6 +37,14 @@ def cvt_f32_ubyteN : GICombineRule<
|
|||
[{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
|
||||
(apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
|
||||
|
||||
// Match data type shared between the match and apply steps of the
// clamp_i64_to_i16 rule below.
def clamp_i64_to_i16_matchdata
    : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;

// Rooted at every G_TRUNC; the C++ matcher decides whether the truncated value
// is a clamp that the apply step can rewrite.
def clamp_i64_to_i16 : GICombineRule<
  (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
  (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
         [{ return PostLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
  (apply [{ PostLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
|
||||
|
||||
// Combines which should only apply on SI/VI
|
||||
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
|
||||
|
||||
|
@ -49,7 +57,7 @@ def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
|
|||
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
|
||||
"AMDGPUGenPostLegalizerCombinerHelper",
|
||||
[all_combines, gfx6gfx7_combines,
|
||||
uchar_to_float, cvt_f32_ubyteN]> {
|
||||
uchar_to_float, cvt_f32_ubyteN, clamp_i64_to_i16]> {
|
||||
let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
|
||||
let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
|
||||
let AdditionalArguments = [];
|
||||
|
|
|
@ -66,6 +66,19 @@ public:
|
|||
bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
|
||||
void applyCvtF32UByteN(MachineInstr &MI,
|
||||
const CvtF32UByteMatchInfo &MatchInfo);
|
||||
|
||||
struct ClampI64ToI16MatchInfo {
|
||||
int64_t Cmp1;
|
||||
int64_t Cmp2;
|
||||
Register Origin;
|
||||
};
|
||||
|
||||
bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
|
||||
MachineFunction &MF,
|
||||
ClampI64ToI16MatchInfo &MatchInfo);
|
||||
|
||||
void applyClampI64ToI16(MachineInstr &MI,
|
||||
const ClampI64ToI16MatchInfo &MatchInfo);
|
||||
};
|
||||
|
||||
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
|
||||
|
@ -245,6 +258,91 @@ void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
|
|||
MI.eraseFromParent();
|
||||
}
|
||||
|
||||
/// Checks whether the G_TRUNC \p MI truncates a 64-bit value that is the
/// result of a min/max clamp (as recognised by m_MaxMin) against two integer
/// constants which both lie inside the int16_t range.
///
/// \param MI         the G_TRUNC being inspected.
/// \param MRI        register info used to query types and walk definitions.
/// \param MF         unused here; part of the signature the combiner rule calls.
/// \param MatchInfo  receives the two boundary constants and the clamped
///                   register. It may be partially written even when the
///                   function returns false.
/// \returns true if the pattern was found and both boundaries fit in int16_t.
///
/// Only the source type is checked; the destination type of the G_TRUNC is not
/// inspected. The boundaries are range-checked individually; which of the two
/// is the lower bound is not determined here.
bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  const Register Src = MI.getOperand(1).getReg();
  if (MRI.getType(Src) != LLT::scalar(64))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16");

  if (!mi_match(Src, MRI,
                m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
                         m_Reg(MatchInfo.Origin))))
    return false;

  // Are we really trying to clamp against short boundaries? Both constants
  // have to be representable as int16_t, whichever of them is the lower one.
  const int64_t Min = std::numeric_limits<int16_t>::min();
  const int64_t Max = std::numeric_limits<int16_t>::max();
  const int64_t Cmp1 = MatchInfo.Cmp1;
  const int64_t Cmp2 = MatchInfo.Cmp2;
  return Cmp1 >= Min && Cmp1 <= Max && Cmp2 >= Min && Cmp2 <= Max;
}
|
||||
|
||||
/// Rewrites a matched "clamp i64 then truncate" sequence.
///
/// The 64-bit source recorded in \p MatchInfo is split into its two 32-bit
/// halves, packed with V_CVT_PK_I16_I32, and the packed value is clamped with
/// V_MED3_I32 between the smaller and the larger of the two matched
/// constants. The result replaces the destination of \p MI, which is erased.
///
/// \param MI         the G_TRUNC that roots the matched pattern.
/// \param MatchInfo  boundaries and source register from matchClampI64ToI16.
void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI");

  MachineIRBuilder B(MI);
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  // G_UNMERGE_VALUES lists its results from least to most significant, so the
  // first def is the low half and the second def is the high half.
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge->getOperand(0).getReg();
  Register Hi32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  // Pack both halves; the low half is the first source operand.
  // NOTE(review): relies on V_CVT_PK_I16_I32 saturating each 32-bit source
  // into its own 16-bit lane -- confirm against the ISA documentation.
  Register CvtDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(CvtDst, &AMDGPU::VGPR_32RegClass);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
  CvtPk.setMIFlags(MI.getFlags());

  // The matcher does not say which constant is the lower bound; sort them.
  const int64_t LowerBound = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  const int64_t UpperBound = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MinBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MinBoundaryDst, LowerBound);

  Register MaxBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MaxBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MaxBoundaryDst, UpperBound);

  // med3(lower, value, upper) yields the value clamped into [lower, upper].
  Register MedDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MedDst, &AMDGPU::VGPR_32RegClass);

  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  // The clamped value is 32 bits wide while the replaced G_TRUNC defines a
  // narrower (or otherwise differently sized) scalar. A plain COPY between
  // differently typed generic registers is invalid, so resize explicitly;
  // this degenerates to a COPY when the destination is already 32 bits.
  B.buildSExtOrTrunc(MI.getOperand(0).getReg(), MedDst);

  MI.eraseFromParent();
}
|
||||
|
||||
class AMDGPUPostLegalizerCombinerHelperState {
|
||||
protected:
|
||||
CombinerHelper &Helper;
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX678,GFX6789 %s
|
||||
; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX6789 %s
|
||||
; RUN: llc -global-isel -mcpu=gfx1010 -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_i64_i16
|
||||
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
|
||||
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
|
||||
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
|
||||
; GCN: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
|
||||
; GCN: v_med3_i32 [[A]], 0xffff8000, [[A]], [[C]]
|
||||
define i16 @v_clamp_i64_i16(i64 %in) nounwind {
entry:
  ; Signed max against the int16 lower bound first ...
  %above.min = icmp sgt i64 %in, -32768
  %raised = select i1 %above.min, i64 %in, i64 -32768
  ; ... then signed min against the int16 upper bound.
  %below.max = icmp slt i64 %raised, 32767
  %clamped = select i1 %below.max, i64 %raised, i64 32767
  %result = trunc i64 %clamped to i16

  ret i16 %result
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_i64_i16_reverse
|
||||
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
|
||||
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
|
||||
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
|
||||
; GCN: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
|
||||
; GCN: v_med3_i32 [[A]], 0xffff8000, [[A]], [[C]]
|
||||
define i16 @v_clamp_i64_i16_reverse(i64 %in) nounwind {
entry:
  ; Same clamp as above with the order swapped: signed min against the upper
  ; bound first ...
  %below.max = icmp slt i64 %in, 32767
  %lowered = select i1 %below.max, i64 %in, i64 32767
  ; ... then signed max against the lower bound.
  %above.min = icmp sgt i64 %lowered, -32768
  %clamped = select i1 %above.min, i64 %lowered, i64 -32768
  %result = trunc i64 %clamped to i16

  ret i16 %result
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_i64_i16_wrong_lower
|
||||
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
|
||||
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
|
||||
; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
|
||||
|
||||
; GCN: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
|
||||
; GCN: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
|
||||
define i16 @v_clamp_i64_i16_wrong_lower(i64 %in) nounwind {
entry:
  ; The bound 32769 does not fit into a signed 16-bit value, so this sequence
  ; is expected to stay as compare+select.
  %below.bound = icmp slt i64 %in, 32769
  %lowered = select i1 %below.bound, i64 %in, i64 32769
  %above.min = icmp sgt i64 %lowered, -32768
  %clamped = select i1 %above.min, i64 %lowered, i64 -32768
  %result = trunc i64 %clamped to i16

  ret i16 %result
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_i64_i16_wrong_lower_and_higher
|
||||
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
|
||||
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
|
||||
|
||||
; GCN: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
|
||||
define i16 @v_clamp_i64_i16_wrong_lower_and_higher(i64 %in) nounwind {
entry:
  ; Both bounds (-32769 and 32768) lie just outside the signed 16-bit range.
  %above.bound = icmp sgt i64 %in, -32769
  %raised = select i1 %above.bound, i64 %in, i64 -32769
  %below.bound = icmp slt i64 %raised, 32768
  %clamped = select i1 %below.bound, i64 %raised, i64 32768
  %result = trunc i64 %clamped to i16

  ret i16 %result
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_i64_i16_lower_than_short
|
||||
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
|
||||
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
|
||||
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
|
||||
; GCN: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
|
||||
; GCN: v_med3_i32 [[A]], 0xffffff01, [[A]], [[C]]
|
||||
define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) nounwind {
entry:
  ; Clamp into [-255, 256], a range strictly inside the signed 16-bit range;
  ; the upper bound is applied first.
  %below.max = icmp slt i64 %in, 256
  %lowered = select i1 %below.max, i64 %in, i64 256
  %above.min = icmp sgt i64 %lowered, -255
  %clamped = select i1 %above.min, i64 %lowered, i64 -255
  %result = trunc i64 %clamped to i16

  ret i16 %result
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse
|
||||
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
|
||||
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
|
||||
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
|
||||
; GCN: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||
; GCN: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
|
||||
; GCN: v_med3_i32 [[A]], 0xffffff01, [[A]], [[C]]
|
||||
define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) nounwind {
entry:
  ; Clamp into [-255, 256] with the lower bound applied first.
  %above.min = icmp sgt i64 %in, -255
  %raised = select i1 %above.min, i64 %in, i64 -255
  %below.max = icmp slt i64 %raised, 256
  %clamped = select i1 %below.max, i64 %raised, i64 256
  %result = trunc i64 %clamped to i16

  ret i16 %result
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_clamp_i64_i16_zero
|
||||
; GFX678: v_mov_b32_e32 [[A:v[0-9]+]], 0
|
||||
; GCN: v_mov_b32_e32 [[A:v[0-9]+]], 0
|
||||
define i16 @v_clamp_i64_i16_zero(i64 %in) nounwind {
entry:
  ; Degenerate clamp: both bounds are 0.
  %above.zero = icmp sgt i64 %in, 0
  %raised = select i1 %above.zero, i64 %in, i64 0
  %below.zero = icmp slt i64 %raised, 0
  %clamped = select i1 %below.zero, i64 %raised, i64 0
  %result = trunc i64 %clamped to i16

  ret i16 %result
}
|
Loading…
Reference in New Issue