forked from OSchip/llvm-project
[AMDGPU]: Fixes an invalid clamp selection pattern.
When running the tests on PowerPC and x86, the lit test GlobalISel/trunc.ll fails at the memory-sanitizer step. This appears to be caused by faulty matching logic (the pattern matches even when it should not) and a likely missing variable initialization. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D95878
This commit is contained in:
parent
86bde76b29
commit
f89f6d1e5d
|
@ -306,6 +306,18 @@ m_GAShr(const LHS &L, const RHS &R) {
|
||||||
return BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>(L, R);
|
return BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>(L, R);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename LHS, typename RHS>
|
||||||
|
inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>
|
||||||
|
m_GSMax(const LHS &L, const RHS &R) {
|
||||||
|
return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>(L, R);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename LHS, typename RHS>
|
||||||
|
inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>
|
||||||
|
m_GSMin(const LHS &L, const RHS &R) {
|
||||||
|
return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>(L, R);
|
||||||
|
}
|
||||||
|
|
||||||
// Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
|
// Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
|
||||||
template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
|
template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
|
||||||
SrcTy L;
|
SrcTy L;
|
||||||
|
@ -468,6 +480,13 @@ m_GInsertVecElt(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
|
||||||
TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
|
TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
|
||||||
|
inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>
|
||||||
|
m_GISelect(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
|
||||||
|
return TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>(
|
||||||
|
Src0, Src1, Src2);
|
||||||
|
}
|
||||||
|
|
||||||
/// Matches a register negated by a G_SUB.
|
/// Matches a register negated by a G_SUB.
|
||||||
/// G_SUB 0, %negated_reg
|
/// G_SUB 0, %negated_reg
|
||||||
template <typename SrcTy>
|
template <typename SrcTy>
|
||||||
|
@ -484,7 +503,7 @@ m_Not(const SrcTy &&Src) {
|
||||||
return m_GXor(Src, m_AllOnesInt());
|
return m_GXor(Src, m_AllOnesInt());
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace GMIPatternMatch
|
} // namespace MIPatternMatch
|
||||||
} // namespace llvm
|
} // namespace llvm
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -37,13 +37,21 @@ def cvt_f32_ubyteN : GICombineRule<
|
||||||
[{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
|
[{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
|
||||||
(apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
|
(apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
|
||||||
|
|
||||||
|
// Match data for the i64 -> i16 clamp combine: holds the two comparison
// constants of the smin/smax pair and the register feeding the clamp.
def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;

// Rewrites a G_TRUNC(i64 -> i16) fed by an smin/smax clamp into the packed
// conversion + median sequence (see AMDGPUPreLegalizerCombiner.cpp).
def clamp_i64_to_i16 : GICombineRule<
  (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
  (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
      [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
  (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
|
||||||
|
|
||||||
// Combines which should only apply on SI/VI
|
// Combines which should only apply on SI/VI
|
||||||
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
|
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
|
||||||
|
|
||||||
|
|
||||||
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
|
def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
|
||||||
"AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
|
"AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
|
||||||
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
|
let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
|
||||||
|
let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
|
||||||
}
|
}
|
||||||
|
|
||||||
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
|
def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
|
||||||
|
|
|
@ -174,6 +174,9 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>;
|
||||||
def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
|
def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
|
||||||
def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
|
def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
|
||||||
|
|
||||||
|
def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
|
||||||
|
def : GINodeEquiv<G_AMDGPU_MED3, AMDGPUsmed3>;
|
||||||
|
|
||||||
def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
|
def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
|
||||||
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
|
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
|
||||||
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
|
def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
|
||||||
|
|
|
@ -213,6 +213,8 @@ def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
|
||||||
def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
|
def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
|
||||||
SDTIntToFPOp, []>;
|
SDTIntToFPOp, []>;
|
||||||
|
|
||||||
|
def AMDGPUcvt_pk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32",
|
||||||
|
AMDGPUIntPackOp, []>;
|
||||||
|
|
||||||
// urecip - This operation is a helper for integer division, it returns the
|
// urecip - This operation is a helper for integer division, it returns the
|
||||||
// result of 1 / a as a fractional unsigned integer.
|
// result of 1 / a as a fractional unsigned integer.
|
||||||
|
|
|
@ -12,6 +12,9 @@
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
#include "AMDGPU.h"
|
#include "AMDGPU.h"
|
||||||
|
#include "AMDGPULegalizerInfo.h"
|
||||||
|
#include "GCNSubtarget.h"
|
||||||
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||||
#include "llvm/CodeGen/GlobalISel/Combiner.h"
|
#include "llvm/CodeGen/GlobalISel/Combiner.h"
|
||||||
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
|
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
|
||||||
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
|
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
|
||||||
|
@ -26,6 +29,141 @@
|
||||||
using namespace llvm;
|
using namespace llvm;
|
||||||
using namespace MIPatternMatch;
|
using namespace MIPatternMatch;
|
||||||
|
|
||||||
|
// AMDGPU-specific pre-legalizer combines. One instance is created per
// combiner run and exposed to the TableGen-generated combiner through
// AMDGPUPreLegalizerCombinerHelperState.
class AMDGPUPreLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  // State shared between matchClampI64ToI16 and applyClampI64ToI16: the two
  // comparison constants of the smin/smax pair and the register feeding the
  // clamp. The constants are zero-initialized so a partial match never
  // leaves them indeterminate.
  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1 = 0;
    int64_t Cmp2 = 0;
    Register Origin;
  };

  // Returns true if MI is a G_TRUNC(s64 -> s16) whose source is an
  // smin/smax pair clamping to a range representable in i16; fills MatchInfo.
  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  // Rewrites the matched clamp into G_AMDGPU_CVT_PK_I16_I32 + G_AMDGPU_MED3.
  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};
|
||||||
|
|
||||||
|
// Returns true if MI is a G_TRUNC(s64 -> s16) whose source is an smin/smax
// (in either order) with constant operands that clamp to a range
// representable in i16. On success, MatchInfo holds both comparison
// constants and the register being clamped.
//
// Fix: the original computed std::abs(Cmp2 - Cmp1) before validating the
// range; for arbitrary i64 constants from the IR the subtraction can
// overflow int64_t, which is undefined behavior. Checking the i16 range
// first bounds both values, so the difference below cannot overflow. The
// boolean result is unchanged (it is the conjunction of both conditions in
// either order).
bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  // Try to find a pattern where an i64 value should get clamped to short.
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
  if (DstType != LLT::scalar(16))
    return false;

  Register Base;

  auto IsApplicableForCombine = [&MatchInfo]() -> bool {
    const int64_t Cmp1 = MatchInfo.Cmp1;
    const int64_t Cmp2 = MatchInfo.Cmp2;

    const int64_t Min = std::numeric_limits<int16_t>::min();
    const int64_t Max = std::numeric_limits<int16_t>::max();

    // Both comparison values must lie between SHORT_MIN and SHORT_MAX.
    // Validating the range first also guarantees the subtraction below
    // cannot overflow.
    if (!((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
          (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min)))
      return false;

    // If the difference between both comparison values is 0 or 1, there is
    // no need to clamp.
    const int64_t Diff = std::abs(Cmp2 - Cmp1);
    return Diff != 0 && Diff != 1;
  };

  // Try to match a combination of min / max MIR opcodes.
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (mi_match(Base, MRI,
                 m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
      return IsApplicableForCombine();
    }
  }

  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (mi_match(Base, MRI,
                 m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
      return IsApplicableForCombine();
    }
  }

  return false;
}
|
||||||
|
|
||||||
|
// We want to find a combination of instructions that
|
||||||
|
// gets generated when an i64 gets clamped to i16.
|
||||||
|
// The corresponding pattern is:
|
||||||
|
// G_MAX / G_MAX for i16 <= G_TRUNC i64.
|
||||||
|
// This can be efficiently written as following:
|
||||||
|
// v_cvt_pk_i16_i32 v0, v0, v1
|
||||||
|
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
|
||||||
|
void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
|
||||||
|
MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
|
||||||
|
|
||||||
|
Register Src = MatchInfo.Origin;
|
||||||
|
assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
|
||||||
|
LLT::scalar(64));
|
||||||
|
const LLT S32 = LLT::scalar(32);
|
||||||
|
|
||||||
|
B.setMBB(*MI.getParent());
|
||||||
|
B.setInstrAndDebugLoc(MI);
|
||||||
|
|
||||||
|
auto Unmerge = B.buildUnmerge(S32, Src);
|
||||||
|
|
||||||
|
assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
|
||||||
|
|
||||||
|
const LLT V2S16 = LLT::vector(2, 16);
|
||||||
|
auto CvtPk =
|
||||||
|
B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
|
||||||
|
{Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());
|
||||||
|
|
||||||
|
auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
|
||||||
|
auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
|
||||||
|
auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
|
||||||
|
auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
|
||||||
|
|
||||||
|
auto Bitcast = B.buildBitcast({S32}, CvtPk);
|
||||||
|
|
||||||
|
auto Med3 = B.buildInstr(
|
||||||
|
AMDGPU::G_AMDGPU_MED3, {S32},
|
||||||
|
{MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
|
||||||
|
MI.getFlags());
|
||||||
|
|
||||||
|
B.buildTrunc(MI.getOperand(0).getReg(), Med3);
|
||||||
|
|
||||||
|
MI.eraseFromParent();
|
||||||
|
}
|
||||||
|
|
||||||
|
class AMDGPUPreLegalizerCombinerHelperState {
|
||||||
|
protected:
|
||||||
|
CombinerHelper &Helper;
|
||||||
|
AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
|
||||||
|
|
||||||
|
public:
|
||||||
|
AMDGPUPreLegalizerCombinerHelperState(
|
||||||
|
CombinerHelper &Helper,
|
||||||
|
AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
|
||||||
|
: Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
|
||||||
|
};
|
||||||
|
|
||||||
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
||||||
#include "AMDGPUGenPreLegalizeGICombiner.inc"
|
#include "AMDGPUGenPreLegalizeGICombiner.inc"
|
||||||
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
|
||||||
|
@ -59,7 +197,9 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
|
||||||
MachineInstr &MI,
|
MachineInstr &MI,
|
||||||
MachineIRBuilder &B) const {
|
MachineIRBuilder &B) const {
|
||||||
CombinerHelper Helper(Observer, B, KB, MDT);
|
CombinerHelper Helper(Observer, B, KB, MDT);
|
||||||
AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
|
AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
|
||||||
|
AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
|
||||||
|
PreLegalizerHelper);
|
||||||
|
|
||||||
if (Generated.tryCombineAll(Observer, MI, B, Helper))
|
if (Generated.tryCombineAll(Observer, MI, B, Helper))
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -3507,6 +3507,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||||
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
|
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
|
||||||
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
|
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
|
||||||
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
|
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
|
||||||
|
case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
|
||||||
|
case AMDGPU::G_AMDGPU_MED3:
|
||||||
return getDefaultMappingVOP(MI);
|
return getDefaultMappingVOP(MI);
|
||||||
case AMDGPU::G_UMULH:
|
case AMDGPU::G_UMULH:
|
||||||
case AMDGPU::G_SMULH: {
|
case AMDGPU::G_SMULH: {
|
||||||
|
|
|
@ -2577,6 +2577,18 @@ def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Generic opcode for the saturating pack of two 32-bit values into a pair of
// 16-bit values (selected to v_cvt_pk_i16_i32 via its GINodeEquiv).
def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

// Generic opcode for the signed three-operand median (selected to
// v_med3_i32 via its GINodeEquiv).
def G_AMDGPU_MED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}
|
||||||
|
|
||||||
// Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
|
// Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
|
||||||
// operand Expects a MachineMemOperand in addition to explicit
|
// operand Expects a MachineMemOperand in addition to explicit
|
||||||
// operands.
|
// operands.
|
||||||
|
|
|
@ -0,0 +1,112 @@
|
||||||
|
; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX678,GFX6789 %s
|
||||||
|
; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX6789 %s
|
||||||
|
; RUN: llc -global-isel -mcpu=gfx1010 -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
|
||||||
|
|
||||||
|
declare i64 @llvm.smax.i64(i64, i64)
|
||||||
|
declare i64 @llvm.smin.i64(i64, i64)
|
||||||
|
|
||||||
|
; GFX10-LABEL: {{^}}v_clamp_i64_i16
|
||||||
|
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
|
||||||
|
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
|
||||||
|
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
|
||||||
|
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX10: v_mov_b32_e32 [[B]], 0x7fff
|
||||||
|
; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
|
||||||
|
define i16 @v_clamp_i64_i16(i64 %in) #0 {
|
||||||
|
entry:
|
||||||
|
%max = call i64 @llvm.smax.i64(i64 %in, i64 -32768)
|
||||||
|
%min = call i64 @llvm.smin.i64(i64 %max, i64 32767)
|
||||||
|
%result = trunc i64 %min to i16
|
||||||
|
ret i16 %result
|
||||||
|
}
|
||||||
|
|
||||||
|
; GFX10-LABEL: {{^}}v_clamp_i64_i16_reverse
|
||||||
|
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
|
||||||
|
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
|
||||||
|
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
|
||||||
|
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX10: v_mov_b32_e32 [[B]], 0x7fff
|
||||||
|
; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
|
||||||
|
define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 {
|
||||||
|
entry:
|
||||||
|
%min = call i64 @llvm.smin.i64(i64 %in, i64 32767)
|
||||||
|
%max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
|
||||||
|
%result = trunc i64 %max to i16
|
||||||
|
ret i16 %result
|
||||||
|
}
|
||||||
|
|
||||||
|
; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower
|
||||||
|
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
|
||||||
|
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
|
||||||
|
; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
|
||||||
|
|
||||||
|
; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
|
||||||
|
; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
|
||||||
|
define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
|
||||||
|
entry:
|
||||||
|
%min = call i64 @llvm.smin.i64(i64 %in, i64 32769)
|
||||||
|
%max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
|
||||||
|
%result = trunc i64 %max to i16
|
||||||
|
ret i16 %result
|
||||||
|
}
|
||||||
|
|
||||||
|
; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher
|
||||||
|
; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
|
||||||
|
; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
|
||||||
|
; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
|
||||||
|
define i16 @v_clamp_i64_i16_invalid_lower_and_higher(i64 %in) #0 {
|
||||||
|
entry:
|
||||||
|
%max = call i64 @llvm.smax.i64(i64 %in, i64 -32769)
|
||||||
|
%min = call i64 @llvm.smin.i64(i64 %max, i64 32768)
|
||||||
|
%result = trunc i64 %min to i16
|
||||||
|
ret i16 %result
|
||||||
|
}
|
||||||
|
|
||||||
|
; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short
|
||||||
|
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
|
||||||
|
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
|
||||||
|
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
|
||||||
|
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX10: v_mov_b32_e32 [[B]], 0x100
|
||||||
|
; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
|
||||||
|
define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 {
|
||||||
|
entry:
|
||||||
|
%min = call i64 @llvm.smin.i64(i64 %in, i64 256)
|
||||||
|
%max = call i64 @llvm.smax.i64(i64 %min, i64 -255)
|
||||||
|
%result = trunc i64 %max to i16
|
||||||
|
ret i16 %result
|
||||||
|
}
|
||||||
|
|
||||||
|
; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse
|
||||||
|
; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
|
||||||
|
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
|
||||||
|
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
|
||||||
|
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
|
||||||
|
; GFX10: v_mov_b32_e32 [[B]], 0x100
|
||||||
|
; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
|
||||||
|
define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 {
|
||||||
|
entry:
|
||||||
|
%max = call i64 @llvm.smax.i64(i64 %in, i64 -255)
|
||||||
|
%min = call i64 @llvm.smin.i64(i64 %max, i64 256)
|
||||||
|
%result = trunc i64 %min to i16
|
||||||
|
ret i16 %result
|
||||||
|
}
|
||||||
|
|
||||||
|
; GFX10-LABEL: {{^}}v_clamp_i64_i16_zero
|
||||||
|
; GFX6789: v_mov_b32_e32 v0, 0
|
||||||
|
; GFX10: v_mov_b32_e32 v0, 0
|
||||||
|
define i16 @v_clamp_i64_i16_zero(i64 %in) #0 {
|
||||||
|
entry:
|
||||||
|
%max = call i64 @llvm.smax.i64(i64 %in, i64 0)
|
||||||
|
%min = call i64 @llvm.smin.i64(i64 %max, i64 0)
|
||||||
|
%result = trunc i64 %min to i16
|
||||||
|
ret i16 %result
|
||||||
|
}
|
Loading…
Reference in New Issue