Added and used a new target pseudo for v_cvt_pk_i16_i32; changes due to code review.

Thomas Symalla 2021-01-13 15:23:45 +01:00
parent 52bfb50145
commit 6604d81e1b
6 changed files with 32 additions and 30 deletions
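
At a high level, the clamp combine touched below rewrites a 64-bit signed min/max clamp feeding a truncation to i16 into the packed conversion plus a three-operand integer median. A rough before/after sketch in generic MIR, with every register name invented for illustration (this is not compiler output):

; before: clamp on s64, then truncate
%max:_(s64) = G_SMAX %val, %lo_bound
%min:_(s64) = G_SMIN %max, %hi_bound
%res:_(s16) = G_TRUNC %min

; after applyClampI64ToI16 (see the pre-legalizer combiner changes below)
%cvt:vgpr_32(s32) = G_AMDGPU_CVT_PK_I16_I32 %hi32(s32), %lo32(s32)
%med:vgpr_32(s32) = V_MED3_I32 %min_bound, %cvt, %max_bound
%res:_(s16) = G_TRUNC %med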

View File

@@ -174,6 +174,8 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>;
 def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
 def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
+def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
 def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;

View File

@@ -213,6 +213,8 @@ def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
 def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
                                   SDTIntToFPOp, []>;
+def AMDGPUcvt_pk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32",
+                                  AMDGPUIntPackOp, []>;
 // urecip - This operation is a helper for integer division, it returns the
 // result of 1 / a as a fractional unsigned integer.

View File

@@ -11,10 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
@@ -70,8 +73,6 @@ bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
   if (DstType != LLT::scalar(16))
     return false;
-  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");
-
   Register Base;

   // Try to match a combination of min / max MIR opcodes.
@@ -128,38 +129,33 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
   MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
   MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);

-  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
-  assert(MI.getOpcode() != CvtOpcode);
+  assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);

-  const auto REG_CLASS = &AMDGPU::VGPR_32RegClass;
-
-  Register CvtDst = MRI.createVirtualRegister(REG_CLASS);
-  const LLT V2S16 = LLT::vector(2, 16);
-  MRI.setType(CvtDst, V2S16);
+  Register CvtDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MRI.setType(CvtDst, S32);

-  auto CvtPk = B.buildInstr(CvtOpcode);
-  CvtPk.addDef(CvtDst);
-  CvtPk.addReg(Hi32);
-  CvtPk.addReg(Lo32);
-  CvtPk.setMIFlags(MI.getFlags());
+  B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32,
+               {CvtDst},
+               {Hi32, Lo32},
+               MI.getFlags());

   auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
   auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
   auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
-  MRI.setRegClass(MinBoundaryDst.getReg(0), REG_CLASS);
+  MRI.setRegClass(MinBoundaryDst.getReg(0), &AMDGPU::VGPR_32RegClass);
   auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
-  MRI.setRegClass(MaxBoundaryDst.getReg(0), REG_CLASS);
+  MRI.setRegClass(MaxBoundaryDst.getReg(0), &AMDGPU::VGPR_32RegClass);

-  Register MedDst = MRI.createVirtualRegister(REG_CLASS);
+  Register MedDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   MRI.setType(MedDst, S32);

-  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
-  Med.addDef(MedDst);
-  Med.addReg(MinBoundaryDst.getReg(0));
-  Med.addReg(CvtDst);
-  Med.addReg(MaxBoundaryDst.getReg(0));
-  Med.setMIFlags(MI.getFlags());
+  B.buildInstr(AMDGPU::V_MED3_I32,
+               {MedDst},
+               {MinBoundaryDst.getReg(0), CvtDst, MaxBoundaryDst.getReg(0)},
+               MI.getFlags());

   Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
   B.buildTrunc(TruncDst, MedDst);
@@ -197,10 +193,9 @@ public:
   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
-                                 const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
-                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
+                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
         KB(KB), MDT(MDT) {
     if (!GeneratedRuleCfg.parseCommandLineOption())
       report_fatal_error("Invalid rule identifier");
@@ -282,16 +277,12 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
   const Function &F = MF.getFunction();
   bool EnableOpt =
       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const AMDGPULegalizerInfo *LI =
-      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
   MachineDominatorTree *MDT =
       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
-                                        F.hasMinSize(), LI, KB, MDT);
+                                        F.hasMinSize(), KB, MDT);
   Combiner C(PCInfo, TPC);
   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
 }

View File

@@ -3621,6 +3621,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
+  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
     return getDefaultMappingVOP(MI);
   case AMDGPU::G_UMULH:
   case AMDGPU::G_SMULH: {

View File

@@ -2575,6 +2575,12 @@ def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
 }
 }

+def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0, type0:$src1);
+  let hasSideEffects = 0;
+}
+
 // Atomic cmpxchg. $cmpval and $newval are packed in a single vector
 // operand. Expects a MachineMemOperand in addition to explicit
 // operands.
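
As a quick illustration of the new pseudo's shape (hypothetical virtual register names, not compiler output), an instance after the combine and RegBankSelect would look roughly like:

%dst:vgpr(s32) = G_AMDGPU_CVT_PK_I16_I32 %src0:vgpr(s32), %src1:vgpr(s32)

The GINodeEquiv mapping added above ties this generic instruction to the AMDGPUISD::CVT_PK_I16_I32 node, so the imported SelectionDAG patterns can select it to v_cvt_pk_i16_i32.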

View File

@@ -109,4 +109,4 @@ entry:
   %min = call i64 @llvm.smin.i64(i64 %max, i64 0)
   %result = trunc i64 %min to i16
   ret i16 %result
-}
\ No newline at end of file
+}
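
For reference, the full idiom the matcher looks for is an smax/smin pair on i64 feeding a trunc to i16. Below is a minimal self-contained variant of the test above, with the function name and clamp bounds chosen purely for illustration (they are not part of this commit):

define i16 @clamp_i64_to_i16(i64 %in) {
entry:
  %max = call i64 @llvm.smax.i64(i64 %in, i64 -32768)
  %min = call i64 @llvm.smin.i64(i64 %max, i64 32767)
  %result = trunc i64 %min to i16
  ret i16 %result
}

declare i64 @llvm.smax.i64(i64, i64)
declare i64 @llvm.smin.i64(i64, i64)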