AMDGPU/GlobalISel: cmp/select method for extract element

Differential Revision: https://reviews.llvm.org/D80749
This commit is contained in:
Stanislav Mekhanoshin 2020-05-27 13:09:00 -07:00
parent 672ed53860
commit 5d62606f90
6 changed files with 2174 additions and 1285 deletions

View File

@ -1856,6 +1856,88 @@ static void extendLow32IntoHigh32(MachineIRBuilder &B,
}
}
/// Replace a dynamically indexed vector element extract with a chain of
/// compare + select instructions.
///
/// Operand 1 of \p MI is the source vector and operand 2 is the index. The
/// vector is unmerged into pieces, the running result starts as element 0, and
/// for every other element K the result becomes
/// `select (Idx == K), Element[K], Result`. When the mapping splits the
/// destination into several registers, each piece is selected independently.
///
/// \returns false (leaving \p MI untouched) when
/// SITargetLowering::shouldExpandVectorDynExt rejects the expansion; otherwise
/// \p MI is erased and true is returned.
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    const OperandsMapper &OpdMapper) const {
  // Register bank the mapping assigned to the first breakdown of an operand.
  auto BankOfOperand = [&OpdMapper](unsigned OpIdx) -> const RegisterBank & {
    return *OpdMapper.getInstrMapping()
                .getOperandMapping(OpIdx)
                .BreakDown[0]
                .RegBank;
  };

  Register SrcVec = MI.getOperand(1).getReg();
  LLT SrcVecTy = MRI.getType(SrcVec);
  unsigned NumVecElts = SrcVecTy.getNumElements();
  const RegisterBank &IdxBank = BankOfOperand(2);

  // An index mapped to the VGPR bank is treated as divergent.
  if (!SITargetLowering::shouldExpandVectorDynExt(
          SrcVecTy.getScalarSizeInBits(), NumVecElts,
          IdxBank == AMDGPU::VGPRRegBank))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank = BankOfOperand(0);
  const RegisterBank &SrcBank = BankOfOperand(1);

  // The condition stays in SGPRs (as a 32-bit value) only when the result,
  // the vector and the index are all scalar; otherwise it is a 1-bit VCC
  // value.
  bool AllScalar = DstBank == AMDGPU::SGPRRegBank &&
                   SrcBank == AMDGPU::SGPRRegBank &&
                   IdxBank == AMDGPU::SGPRRegBank;
  const RegisterBank &CCBank =
      AllScalar ? AMDGPU::SGPRRegBank : AMDGPU::VCCRegBank;
  LLT CCTy = AllScalar ? S32 : LLT::scalar(1);

  // A VCC compare with an SGPR index: compare against a VGPR copy of it.
  Register IdxReg = MI.getOperand(2).getReg();
  if (!AllScalar && IdxBank == AMDGPU::SGPRRegBank) {
    IdxReg = B.buildCopy(S32, IdxReg).getReg(0);
    MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
  }

  // If the mapping supplied split destination registers, work on pieces of
  // that type; otherwise work on whole elements.
  SmallVector<Register, 2> DstParts(OpdMapper.getVRegs(0));
  unsigned NumPieces = DstParts.size();
  LLT PieceTy = SrcVecTy.getScalarType();
  if (NumPieces == 0)
    NumPieces = 1;
  else
    PieceTy = MRI.getType(DstParts[0]);

  auto Pieces = B.buildUnmerge(PieceTy, SrcVec);

  // Seed the running result with the pieces of element 0.
  SmallVector<Register, 2> Acc;
  for (unsigned P = 0; P < NumPieces; ++P)
    Acc.push_back(Pieces.getReg(P));

  for (unsigned Elt = 1; Elt < NumVecElts; ++Elt) {
    auto EltIdxC = B.buildConstant(S32, Elt);
    MRI.setRegBank(EltIdxC.getReg(0), AMDGPU::SGPRRegBank);

    auto IsElt = B.buildICmp(CmpInst::ICMP_EQ, CCTy, IdxReg, EltIdxC);
    MRI.setRegBank(IsElt.getReg(0), CCBank);

    for (unsigned P = 0; P < NumPieces; ++P) {
      auto Sel = B.buildSelect(PieceTy, IsElt,
                               Pieces.getReg(Elt * NumPieces + P), Acc[P]);

      // Result (0), true value (2) and false value (3) all live in the
      // destination bank; operand 1 is the condition handled above.
      MRI.setRegBank(Sel.getReg(0), DstBank);
      MRI.setRegBank(Sel.getReg(2), DstBank);
      MRI.setRegBank(Sel.getReg(3), DstBank);

      Acc[P] = Sel.getReg(0);
    }
  }

  // Copy the final selects into the destination register(s).
  Register OrigDst = MI.getOperand(0).getReg();
  for (unsigned P = 0; P < NumPieces; ++P) {
    Register Out = (NumPieces == 1) ? OrigDst : DstParts[P];
    B.buildCopy(Out, Acc[P]);
    MRI.setRegBank(Out, DstBank);
  }

  MRI.setRegBank(OrigDst, DstBank);
  MI.eraseFromParent();

  return true;
}
void AMDGPURegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@ -2450,6 +2532,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
LLT DstTy = MRI.getType(DstReg);
LLT SrcTy = MRI.getType(SrcReg);
if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
return;
MachineIRBuilder B(MI);
const ValueMapping &DstMapping

View File

@ -179,6 +179,12 @@ public:
const InstructionMapping &
getInstrMapping(const MachineInstr &MI) const override;
private:
/// Try to rewrite the vector element extract \p MI (operand 1 is the vector,
/// operand 2 is the index) as a sequence of compares and selects.
/// The decision is delegated to SITargetLowering::shouldExpandVectorDynExt.
/// \returns true if \p MI was replaced and erased, false if it was left
/// untouched and the caller must apply the regular mapping.
bool foldExtractEltToCmpSelect(MachineInstr &MI,
MachineRegisterInfo &MRI,
const OperandsMapper &OpdMapper) const;
};
} // End llvm namespace.
#endif

View File

@ -9623,17 +9623,13 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
// expanded into a set of cmp/select instructions.
static bool shouldExpandVectorDynExt(SDNode *N) {
SDValue Idx = N->getOperand(N->getNumOperands() - 1);
if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
unsigned NumElem,
bool IsDivergentIdx) {
if (UseDivergentRegisterIndexing)
return false;
SDValue Vec = N->getOperand(0);
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
unsigned VecSize = VecVT.getSizeInBits();
unsigned EltSize = EltVT.getSizeInBits();
unsigned NumElem = VecVT.getVectorNumElements();
unsigned VecSize = EltSize * NumElem;
// Sub-dword vectors of size 2 dword or less have better implementation.
if (VecSize <= 64 && EltSize < 32)
@ -9645,7 +9641,7 @@ static bool shouldExpandVectorDynExt(SDNode *N) {
return true;
// Always do this if var-idx is divergent, otherwise it will become a loop.
if (Idx->isDivergent())
if (IsDivergentIdx)
return true;
// Large vectors would yield too many compares and v_cndmask_b32 instructions.
@ -9654,6 +9650,21 @@ static bool shouldExpandVectorDynExt(SDNode *N) {
return NumInsts <= 16;
}
// SelectionDAG wrapper around SITargetLowering::shouldExpandVectorDynExt.
// Takes an EXTRACT_VECTOR_ELT or INSERT_VECTOR_ELT node; the index is its last
// operand and the vector is operand 0. A constant index is never expanded.
static bool shouldExpandVectorDynExt(SDNode *N) {
  const unsigned IdxOpNo = N->getNumOperands() - 1;
  SDValue IdxVal = N->getOperand(IdxOpNo);
  if (isa<ConstantSDNode>(IdxVal))
    return false;

  EVT SrcVT = N->getOperand(0).getValueType();
  unsigned EltBits = SrcVT.getVectorElementType().getSizeInBits();
  unsigned EltCount = SrcVT.getVectorNumElements();
  bool DivergentIdx = IdxVal->isDivergent();

  return SITargetLowering::shouldExpandVectorDynExt(EltBits, EltCount,
                                                    DivergentIdx);
}
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
@ -9715,7 +9726,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
unsigned EltSize = EltVT.getSizeInBits();
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
if (shouldExpandVectorDynExt(N)) {
if (::shouldExpandVectorDynExt(N)) {
SDLoc SL(N);
SDValue Idx = N->getOperand(1);
SDValue V;
@ -9778,7 +9789,7 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
// INSERT_VECTOR_ELT (<n x e>, var-idx)
// => BUILD_VECTOR n x select (e, const-idx)
if (!shouldExpandVectorDynExt(N))
if (!::shouldExpandVectorDynExt(N))
return SDValue();
SelectionDAG &DAG = DCI.DAG;

View File

@ -203,6 +203,11 @@ public:
/// and not emit a relocation for an LDS global.
bool shouldUseLDSConstAddress(const GlobalValue *GV) const;
/// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
/// expanded into a set of cmp/select instructions.
///
/// Static and type-agnostic so that both the SelectionDAG combines and the
/// GlobalISel register bank code can share the same heuristic.
///
/// \param EltSize size of one vector element, in bits.
/// \param NumElem number of elements in the vector.
/// \param IsDivergentIdx true if the variable index is divergent.
/// \returns true if the cmp/select expansion should be used. Always false
/// when divergent register indexing is enabled (UseDivergentRegisterIndexing).
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
bool IsDivergentIdx);
private:
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array

File diff suppressed because it is too large Load Diff