AMDGPU/GlobalISel: cmp/select method for extract element
Differential Revision: https://reviews.llvm.org/D80749
commit 5d62606f90
parent 672ed53860
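For context: this patch teaches the GlobalISel register-bank selection path the same expansion the DAG combiner already performs, turning an extract of a dynamically indexed vector element into a chain of compares and selects instead of indexed addressing or a waterfall loop. A minimal scalar sketch of the computation the expansion produces, assuming a 4-element vector (function and variable names are illustrative, not from the patch):

#include <cstdint>

// One compare and one select per element beyond the first; no memory
// traffic and no loop over lanes.
float extractDynElt(const float Vec[4], uint32_t Idx) {
  float Res = Vec[0];                 // element 0 seeds the result
  for (uint32_t I = 1; I < 4; ++I)
    Res = (Idx == I) ? Vec[I] : Res;  // icmp eq + select per element
  return Res;
}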
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

@@ -1856,6 +1856,88 @@ static void extendLow32IntoHigh32(MachineIRBuilder &B,
   }
 }
 
+bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
+  MachineInstr &MI, MachineRegisterInfo &MRI,
+  const OperandsMapper &OpdMapper) const {
+
+  Register VecReg = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(2).getReg();
+
+  const RegisterBank &IdxBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+
+  bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;
+
+  LLT VecTy = MRI.getType(VecReg);
+  unsigned EltSize = VecTy.getScalarSizeInBits();
+  unsigned NumElem = VecTy.getNumElements();
+
+  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+                                                  IsDivergentIdx))
+    return false;
+
+  MachineIRBuilder B(MI);
+  LLT S32 = LLT::scalar(32);
+
+  const RegisterBank &DstBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+  const RegisterBank &SrcBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+
+  const RegisterBank &CCBank =
+    (DstBank == AMDGPU::SGPRRegBank &&
+     SrcBank == AMDGPU::SGPRRegBank &&
+     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
+                                     : AMDGPU::VCCRegBank;
+  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
+
+  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
+    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
+    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
+  }
+
+  LLT EltTy = VecTy.getScalarType();
+  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
+  unsigned NumLanes = DstRegs.size();
+  if (!NumLanes)
+    NumLanes = 1;
+  else
+    EltTy = MRI.getType(DstRegs[0]);
+
+  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
+  SmallVector<Register, 2> Res(NumLanes);
+  for (unsigned L = 0; L < NumLanes; ++L)
+    Res[L] = UnmergeToEltTy.getReg(L);
+
+  for (unsigned I = 1; I < NumElem; ++I) {
+    auto IC = B.buildConstant(S32, I);
+    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
+    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
+    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
+
+    for (unsigned L = 0; L < NumLanes; ++L) {
+      auto S = B.buildSelect(EltTy, Cmp,
+                             UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
+
+      for (unsigned N : { 0, 2, 3 })
+        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
+
+      Res[L] = S->getOperand(0).getReg();
+    }
+  }
+
+  for (unsigned L = 0; L < NumLanes; ++L) {
+    Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
+    B.buildCopy(DstReg, Res[L]);
+    MRI.setRegBank(DstReg, DstBank);
+  }
+
+  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
+  MI.eraseFromParent();
+
+  return true;
+}
+
 void AMDGPURegisterBankInfo::applyMappingImpl(
     const OperandsMapper &OpdMapper) const {
   MachineInstr &MI = OpdMapper.getMI();
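Note the NumLanes handling above: when the register-bank mapping splits the destination into several registers (OpdMapper.getVRegs(0) is non-empty), the vector is unmerged into lanes of the narrower type and an independent select chain is built per lane, reusing one compare per candidate index. A hedged scalar sketch of the two-lane case for 64-bit elements (names are illustrative, not from the patch):

#include <cstdint>

// Dynamic extract from a vector of 64-bit elements, expanded as two
// 32-bit select chains that share a single compare per candidate index.
uint64_t extractDynElt64(const uint64_t Vec[4], uint32_t Idx) {
  uint32_t Lo = uint32_t(Vec[0]);        // lane 0: low half of element 0
  uint32_t Hi = uint32_t(Vec[0] >> 32);  // lane 1: high half of element 0
  for (uint32_t I = 1; I < 4; ++I) {
    bool Eq = (Idx == I);                // one compare feeds both selects
    Lo = Eq ? uint32_t(Vec[I]) : Lo;
    Hi = Eq ? uint32_t(Vec[I] >> 32) : Hi;
  }
  return (uint64_t(Hi) << 32) | Lo;
}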
@@ -2450,6 +2532,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     LLT DstTy = MRI.getType(DstReg);
     LLT SrcTy = MRI.getType(SrcReg);
 
+    if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
+      return;
+
     MachineIRBuilder B(MI);
 
     const ValueMapping &DstMapping
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h

@@ -179,6 +179,12 @@ public:
 
   const InstructionMapping &
   getInstrMapping(const MachineInstr &MI) const override;
+
+private:
+
+  bool foldExtractEltToCmpSelect(MachineInstr &MI,
+                                 MachineRegisterInfo &MRI,
+                                 const OperandsMapper &OpdMapper) const;
 };
 } // End llvm namespace.
 #endif
llvm/lib/Target/AMDGPU/SIISelLowering.cpp

@@ -9623,17 +9623,13 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
 
 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
 // expanded into a set of cmp/select instructions.
-static bool shouldExpandVectorDynExt(SDNode *N) {
-  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
-  if (UseDivergentRegisterIndexing || isa<ConstantSDNode>(Idx))
+bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
+                                                unsigned NumElem,
+                                                bool IsDivergentIdx) {
+  if (UseDivergentRegisterIndexing)
     return false;
 
-  SDValue Vec = N->getOperand(0);
-  EVT VecVT = Vec.getValueType();
-  EVT EltVT = VecVT.getVectorElementType();
-  unsigned VecSize = VecVT.getSizeInBits();
-  unsigned EltSize = EltVT.getSizeInBits();
-  unsigned NumElem = VecVT.getVectorNumElements();
+  unsigned VecSize = EltSize * NumElem;
 
   // Sub-dword vectors of size 2 dword or less have better implementation.
   if (VecSize <= 64 && EltSize < 32)

@@ -9645,7 +9641,7 @@ static bool shouldExpandVectorDynExt(SDNode *N) {
     return true;
 
   // Always do this if var-idx is divergent, otherwise it will become a loop.
-  if (Idx->isDivergent())
+  if (IsDivergentIdx)
     return true;
 
   // Large vectors would yield too many compares and v_cndmask_b32 instructions.

@@ -9654,6 +9650,21 @@ static bool shouldExpandVectorDynExt(SDNode *N) {
   return NumInsts <= 16;
 }
 
+static bool shouldExpandVectorDynExt(SDNode *N) {
+  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
+  if (isa<ConstantSDNode>(Idx))
+    return false;
+
+  SDValue Vec = N->getOperand(0);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned EltSize = EltVT.getSizeInBits();
+  unsigned NumElem = VecVT.getVectorNumElements();
+
+  return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+                                                    Idx->isDivergent());
+}
+
 SDValue SITargetLowering::performExtractVectorEltCombine(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SDValue Vec = N->getOperand(0);
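The refactor above makes the profitability test usable from GlobalISel, where no SDNode is available: SITargetLowering::shouldExpandVectorDynExt now takes only the element size, element count, and index divergence, and the SDNode overload becomes a thin wrapper. A hedged sketch of the decision logic follows; the NumInsts estimate is an assumption (the line computing it is elided in the hunk above), roughly one compare plus one 32-bit select per dword lane per element:

// Hedged sketch of the profitability check; the NumInsts formula is assumed.
static bool shouldExpandSketch(unsigned EltSize, unsigned NumElem,
                               bool IsDivergentIdx) {
  unsigned VecSize = EltSize * NumElem;

  // Sub-dword vectors of two dwords or less have a better lowering.
  if (VecSize <= 64 && EltSize < 32)
    return false;

  // Always expand when the index is divergent; the alternative is a
  // waterfall loop.
  if (IsDivergentIdx)
    return true;

  // Large vectors would yield too many compares and v_cndmask_b32s.
  unsigned NumInsts = NumElem * ((EltSize + 31) / 32 + 1); // assumed estimate
  return NumInsts <= 16;
}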
@@ -9715,7 +9726,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
   unsigned EltSize = EltVT.getSizeInBits();
 
   // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
-  if (shouldExpandVectorDynExt(N)) {
+  if (::shouldExpandVectorDynExt(N)) {
     SDLoc SL(N);
     SDValue Idx = N->getOperand(1);
     SDValue V;

@@ -9778,7 +9789,7 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
 
   // INSERT_VECTOR_ELT (<n x e>, var-idx)
   // => BUILD_VECTOR n x select (e, const-idx)
-  if (!shouldExpandVectorDynExt(N))
+  if (!::shouldExpandVectorDynExt(N))
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
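The :: qualifier added at these two call sites is required because the class now declares a member with the same name: inside a member function, unqualified lookup finds SITargetLowering::shouldExpandVectorDynExt first, hiding the file-local static that takes an SDNode*. A minimal standalone illustration of this hiding rule (all names hypothetical):

#include <cstdio>

static bool check(const char *S) { return S != nullptr; } // file-scope helper

struct Widget {
  static bool check(int N) { return N > 0; } // member with the same name
  bool run(const char *S) {
    // check(S) would not compile here: unqualified lookup finds
    // Widget::check(int) and stops, hiding the file-scope overload.
    return ::check(S); // explicitly names the file-scope function
  }
};

int main() {
  std::printf("%d\n", Widget().run("idx"));
  return 0;
}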
llvm/lib/Target/AMDGPU/SIISelLowering.h

@@ -203,6 +203,11 @@ public:
   /// and not emit a relocation for an LDS global.
   bool shouldUseLDSConstAddress(const GlobalValue *GV) const;
 
+  /// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+  /// expanded into a set of cmp/select instructions.
+  static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
+                                       bool IsDivergentIdx);
+
 private:
   // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
   // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
Two file diffs suppressed because they are too large.