AMDGPU/GlobalISel: Fix G_EXTRACT_VECTOR_ELT mapping for s-v case
If an SGPR vector is indexed with a VGPR, the actual indexing will be done on the SGPR and produce an SGPR. A copy needs to be inserted inside the waterfall loop to the VGPR result.
commit 5cabb8357a
parent 375371cc8b
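In MIR terms, the case being fixed looks roughly like this (a condensed sketch of the pattern the updated tests below check; names like %vec, %idx, %uniform_idx, and %velt are illustrative abbreviations, not literal test output):

    ; SGPR source vector, VGPR index:
    %vec:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; ...through $sgpr15
    %idx:vgpr(s32) = COPY $vgpr0
    ; Inside the waterfall loop the index is made uniform with
    ; V_READFIRSTLANE_B32, so the extract reads only scalar registers and
    ; now produces an SGPR:
    %elt:sreg_32(s32) = G_EXTRACT_VECTOR_ELT %vec(<16 x s32>), %uniform_idx(s32)
    ; ...and a v_mov_b32 copies it to the VGPR result, keeping the dependency
    ; on the loop's exec mask explicit:
    %velt:vgpr_32(s32) = V_MOV_B32_e32 %elt(s32), implicit $exec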
@@ -1437,6 +1437,39 @@ AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
   return MIB;
 }
 
+bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
+                                        Register SrcReg) const {
+  MachineRegisterInfo &MRI = *B.getMRI();
+  LLT SrcTy = MRI.getType(SrcReg);
+  if (SrcTy.getSizeInBits() == 32) {
+    // Use a v_mov_b32 here to make the exec dependency explicit.
+    B.buildInstr(AMDGPU::V_MOV_B32_e32)
+      .addDef(DstReg)
+      .addUse(SrcReg);
+    return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
+           constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
+  }
+
+  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  B.buildInstr(AMDGPU::V_MOV_B32_e32)
+    .addDef(TmpReg0)
+    .addUse(SrcReg, 0, AMDGPU::sub0);
+  B.buildInstr(AMDGPU::V_MOV_B32_e32)
+    .addDef(TmpReg1)
+    .addUse(SrcReg, 0, AMDGPU::sub1);
+  B.buildInstr(AMDGPU::REG_SEQUENCE)
+    .addDef(DstReg)
+    .addUse(TmpReg0)
+    .addImm(AMDGPU::sub0)
+    .addUse(TmpReg1)
+    .addImm(AMDGPU::sub1);
+
+  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
+         constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
+}
+
 void AMDGPURegisterBankInfo::applyMappingImpl(
     const OperandsMapper &OpdMapper) const {
   MachineInstr &MI = OpdMapper.getMI();
@@ -1906,17 +1939,43 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
 
     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
 
-    if (DstRegs.empty()) {
-      applyDefaultMapping(OpdMapper);
-      executeInWaterfallLoop(MI, MRI, { 2 });
-      return;
-    }
-    LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+    MachineIRBuilder B(MI);
+
+    const ValueMapping &DstMapping
+      = OpdMapper.getInstrMapping().getOperandMapping(0);
+    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
+    const RegisterBank *SrcBank =
+      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
 
     Register DstReg = MI.getOperand(0).getReg();
     Register SrcReg = MI.getOperand(1).getReg();
     Register IdxReg = MI.getOperand(2).getReg();
+    LLT DstTy = MRI.getType(DstReg);
+    (void)DstTy;
+
+    // If this is a VGPR result only because the index was a VGPR result, the
+    // actual indexing will be done on the SGPR source vector, which will
+    // produce a scalar result. We need to copy to the VGPR result inside the
+    // waterfall loop.
+    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
+                                SrcBank == &AMDGPU::SGPRRegBank;
+    if (DstRegs.empty()) {
+      applyDefaultMapping(OpdMapper);
+
+      executeInWaterfallLoop(MI, MRI, { 2 });
+
+      if (NeedCopyToVGPR) {
+        // We don't want a phi for this temporary reg.
+        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
+        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
+        MI.getOperand(0).setReg(TmpReg);
+        B.setInsertPt(*MI.getParent(), ++MI.getIterator());
+
+        // Use a v_mov_b32 here to make the exec dependency explicit.
+        buildVCopy(B, DstReg, TmpReg);
+      }
+
+      return;
+    }
 
     assert(DstTy.getSizeInBits() == 64);
 
@@ -1924,7 +1983,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     const LLT S32 = LLT::scalar(32);
     LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
 
-    MachineIRBuilder B(MI);
     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
     auto One = B.buildConstant(S32, 1);
 
@@ -1937,16 +1995,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
     auto IdxLo = B.buildShl(S32, IdxReg, One);
     auto IdxHi = B.buildAdd(S32, IdxLo, One);
-    B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
-    B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
-
-    const ValueMapping &DstMapping
-      = OpdMapper.getInstrMapping().getOperandMapping(0);
+    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
+    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
 
-    // FIXME: Should be getting from mapping or not?
-    const RegisterBank *SrcBank =
-      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
-    MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank);
+    MRI.setRegBank(DstReg, *DstBank);
     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
@@ -1964,6 +2017,23 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     MI.eraseFromParent();
     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                            OpsToWaterfall, MRI);
+
+    if (NeedCopyToVGPR) {
+      MachineBasicBlock *LoopBB = Extract1->getParent();
+      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
+      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
+      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
+      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
+
+      Extract0->getOperand(0).setReg(TmpReg0);
+      Extract1->getOperand(0).setReg(TmpReg1);
+
+      B.setInsertPt(*LoopBB, ++Extract1->getIterator());
+
+      buildVCopy(B, DstRegs[0], TmpReg0);
+      buildVCopy(B, DstRegs[1], TmpReg1);
+    }
+
     return;
   }
   case AMDGPU::G_INSERT_VECTOR_ELT: {
@@ -45,7 +45,8 @@ public:
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
 
 private:
+  bool buildVCopy(MachineIRBuilder &B, Register DstReg, Register SrcReg) const;
 
   bool collectWaterfallOperands(
     SmallSet<Register, 4> &SGPROperandRegs,
     MachineInstr &MI,
@@ -55,7 +55,8 @@ body: |
     ; WAVE64: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1
     ; WAVE64: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
     ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-    ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
+    ; WAVE64: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
+    ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
     ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
     ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
     ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -63,7 +64,7 @@ body: |
     ; WAVE64: successors: %bb.3(0x80000000)
     ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
     ; WAVE64: .3:
-    ; WAVE64: $vgpr0 = COPY [[EVEC]](s32)
+    ; WAVE64: $vgpr0 = COPY [[V_MOV_B32_e32_]](s32)
     ; WAVE32-LABEL: name: extract_vector_elt_v16s32_sv
     ; WAVE32: successors: %bb.1(0x80000000)
     ; WAVE32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0
@@ -78,7 +79,8 @@ body: |
    ; WAVE32: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1
    ; WAVE32: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec
    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
-   ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
+   ; WAVE32: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32)
+   ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -86,7 +88,7 @@ body: |
    ; WAVE32: successors: %bb.3(0x80000000)
    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
    ; WAVE32: .3:
-   ; WAVE32: $vgpr0 = COPY [[EVEC]](s32)
+   ; WAVE32: $vgpr0 = COPY [[V_MOV_B32_e32_]](s32)
    %0:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
    %1:_(s32) = COPY $vgpr0
    %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %1
@@ -280,8 +282,10 @@ body: |
    ; WAVE64: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
    ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
    ; WAVE64: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
-   ; WAVE64: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-   ; WAVE64: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
+   ; WAVE64: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
+   ; WAVE64: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
+   ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
+   ; WAVE64: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC1]](s32), implicit $exec
    ; WAVE64: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
    ; WAVE64: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
    ; WAVE64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -289,7 +293,7 @@ body: |
    ; WAVE64: successors: %bb.3(0x80000000)
    ; WAVE64: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
    ; WAVE64: .3:
-   ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32)
+   ; WAVE64: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[V_MOV_B32_e32_]](s32), [[V_MOV_B32_e32_1]](s32)
    ; WAVE64: $vgpr0_vgpr1 = COPY [[MV]](s64)
    ; WAVE32-LABEL: name: extract_vector_elt_v8s64_sv
    ; WAVE32: successors: %bb.1(0x80000000)
@@ -315,8 +319,10 @@ body: |
    ; WAVE32: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec
    ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32)
    ; WAVE32: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]]
-   ; WAVE32: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
-   ; WAVE32: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
+   ; WAVE32: [[EVEC:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32)
+   ; WAVE32: [[EVEC1:%[0-9]+]]:sreg_32(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32)
+   ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC]](s32), implicit $exec
+   ; WAVE32: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32(s32) = V_MOV_B32_e32 [[EVEC1]](s32), implicit $exec
    ; WAVE32: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
    ; WAVE32: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc
    ; WAVE32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -324,7 +330,7 @@ body: |
    ; WAVE32: successors: %bb.3(0x80000000)
    ; WAVE32: $exec_lo = S_MOV_B32_term [[S_MOV_B32_term]]
    ; WAVE32: .3:
-   ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32)
+   ; WAVE32: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[V_MOV_B32_e32_]](s32), [[V_MOV_B32_e32_1]](s32)
    ; WAVE32: $vgpr0_vgpr1 = COPY [[MV]](s64)
    %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
    %1:_(s32) = COPY $vgpr0
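The 64-bit element tests above apply the same fix per 32-bit half. Condensed (again an illustrative sketch with abbreviated names, not literal test output): the source is bitcast to a <16 x s32> vector and indexed at 2*idx and 2*idx+1, each scalar half gets its own v_mov_b32 inside the loop, and the VGPR halves are merged after it:

    %lo:sreg_32(s32) = G_EXTRACT_VECTOR_ELT %vec32(<16 x s32>), %idx_lo(s32)
    %hi:sreg_32(s32) = G_EXTRACT_VECTOR_ELT %vec32(<16 x s32>), %idx_hi(s32)
    %vlo:vgpr_32(s32) = V_MOV_B32_e32 %lo(s32), implicit $exec
    %vhi:vgpr_32(s32) = V_MOV_B32_e32 %hi(s32), implicit $exec
    ; after the loop:
    %res:vgpr(s64) = G_MERGE_VALUES %vlo(s32), %vhi(s32)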