[AMDGPU] Make GCNRegBankReassign assign based on subreg banks

When scavenging, consider the sub-register of the source operand
to determine the bank of a candidate register (not just sub0).
Without this, it is possible to introduce an infinite loop;
e.g. $sgpr15_sgpr16_sgpr17 can be assigned for a conflict between
$sgpr0 and SGPR_96:sub1.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D84910
This commit is contained in:
Carl Ritson 2020-08-04 12:20:12 +09:00
parent bcea3a7a28
commit 57899934ea
4 changed files with 248 additions and 76 deletions

View File

@ -83,9 +83,10 @@ class GCNRegBankReassign : public MachineFunctionPass {
class Candidate {
public:
Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
unsigned weight)
: MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
Candidate(MachineInstr *mi, unsigned reg, unsigned subreg,
unsigned freebanks, unsigned weight)
: MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks),
Weight(weight) {}
bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
@ -100,6 +101,7 @@ class GCNRegBankReassign : public MachineFunctionPass {
MachineInstr *MI;
unsigned Reg;
unsigned SubReg;
unsigned FreeBanks;
unsigned Weight;
};
@ -162,7 +164,7 @@ private:
const MCPhysReg *CSRegs;
// Returns bank for a phys reg.
unsigned getPhysRegBank(unsigned Reg) const;
unsigned getPhysRegBank(unsigned Reg, unsigned SubReg) const;
// Return a bit set for each register bank used. 4 banks for VGPRs and
// 8 banks for SGPRs.
@ -176,7 +178,7 @@ private:
// a register chosen from Bank.
std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
unsigned Reg = AMDGPU::NoRegister,
int Bank = -1);
unsigned SubReg = 0, int Bank = -1);
// Return true if register is regular VGPR or SGPR or their tuples.
// Returns false for special registers like m0, vcc etc.
@ -216,11 +218,12 @@ private:
// candidates are collected and added to work list.
unsigned computeStallCycles(unsigned SrcReg,
unsigned Reg = AMDGPU::NoRegister,
int Bank = -1, bool Collect = false);
unsigned SubReg = 0, int Bank = -1,
bool Collect = false);
// Search for a register in Bank unused within LI.
// Returns phys reg or NoRegister.
unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
unsigned scavengeReg(LiveInterval &LI, unsigned Bank, unsigned SubReg) const;
// Try to reassign candidate. Returns number or stall cycles saved.
unsigned tryReassign(Candidate &C);
@ -277,15 +280,24 @@ char GCNRegBankReassign::ID = 0;
char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg,
unsigned SubReg) const {
assert(Register::isPhysicalRegister(Reg));
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getRegSizeInBits(*RC);
if (Size == 16)
Reg = TRI->get32BitRegister(Reg);
else if (Size > 32)
Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
else if (Size > 32) {
if (SubReg) {
const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg);
Reg = TRI->getSubReg(Reg, SubReg);
if (TRI->getRegSizeInBits(*SubRC) > 32)
Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
} else {
Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
}
}
if (TRI->hasVGPRs(RC)) {
Reg -= AMDGPU::VGPR0;
@ -360,7 +372,7 @@ uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
std::pair<unsigned, unsigned>
GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
int Bank) {
unsigned SubReg, int Bank) {
unsigned StallCycles = 0;
unsigned UsedBanks = 0;
@ -375,26 +387,39 @@ GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
if (!Op.isReg() || Op.isUndef())
continue;
Register R = Op.getReg();
if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
const Register R = Op.getReg();
const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, R);
// Do not compute stalls for AGPRs
if (TRI->hasAGPRs(RC))
continue;
// Do not compute stalls if sub-register covers all banks
if (Op.getSubReg()) {
LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
if (TRI->hasVGPRs(RC)) {
if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
continue;
} else {
if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
continue;
}
}
unsigned ShiftedBank = Bank;
if (Bank != -1 && R == Reg && Op.getSubReg()) {
unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg());
LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
if (Offset && Bank < NUM_VGPR_BANKS) {
// If a register spans all banks we cannot shift it to avoid conflict.
if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
continue;
ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS;
} else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) {
// If a register spans all banks we cannot shift it to avoid conflict.
if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
continue;
if (Bank != -1 && R == Reg && (Op.getSubReg() || SubReg)) {
unsigned RegOffset =
TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0);
unsigned Offset = TRI->getChannelFromSubReg(
Op.getSubReg() ? Op.getSubReg() : (unsigned)AMDGPU::sub0);
if (Bank < NUM_VGPR_BANKS) {
unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset);
ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS;
} else if (Bank >= SGPR_BANK_OFFSET) {
unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1);
ShiftedBank = SGPR_BANK_OFFSET +
(Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS;
(Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS;
}
}
@ -576,17 +601,17 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
if (FreeBanks1)
Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
+ ((Size2 > Size1) ? 1 : 0)));
Candidates.push(Candidate(&MI, Reg1, SubReg1, FreeBanks1,
Weight + ((Size2 > Size1) ? 1 : 0)));
if (FreeBanks2)
Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
+ ((Size1 > Size2) ? 1 : 0)));
Candidates.push(Candidate(&MI, Reg2, SubReg2, FreeBanks2,
Weight + ((Size1 > Size2) ? 1 : 0)));
}
}
}
unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
unsigned Reg, int Bank,
unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, unsigned Reg,
unsigned SubReg, int Bank,
bool Collect) {
unsigned TotalStallCycles = 0;
SmallSet<const MachineInstr *, 16> Visited;
@ -598,7 +623,7 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
continue;
unsigned StallCycles;
unsigned UsedBanks;
std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank);
std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank);
TotalStallCycles += StallCycles;
if (Collect)
collectCandidates(MI, UsedBanks, StallCycles);
@ -607,8 +632,8 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
return TotalStallCycles;
}
unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
unsigned Bank) const {
unsigned GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank,
unsigned SubReg) const {
const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
: MaxNumSGPRs;
@ -620,7 +645,7 @@ unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
if (TRI->isSubRegisterEq(Reg, MaxReg))
break;
if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank)
continue;
for (unsigned I = 0; CSRegs[I]; ++I)
@ -669,7 +694,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
if (C.FreeBanks & (1 << Bank)) {
LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank);
if (Stalls < OrigStalls) {
LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
<< Stalls << '\n');
@ -683,7 +708,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
LRM->unassign(LI);
while (!BankStalls.empty()) {
BankStall BS = BankStalls.pop_back_val();
unsigned Reg = scavengeReg(LI, BS.Bank);
unsigned Reg = scavengeReg(LI, BS.Bank, C.SubReg);
if (Reg == AMDGPU::NoRegister) {
LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
<< '\n');
@ -801,7 +826,7 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
Candidates.pop_back();
if (LocalCyclesSaved) {
removeCandidates(C.Reg);
computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
Candidates.sort();
LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";

View File

@ -1492,7 +1492,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18
; MOVREL-NEXT: v_mov_b32_e32 v19, v0
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
; MOVREL-NEXT: v_mov_b32_e32 v20, v1
; MOVREL-NEXT: v_mov_b32_e32 v23, v1
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18
@ -1501,7 +1501,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4
@ -2123,7 +2123,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18
; MOVREL-NEXT: v_mov_b32_e32 v19, v0
; MOVREL-NEXT: v_mov_b32_e32 v20, v1
; MOVREL-NEXT: v_mov_b32_e32 v23, v1
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18
@ -2137,7 +2137,7 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, do
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4
@ -4111,7 +4111,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16
; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16
; MOVREL-NEXT: v_mov_b32_e32 v17, v2
; MOVREL-NEXT: v_mov_b32_e32 v19, v2
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
; MOVREL-NEXT: v_mov_b32_e32 v18, v3
; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2
@ -4119,7 +4119,7 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4
; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5
; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v17, v14, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v2, v19, v14, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3
; MOVREL-NEXT: v_cndmask_b32_e64 v3, v18, v15, s0
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
@ -4251,42 +4251,42 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_v_s(<5 x double> inreg
; MOVREL-NEXT: s_mov_b32 s7, s9
; MOVREL-NEXT: s_mov_b32 s8, s10
; MOVREL-NEXT: s_mov_b32 s9, s11
; MOVREL-NEXT: v_mov_b32_e32 v18, s15
; MOVREL-NEXT: v_mov_b32_e32 v17, s14
; MOVREL-NEXT: v_mov_b32_e32 v16, s13
; MOVREL-NEXT: v_mov_b32_e32 v15, s12
; MOVREL-NEXT: v_mov_b32_e32 v14, s11
; MOVREL-NEXT: v_mov_b32_e32 v13, s10
; MOVREL-NEXT: v_mov_b32_e32 v12, s9
; MOVREL-NEXT: v_mov_b32_e32 v11, s8
; MOVREL-NEXT: v_mov_b32_e32 v10, s7
; MOVREL-NEXT: v_mov_b32_e32 v9, s6
; MOVREL-NEXT: v_mov_b32_e32 v8, s5
; MOVREL-NEXT: v_mov_b32_e32 v7, s4
; MOVREL-NEXT: v_mov_b32_e32 v6, s3
; MOVREL-NEXT: v_mov_b32_e32 v5, s2
; MOVREL-NEXT: v_mov_b32_e32 v4, s1
; MOVREL-NEXT: v_mov_b32_e32 v3, s0
; MOVREL-NEXT: v_mov_b32_e32 v20, s15
; MOVREL-NEXT: v_mov_b32_e32 v19, s14
; MOVREL-NEXT: v_mov_b32_e32 v18, s13
; MOVREL-NEXT: v_mov_b32_e32 v17, s12
; MOVREL-NEXT: v_mov_b32_e32 v16, s11
; MOVREL-NEXT: v_mov_b32_e32 v15, s10
; MOVREL-NEXT: v_mov_b32_e32 v14, s9
; MOVREL-NEXT: v_mov_b32_e32 v13, s8
; MOVREL-NEXT: v_mov_b32_e32 v12, s7
; MOVREL-NEXT: v_mov_b32_e32 v11, s6
; MOVREL-NEXT: v_mov_b32_e32 v10, s5
; MOVREL-NEXT: v_mov_b32_e32 v9, s4
; MOVREL-NEXT: v_mov_b32_e32 v8, s3
; MOVREL-NEXT: v_mov_b32_e32 v7, s2
; MOVREL-NEXT: v_mov_b32_e32 v6, s1
; MOVREL-NEXT: v_mov_b32_e32 v5, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1
; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v5, v6, v1, s0
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v5, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v4, v7, v0, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2
; MOVREL-NEXT: v_cndmask_b32_e64 v5, v8, v1, s0
; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3
; MOVREL-NEXT: v_readfirstlane_b32 s2, v4
; MOVREL-NEXT: v_readfirstlane_b32 s3, v5
; MOVREL-NEXT: v_cndmask_b32_e32 v6, v7, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v10, v1, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v0, v11, v0, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s1
; MOVREL-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e64 v8, v11, v0, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v9, v12, v1, s0
; MOVREL-NEXT: v_cndmask_b32_e64 v0, v13, v0, s1
; MOVREL-NEXT: v_cndmask_b32_e64 v1, v14, v1, s1
; MOVREL-NEXT: v_readfirstlane_b32 s0, v2
; MOVREL-NEXT: v_readfirstlane_b32 s1, v3
; MOVREL-NEXT: v_readfirstlane_b32 s3, v5
; MOVREL-NEXT: v_readfirstlane_b32 s4, v6
; MOVREL-NEXT: v_readfirstlane_b32 s5, v7
; MOVREL-NEXT: v_readfirstlane_b32 s6, v8
@ -4448,7 +4448,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
; MOVREL-NEXT: v_mov_b32_e32 v13, v2
; MOVREL-NEXT: v_mov_b32_e32 v15, v2
; MOVREL-NEXT: v_mov_b32_e32 v14, v3
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
@ -4457,7 +4457,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec,
; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
; MOVREL-NEXT: v_readfirstlane_b32 s3, v3
; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
@ -4514,7 +4514,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v:
; MOVREL: ; %bb.0: ; %entry
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
; MOVREL-NEXT: v_mov_b32_e32 v13, v2
; MOVREL-NEXT: v_mov_b32_e32 v15, v2
; MOVREL-NEXT: v_mov_b32_e32 v14, v3
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
@ -4522,7 +4522,7 @@ define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec,
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo
; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
; MOVREL-NEXT: v_readfirstlane_b32 s2, v2

View File

@ -0,0 +1,69 @@
# RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-WavefrontSize32,+WavefrontSize64 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
# Test that subreg reassignments are correctly handled when the whole register
# also conflicts. If this is mishandled, stall counts will be incorrect and
# cause an infinite loop.
# GCN-LABEL: vgpr64_mixed_use{{$}}
# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF
# GCN: $vgpr4_vgpr5 = IMPLICIT_DEF
# GCN: $vcc = IMPLICIT_DEF
# GCN: $vgpr2_vgpr3 = IMPLICIT_DEF
# GCN: $vgpr6_vgpr7 = IMPLICIT_DEF
# GCN: $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
# GCN: $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
# GCN: $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr5, $vcc, implicit $exec
# GCN: $vgpr2 = V_CNDMASK_B32_e64 0, $vgpr0, 0, $vgpr4, killed $vcc, implicit $exec
# GCN: $sgpr0_sgpr1 = V_CMP_LT_U64_e64 $vgpr4_vgpr5, $vgpr0_vgpr1, implicit $exec
---
name: vgpr64_mixed_use
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
- { id: 1, class: vreg_64, preferred-register: '$vgpr4_vgpr5' }
- { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' }
- { id: 3, class: vgpr_32 }
- { id: 4, class: vgpr_32 }
- { id: 5, class: sreg_64_xexec }
- { id: 6, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
- { id: 7, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
- { id: 8, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
- { id: 9, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
- { id: 10, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
- { id: 11, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
- { id: 12, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
- { id: 13, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
- { id: 14, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
- { id: 15, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
- { id: 16, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
- { id: 17, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%2 = IMPLICIT_DEF
%6 = IMPLICIT_DEF
%7 = IMPLICIT_DEF
%8 = IMPLICIT_DEF
%9 = IMPLICIT_DEF
%10 = IMPLICIT_DEF
%11 = IMPLICIT_DEF
%12 = IMPLICIT_DEF
%13 = IMPLICIT_DEF
%14 = IMPLICIT_DEF
%15 = IMPLICIT_DEF
%16 = IMPLICIT_DEF
%17 = IMPLICIT_DEF
%3 = V_CNDMASK_B32_e64 0, %0.sub1, 0, %1.sub1, %2, implicit $exec
%4 = V_CNDMASK_B32_e64 0, %0.sub0, 0, %1.sub0, %2, implicit $exec
%5 = V_CMP_LT_U64_e64 %1, %0, implicit $exec
S_ENDPGM 0
...

View File

@ -494,3 +494,81 @@ body: |
%2 = V_AND_B32_e32 %1, %0, implicit $exec
S_ENDPGM 0
...
# Test that the bank of a subreg is considered during scavenging.
# If handled incorrectly, an infinite loop occurs.
# GCN-LABEL: s0_vs_s15_16_17_sub1{{$}}
# GCN: S_AND_B32 renamable $sgpr13, $sgpr0,
---
name: s0_vs_s15_16_17_sub1
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_96, preferred-register: '$sgpr15_sgpr16_sgpr17' }
- { id: 1, class: sgpr_32 }
body: |
bb.0:
%0 = IMPLICIT_DEF
$sgpr0 = IMPLICIT_DEF
%1 = S_AND_B32 %0.sub1, $sgpr0, implicit-def $scc
S_ENDPGM 0
...
# Test that the size of a subreg is correctly handled in the bank calculation.
# If handled incorrectly, an infinite loop occurs.
# GCN-LABEL: vgpr_sub_dependence{{$}}
# GCN: $vgpr9_vgpr10_vgpr11_vgpr12 = IMPLICIT_DEF
# GCN: $vgpr16_vgpr17 = IMPLICIT_DEF
# GCN: $vgpr14_vgpr15 = IMPLICIT_DEF
# GCN: $vgpr0_vgpr1 = IMPLICIT_DEF
# GCN: $vgpr7_vgpr8 = IMPLICIT_DEF
# GCN: $vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF
# GCN: $vgpr18_vgpr19 = IMPLICIT_DEF
# GCN: $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
# GCN: $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
# GCN: $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
# GCN: $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
# GCN: $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
# GCN: $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
# GCN: $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
# GCN: $vgpr0_vgpr1 = V_ADD_F64 0, $vgpr11_vgpr12, 0, killed $vgpr16_vgpr17, 0, 0, implicit $mode, implicit $exec
# GCN: $vgpr0_vgpr1 = V_ADD_F64 0, $vgpr9_vgpr10, 0, killed $vgpr14_vgpr15, 0, 0, implicit $mode, implicit $exec
---
name: vgpr_sub_dependence
tracksRegLiveness: true
registers:
- { id: 0, class: vreg_128, preferred-register: '$vgpr10_vgpr11_vgpr12_vgpr13' }
- { id: 1, class: vreg_64, preferred-register: '$vgpr16_vgpr17' }
- { id: 2, class: vreg_64, preferred-register: '$vgpr14_vgpr15' }
- { id: 3, class: vreg_64 }
- { id: 4, class: vreg_64 }
- { id: 5, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
- { id: 6, class: vreg_64, preferred-register: '$vgpr7_vgpr8' }
- { id: 7, class: vreg_128, preferred-register: '$vgpr3_vgpr4_vgpr5_vgpr6' }
- { id: 8, class: vreg_64, preferred-register: '$vgpr18_vgpr19' }
- { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
- { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
- { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
- { id: 12, class: vreg_128, preferred-register: '$vgpr32_vgpr33_vgpr34_vgpr35' }
- { id: 13, class: vreg_128, preferred-register: '$vgpr36_vgpr37_vgpr38_vgpr39' }
- { id: 14, class: vreg_128, preferred-register: '$vgpr40_vgpr41_vgpr42_vgpr43' }
- { id: 15, class: vreg_128, preferred-register: '$vgpr44_vgpr45_vgpr46_vgpr47' }
body: |
bb.0:
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
%2 = IMPLICIT_DEF
%5 = IMPLICIT_DEF
%6 = IMPLICIT_DEF
%7 = IMPLICIT_DEF
%8 = IMPLICIT_DEF
%9 = IMPLICIT_DEF
%10 = IMPLICIT_DEF
%11 = IMPLICIT_DEF
%12 = IMPLICIT_DEF
%13 = IMPLICIT_DEF
%14 = IMPLICIT_DEF
%15 = IMPLICIT_DEF
%3 = V_ADD_F64 0, %0.sub2_sub3:vreg_128, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec
%4 = V_ADD_F64 0, %0.sub0_sub1:vreg_128, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
...