[AMDGPU] Propagate AGPR RC from PHI to its PHI operands

We can fix the register class of a PHI when all of its uses are AGPRs.
However, that leaves behind any PHIs which were already processed
earlier. Propagate the RC back to those operands of a PHI that are PHIs themselves.

Differential Revision: https://reviews.llvm.org/D77344
This commit is contained in:
Stanislav Mekhanoshin 2020-04-02 16:06:45 -07:00
parent b4b7c989d6
commit 0462795095
2 changed files with 55 additions and 0 deletions

View File

@ -766,6 +766,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
bool AllAGPRUses = true;
SetVector<const MachineInstr *> worklist;
SmallSet<const MachineInstr *, 4> Visited;
SetVector<MachineInstr *> PHIOperands;
worklist.insert(&MI);
Visited.insert(&MI);
while (!worklist.empty()) {
@ -810,6 +811,11 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
if (DefMI && DefMI->isPHI())
PHIOperands.insert(DefMI);
}
}
bool hasVGPRInput = false;
@ -845,4 +851,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
TII->legalizeOperands(MI, MDT);
}
// Propagate register class back to PHI operands which are PHIs themselves.
while (!PHIOperands.empty()) {
processPHINode(*PHIOperands.pop_back_val());
}
}

View File

@ -487,5 +487,50 @@ exit:
ret void
}
; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; Check that we do not copy agprs to vgprs and back in an outer loop.
; GCN: [[OUTER_LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: [[INNER_LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[INNER_LOOP]]
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[OUTER_LOOP]]
; Final result should be read only once after the loop.
; GCN-COUNT-32: v_accvgpr_read_b32
; Nested-loop MFMA test. The outer-loop PHI (%phi.0) is an incoming value of
; the inner-loop PHI (%phi), i.e. one PHI is an operand of another PHI.
; %phi.0 starts from zeroinitializer; both PHIs are fed by the MFMA result
; %mai.1 on their back edges.
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
entry:
br label %for.cond.preheader
; Outer loop header: %phi.0 carries the <32 x float> value across outer
; iterations, %c.0 is the outer iteration counter.
for.cond.preheader:
%phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
%c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
br label %inner.for.cond.preheader
; Inner loop (single block that branches back to itself): %phi takes the
; outer PHI on entry and the MFMA result on the back edge.
inner.for.cond.preheader:
%phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
%c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
; %phi is passed as the <32 x float> operand of the MFMA intrinsic.
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
%inc = add nuw nsw i32 %c, 1
; Leave the inner loop once the inner counter reaches 16.
%cc = icmp eq i32 %inc, 16
br i1 %cc, label %inner.exit, label %inner.for.cond.preheader
; Outer loop latch: bump the outer counter and exit once it reaches 16.
inner.exit:
%inc.0 = add nuw nsw i32 %c.0, 1
%cc.0 = icmp eq i32 %inc.0, 16
br i1 %cc.0, label %exit, label %for.cond.preheader
; After both loops, store the last MFMA result to the kernel argument pointer.
exit:
store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
ret void
}
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()