forked from OSchip/llvm-project
[AMDGPU] SIFixSGPRCopies should not modify non-divergent PHI
Differential revision: https://reviews.llvm.org/D40556 llvm-svn: 319534
This commit is contained in:
parent
11ce6e6a83
commit
c1425c9d6b
|
@ -81,6 +81,7 @@
|
||||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||||
#include "llvm/CodeGen/MachineOperand.h"
|
#include "llvm/CodeGen/MachineOperand.h"
|
||||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||||
|
#include "llvm/CodeGen/MachinePostDominators.h"
|
||||||
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
||||||
#include "llvm/Pass.h"
|
#include "llvm/Pass.h"
|
||||||
#include "llvm/Support/CodeGen.h"
|
#include "llvm/Support/CodeGen.h"
|
||||||
|
@ -109,7 +110,12 @@ namespace {
|
||||||
|
|
||||||
class SIFixSGPRCopies : public MachineFunctionPass {
|
class SIFixSGPRCopies : public MachineFunctionPass {
|
||||||
MachineDominatorTree *MDT;
|
MachineDominatorTree *MDT;
|
||||||
|
MachinePostDominatorTree *MPDT;
|
||||||
|
DenseMap<MachineBasicBlock *, SetVector<MachineBasicBlock*>> PDF;
|
||||||
|
void computePDF(MachineFunction * MF);
|
||||||
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
||||||
|
void printPDF();
|
||||||
|
#endif
|
||||||
public:
|
public:
|
||||||
static char ID;
|
static char ID;
|
||||||
|
|
||||||
|
@ -122,6 +128,8 @@ public:
|
||||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||||
AU.addRequired<MachineDominatorTree>();
|
AU.addRequired<MachineDominatorTree>();
|
||||||
AU.addPreserved<MachineDominatorTree>();
|
AU.addPreserved<MachineDominatorTree>();
|
||||||
|
AU.addRequired<MachinePostDominatorTree>();
|
||||||
|
AU.addPreserved<MachinePostDominatorTree>();
|
||||||
AU.setPreservesCFG();
|
AU.setPreservesCFG();
|
||||||
MachineFunctionPass::getAnalysisUsage(AU);
|
MachineFunctionPass::getAnalysisUsage(AU);
|
||||||
}
|
}
|
||||||
|
@ -409,12 +417,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
|
|
||||||
const TargetRegisterInfo *TRI) {
|
|
||||||
return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
|
|
||||||
return hasTerminatorThatModifiesExec(*MBB, *TRI); });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Checks if there is potential path From instruction To instruction.
|
// Checks if there is potential path From instruction To instruction.
|
||||||
// If CutOff is specified and it sits in between of that path we ignore
|
// If CutOff is specified and it sits in between of that path we ignore
|
||||||
// a higher portion of the path and report it is not reachable.
|
// a higher portion of the path and report it is not reachable.
|
||||||
|
@ -562,12 +564,47 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
|
||||||
return Changed;
|
return Changed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SIFixSGPRCopies::computePDF(MachineFunction *MF) {
|
||||||
|
MachineFunction::iterator B = MF->begin();
|
||||||
|
MachineFunction::iterator E = MF->end();
|
||||||
|
for (; B != E; ++B) {
|
||||||
|
if (B->succ_size() > 1) {
|
||||||
|
for (auto S : B->successors()) {
|
||||||
|
MachineDomTreeNode *runner = MPDT->getNode(&*S);
|
||||||
|
MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom();
|
||||||
|
while (runner && runner != sentinel) {
|
||||||
|
PDF[runner->getBlock()].insert(&*B);
|
||||||
|
runner = runner->getIDom();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
||||||
|
void SIFixSGPRCopies::printPDF() {
|
||||||
|
dbgs() << "\n######## PostDominanceFrontiers set #########\n";
|
||||||
|
for (auto &I : PDF) {
|
||||||
|
dbgs() << "PDF[ " << I.first->getNumber() << "] : ";
|
||||||
|
for (auto &J : I.second) {
|
||||||
|
dbgs() << J->getNumber() << ' ';
|
||||||
|
}
|
||||||
|
dbgs() << '\n';
|
||||||
|
}
|
||||||
|
dbgs() << "\n##############################################\n";
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
|
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
|
||||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||||
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
||||||
const SIInstrInfo *TII = ST.getInstrInfo();
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
||||||
MDT = &getAnalysis<MachineDominatorTree>();
|
MDT = &getAnalysis<MachineDominatorTree>();
|
||||||
|
MPDT = &getAnalysis<MachinePostDominatorTree>();
|
||||||
|
PDF.clear();
|
||||||
|
computePDF(&MF);
|
||||||
|
DEBUG(printPDF());
|
||||||
|
|
||||||
SmallVector<MachineInstr *, 16> Worklist;
|
SmallVector<MachineInstr *, 16> Worklist;
|
||||||
|
|
||||||
|
@ -621,15 +658,27 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
|
||||||
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
|
if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
// We don't need to fix the PHI if the common dominator of the
|
// We don't need to fix the PHI if all the source blocks
|
||||||
// two incoming blocks terminates with a uniform branch.
|
// have no divergent control dependencies
|
||||||
bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
|
bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
|
||||||
if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
|
if (!HasVGPROperand) {
|
||||||
MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
|
bool Uniform = true;
|
||||||
MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
|
MachineBasicBlock * Join = MI.getParent();
|
||||||
|
for (auto &O : MI.explicit_operands()) {
|
||||||
if (!predsHasDivergentTerminator(MBB0, TRI) &&
|
if (O.isMBB()) {
|
||||||
!predsHasDivergentTerminator(MBB1, TRI)) {
|
MachineBasicBlock * Source = O.getMBB();
|
||||||
|
SetVector<MachineBasicBlock*> &SourcePDF = PDF[Source];
|
||||||
|
SetVector<MachineBasicBlock*> &JoinPDF = PDF[Join];
|
||||||
|
SetVector<MachineBasicBlock*> CDList;
|
||||||
|
for (auto &I : SourcePDF) {
|
||||||
|
if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) {
|
||||||
|
if (hasTerminatorThatModifiesExec(*I, *TRI))
|
||||||
|
Uniform = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (Uniform) {
|
||||||
DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
|
DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -89,7 +89,7 @@ endif:
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}divergent_loop:
|
; GCN-LABEL: {{^}}divergent_loop:
|
||||||
; VGPR: workitem_private_segment_byte_size = 16{{$}}
|
; VGPR: workitem_private_segment_byte_size = 12{{$}}
|
||||||
|
|
||||||
; GCN: {{^}}; BB#0:
|
; GCN: {{^}}; BB#0:
|
||||||
|
|
||||||
|
@ -123,10 +123,9 @@ endif:
|
||||||
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
|
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
|
||||||
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
|
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
|
||||||
; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
|
; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
|
||||||
; GCN: v_cmp_ne_u32_e32 vcc,
|
; GCN: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}}
|
||||||
; GCN: s_and_b64 vcc, exec, vcc
|
|
||||||
; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
|
; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
|
||||||
; GCN-NEXT: s_cbranch_vccnz [[LOOP]]
|
; GCN-NEXT: s_cbranch_scc1 [[LOOP]]
|
||||||
|
|
||||||
|
|
||||||
; GCN: [[END]]:
|
; GCN: [[END]]:
|
||||||
|
|
|
@ -0,0 +1,39 @@
|
||||||
|
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||||
|
|
||||||
|
; GCN-LABEL: BB0_2
|
||||||
|
; GCN-NOT: v_readfirstlane
|
||||||
|
|
||||||
|
|
||||||
|
target triple = "amdgcn--amdhsa"
|
||||||
|
define amdgpu_kernel void @uniform-PHI(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
|
||||||
|
bb:
|
||||||
|
%tmp = sext i32 %arg2 to i64
|
||||||
|
%tmp3 = tail call i64 @_Z13get_global_idj(i32 0) #2
|
||||||
|
%tmp4 = icmp ugt i64 %tmp3, %tmp
|
||||||
|
%tmp5 = icmp sgt i32 %arg2, 0
|
||||||
|
%tmp6 = and i1 %tmp4, %tmp5
|
||||||
|
br i1 %tmp6, label %bb7, label %bb17
|
||||||
|
|
||||||
|
bb7: ; preds = %bb
|
||||||
|
br label %bb8
|
||||||
|
|
||||||
|
bb8: ; preds = %bb8, %bb7
|
||||||
|
%tmp9 = phi i32 [ %tmp15, %bb8 ], [ 0, %bb7 ]
|
||||||
|
%tmp10 = phi i32 [ %tmp14, %bb8 ], [ 0, %bb7 ]
|
||||||
|
%tmp11 = zext i32 %tmp9 to i64
|
||||||
|
%tmp12 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp11
|
||||||
|
%tmp13 = load i32, i32 addrspace(1)* %tmp12, align 4
|
||||||
|
%tmp14 = add nsw i32 %tmp13, %tmp10
|
||||||
|
%tmp15 = add nuw nsw i32 %tmp9, 1
|
||||||
|
%tmp16 = icmp eq i32 %tmp15, %arg2
|
||||||
|
br i1 %tmp16, label %bb17, label %bb8
|
||||||
|
|
||||||
|
bb17: ; preds = %bb8, %bb
|
||||||
|
%tmp18 = phi i32 [ 0, %bb ], [ %tmp14, %bb8 ]
|
||||||
|
store i32 %tmp18, i32 addrspace(1)* %arg1, align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
declare i64 @_Z13get_global_idj(i32) local_unnamed_addr #1
|
||||||
|
attributes #1 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "target-features"="+16-bit-insts,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||||
|
attributes #2 = { convergent nounwind readnone }
|
|
@ -6,11 +6,10 @@
|
||||||
; CHECK: v_cmp_ne_u32_e32 vcc, 0
|
; CHECK: v_cmp_ne_u32_e32 vcc, 0
|
||||||
; CHECK: s_and_saveexec_b64
|
; CHECK: s_and_saveexec_b64
|
||||||
; CHECK-NEXT: ; mask branch
|
; CHECK-NEXT: ; mask branch
|
||||||
; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
|
|
||||||
; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
|
; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
|
||||||
|
|
||||||
; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
|
; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
|
||||||
; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]]
|
; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]]
|
||||||
|
|
||||||
; CHECK: s_endpgm
|
; CHECK: s_endpgm
|
||||||
define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
|
define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
|
||||||
|
@ -35,7 +34,6 @@ out:
|
||||||
; CHECK-LABEL: {{^}}test2:
|
; CHECK-LABEL: {{^}}test2:
|
||||||
; CHECK: s_and_saveexec_b64
|
; CHECK: s_and_saveexec_b64
|
||||||
; CHECK-NEXT: ; mask branch
|
; CHECK-NEXT: ; mask branch
|
||||||
; CHECK-NEXT: s_cbranch_execz
|
|
||||||
define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
|
||||||
main_body:
|
main_body:
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||||
|
|
|
@ -162,8 +162,8 @@ exit:
|
||||||
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
|
; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
|
||||||
; SI: buffer_load_dword
|
; SI: buffer_load_dword
|
||||||
; SI-DAG: buffer_store_dword
|
; SI-DAG: buffer_store_dword
|
||||||
; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100
|
; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
|
||||||
; SI: s_cbranch_vccz [[LABEL_LOOP]]
|
; SI: s_cbranch_scc0 [[LABEL_LOOP]]
|
||||||
; SI: [[LABEL_EXIT]]:
|
; SI: [[LABEL_EXIT]]:
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue