AMDGPU: add execfix flag to SI_ELSE
Summary: SI_ELSE is lowered into two parts:

  s_or_saveexec_b64 dst, src   (at the start of the basic block)
  s_xor_b64 exec, exec, dst    (at the end of the basic block)

The idea is that dst contains the exec mask of the preceding IF block. It can
happen that SIWholeQuadMode decides to switch from WQM to Exact mode inside
the basic block that contains SI_ELSE, in which case it introduces an
instruction

  s_and_b64 exec, exec, s[...]

which masks out bits that can correspond to both the IF and the ELSE paths.
So the resulting sequence must be:

  s_or_saveexec_b64 dst, src
  s_and_b64 exec, exec, s[...]   <-- added by SIWholeQuadMode
  s_and_b64 dst, dst, exec       <-- added by SILowerControlFlow
  s_xor_b64 exec, exec, dst

Whether to add the additional s_and_b64 dst, dst, exec is currently determined
via the ExecModified tracking. With this change, it is instead determined by an
additional flag on SI_ELSE which is set by SIWholeQuadMode.

Finally: it also occurred to me that an alternative approach for the long run
is for SILowerControlFlow to unconditionally emit

  s_or_saveexec_b64 dst, src
  ...
  s_and_b64 dst, dst, exec
  s_xor_b64 exec, exec, dst

and have a pass that detects and cleans up the "redundant AND with exec"
pattern where possible. This could be useful anyway, because we also add
instructions

  s_and_b64 vcc, exec, vcc

before s_cbranch_scc (in moveToALU), and those are often redundant. I have some
pending changes to how KILL is lowered that could also benefit from such a
cleanup pass.

In any case, this current patch could help in the short term with the whole
ExecModified business.

Reviewers: tstellarAMD, arsenm

Subscribers: arsenm, llvm-commits, kzhuravl

Differential Revision: https://reviews.llvm.org/D22846

llvm-svn: 276972
parent 1081ccf855
commit 3b572002a2
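The commit message above mentions, as a possible long-term alternative, a cleanup pass that detects and removes redundant "AND with exec" instructions. The following is only a rough sketch of the core check such a pass might perform; it is not part of this commit, the helper name isRedundantAndWithExec is made up for this example, and it assumes both instructions sit in the same basic block with dst produced by the s_or_saveexec_b64:

// Illustrative sketch only -- not part of this commit. Assumes it would live
// inside the AMDGPU backend, so the target-local headers are available.
#include "AMDGPU.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <iterator>

using namespace llvm;

// Given DefMI = "s_or_saveexec_b64 dst, src" and a later
// AndMI = "s_and_b64 dst, dst, exec" in the same basic block: after the
// saveexec, dst (the saved exec) is a subset of the new exec, so if no
// instruction in between writes EXEC, the AND cannot change dst and is a
// no-op that could be erased.
static bool isRedundantAndWithExec(const MachineInstr &DefMI,
                                   const MachineInstr &AndMI,
                                   const SIRegisterInfo *TRI) {
  for (auto I = std::next(DefMI.getIterator()), E = AndMI.getIterator();
       I != E; ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI))
      return false; // exec changed in between; the AND may actually mask bits.
  return true;
}

A real pass would additionally have to verify the operand shapes (that AndMI really is an AND of dst with exec, and that dst is not redefined in between) before erasing the instruction.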
SIInstructions.td:

@@ -1952,8 +1952,7 @@ def SI_IF: PseudoInstSI <
 }

 def SI_ELSE : PseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target),
-  [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> {
+  (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix)> {
   let Constraints = "$src = $dst";
 }

@@ -2132,6 +2131,11 @@ def SI_PC_ADD_REL_OFFSET : PseudoInstSI <

 let Predicates = [isGCN] in {

+def : Pat<
+  (int_amdgcn_else i64:$src, bb:$target),
+  (SI_ELSE $src, $target, 0)
+>;
+
 def : Pat <
   (int_AMDGPU_kilp),
   (SI_KILL 0xbf800000)
SILowerControlFlow.cpp:

@@ -84,7 +84,7 @@ private:
   bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

   void If(MachineInstr &MI);
-  void Else(MachineInstr &MI, bool ExecModified);
+  void Else(MachineInstr &MI);
   void Break(MachineInstr &MI);
   void IfBreak(MachineInstr &MI);
   void ElseBreak(MachineInstr &MI);
@@ -252,7 +252,7 @@ void SILowerControlFlow::If(MachineInstr &MI) {
   MI.eraseFromParent();
 }

-void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
+void SILowerControlFlow::Else(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   unsigned Dst = MI.getOperand(0).getReg();
@@ -262,7 +262,7 @@ void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
           TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
           .addReg(Src); // Saved EXEC

-  if (ExecModified) {
+  if (MI.getOperand(3).getImm() != 0) {
     // Adjust the saved exec to account for the modifications during the flow
     // block that contains the ELSE. This can happen when WQM mode is switched
     // off.
@@ -427,7 +427,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {

     MachineBasicBlock *EmptyMBBAtEnd = nullptr;
     MachineBasicBlock::iterator I, Next;
-    bool ExecModified = false;

     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
@@ -438,9 +437,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
       if (TII->isFLAT(MI))
         NeedFlat = true;

-      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
-        ExecModified = true;
-
       switch (MI.getOpcode()) {
         default: break;
         case AMDGPU::SI_IF:
@@ -449,7 +445,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
           break;

         case AMDGPU::SI_ELSE:
-          Else(MI, ExecModified);
+          Else(MI);
           break;

         case AMDGPU::SI_BREAK:
SIWholeQuadMode.cpp:

@@ -434,6 +434,9 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,

       State = Needs;
     }
+
+    if (MI.getOpcode() == AMDGPU::SI_ELSE && State == StateExact)
+      MI.getOperand(3).setImm(1);
   }

   if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
New test file:

@@ -0,0 +1,58 @@
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: {{^}}else_no_execfix:
+; CHECK: ; %Flow
+; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
+; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
+; CHECK-NEXT: ; mask branch
+define amdgpu_ps float @else_no_execfix(i32 %z, float %v) {
+main_body:
+  %cc = icmp sgt i32 %z, 5
+  br i1 %cc, label %if, label %else
+
+if:
+  %v.if = fmul float %v, 2.0
+  br label %end
+
+else:
+  %v.else = fmul float %v, 3.0
+  br label %end
+
+end:
+  %r = phi float [ %v.if, %if ], [ %v.else, %else ]
+  ret float %r
+}
+
+; CHECK-LABEL: {{^}}else_execfix_leave_wqm:
+; CHECK: ; %Flow
+; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
+; CHECK-NEXT: s_and_b64 exec, exec,
+; CHECK-NEXT: s_and_b64 [[DST]], exec, [[DST]]
+; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
+; CHECK-NEXT: ; mask branch
+define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) {
+main_body:
+  %cc = icmp sgt i32 %z, 5
+  br i1 %cc, label %if, label %else
+
+if:
+  %v.if = fmul float %v, 2.0
+  br label %end
+
+else:
+  %c = fmul float %v, 3.0
+  %c.i = bitcast float %c to i32
+  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %v.else = extractelement <4 x float> %tex, i32 0
+  br label %end
+
+end:
+  %r = phi float [ %v.if, %if ], [ %v.else, %else ]
+  call void @llvm.amdgcn.buffer.store.f32(float %r, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone