forked from OSchip/llvm-project
AMDGPU: Fix copying i1 value out of loop with non-uniform exit
Summary: When an i1-value is defined inside of a loop and used outside of it, we cannot simply use the SGPR bitmask from the loop's last iteration. There are also useful and correct cases of an i1-value being copied between basic blocks, e.g. when a condition is computed outside of a loop and used inside it. The concept of dominators is not sufficient to capture what is going on, so I propose the notion of "lane-dominators". Fixes a bug encountered in Nier: Automata. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103743 Change-Id: If37b969ddc71d823ab3004aeafb9ea050e45bd9a Reviewers: arsenm, rampitec Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, llvm-commits, t-tye Differential Revision: https://reviews.llvm.org/D40547 llvm-svn: 329164
This commit is contained in:
parent
21d9b33d62
commit
3ffd383a15
|
@ -17,6 +17,7 @@
|
|||
#include "AMDGPU.h"
|
||||
#include "AMDGPUSubtarget.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "Utils/AMDGPULaneDominator.h"
|
||||
#include "llvm/CodeGen/LiveIntervals.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
|
@ -141,7 +142,8 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
|
|||
DefInst->getOperand(3).getReg()) &&
|
||||
TRI->getCommonSubClass(
|
||||
MRI.getRegClass(DefInst->getOperand(3).getReg()),
|
||||
&AMDGPU::SGPR_64RegClass)) {
|
||||
&AMDGPU::SGPR_64RegClass) &&
|
||||
AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
|
||||
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
|
||||
.add(Dst)
|
||||
.addReg(AMDGPU::EXEC)
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// MBB A lane-dominates MBB B if
|
||||
// 1. A dominates B in the usual sense, i.e. every path from the entry to B
|
||||
// goes through A, and
|
||||
// 2. whenever B executes, every active lane during that execution of B was
|
||||
// also active during the most recent execution of A.
|
||||
//
|
||||
// The simplest example where A dominates B but does not lane-dominate it is
|
||||
// where A is a loop:
|
||||
//
|
||||
// |
|
||||
// +--+
|
||||
// A |
|
||||
// +--+
|
||||
// |
|
||||
// B
|
||||
//
|
||||
// Unfortunately, the second condition is not fully captured by the control
|
||||
// flow graph when it is unstructured (as may happen when branch conditions are
|
||||
// uniform).
|
||||
//
|
||||
// The following replacement of the second condition is a conservative
|
||||
// approximation. It is an equivalent condition when the CFG is fully
|
||||
// structured:
|
||||
//
|
||||
// 2'. every cycle in the CFG that contains A also contains B.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPULaneDominator.h"
|
||||
|
||||
#include "llvm/ADT/DenseSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/CodeGen/MachineBasicBlock.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
namespace AMDGPU {
|
||||
|
||||
// Given machine basic blocks A and B where A dominates B, check whether
|
||||
// A lane-dominates B.
|
||||
//
|
||||
// The check is conservative, i.e. there can be false-negatives.
|
||||
bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
|
||||
// Check whether A is reachable from itself without going through B.
|
||||
DenseSet<MachineBasicBlock *> Reachable;
|
||||
SmallVector<MachineBasicBlock *, 8> Stack;
|
||||
|
||||
Stack.push_back(A);
|
||||
do {
|
||||
MachineBasicBlock *MBB = Stack.back();
|
||||
Stack.pop_back();
|
||||
|
||||
for (MachineBasicBlock *Succ : MBB->successors()) {
|
||||
if (Succ == A)
|
||||
return false;
|
||||
if (Succ != B && Reachable.insert(Succ).second)
|
||||
Stack.push_back(Succ);
|
||||
}
|
||||
} while (!Stack.empty());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace AMDGPU
|
||||
|
||||
} // namespace llvm
|
|
@ -0,0 +1,24 @@
|
|||
//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
|
||||
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class MachineBasicBlock;
|
||||
|
||||
namespace AMDGPU {
|
||||
|
||||
/// Given machine basic blocks \p MBBA and \p MBBB where \p MBBA dominates
/// \p MBBB, check whether \p MBBA lane-dominates \p MBBB.
///
/// The check is conservative: it may return false even though \p MBBA does
/// lane-dominate \p MBBB.
bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
|
||||
|
||||
} // end namespace AMDGPU
|
||||
} // end namespace llvm
|
||||
|
||||
#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
|
|
@ -2,4 +2,5 @@ add_llvm_library(LLVMAMDGPUUtils
|
|||
AMDGPUBaseInfo.cpp
|
||||
AMDKernelCodeTUtils.cpp
|
||||
AMDGPUAsmUtils.cpp
|
||||
AMDGPULaneDominator.cpp
|
||||
)
|
||||
|
|
|
@ -0,0 +1,48 @@
|
|||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
||||
|
||||
; SI-LABEL: {{^}}i1_copy_from_loop:
|
||||
;
|
||||
; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
|
||||
; only contain the lanes that were active during the last loop iteration.
|
||||
;
|
||||
; SI: ; %for.body
|
||||
; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
|
||||
; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
|
||||
; SI-NOT: [[VREG]]
|
||||
; SI: ; %for.end
|
||||
; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
|
||||
define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%i = phi i32 [0, %entry], [%i.inc, %end.loop]
|
||||
%cc = icmp ult i32 %i, 4
|
||||
br i1 %cc, label %mid.loop, label %for.end
|
||||
|
||||
mid.loop:
|
||||
%v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %tid, i32 %i, i1 false, i1 false)
|
||||
%cc2 = fcmp oge float %v, 0.0
|
||||
br i1 %cc2, label %end.loop, label %for.end
|
||||
|
||||
end.loop:
|
||||
%i.inc = add i32 %i, 1
|
||||
br label %for.body
|
||||
|
||||
for.end:
|
||||
br i1 %cc, label %if, label %end
|
||||
|
||||
if:
|
||||
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float undef, float undef, float undef, float undef, i1 true, i1 true)
|
||||
br label %end
|
||||
|
||||
end:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
|
||||
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
|
||||
|
||||
attributes #0 = { nounwind readonly }
|
||||
attributes #1 = { nounwind inaccessiblememonly }
|
Loading…
Reference in New Issue