AMDGPU: Analyze divergence of inline asm
commit cb7b661d3d (parent 0d6fccb460)
@@ -578,8 +578,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
   }
 }
 
-
-
 static bool isArgPassedInSGPR(const Argument *A) {
   const Function *F = A->getParent();
 
@@ -606,6 +604,54 @@ static bool isArgPassedInSGPR(const Argument *A) {
   }
 }
 
+/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
+/// this is analyzing the collective result of all output registers. Otherwise,
+/// this is only querying a specific result index if this returns multiple
+/// registers in a struct.
+bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
+  const CallInst *CI, ArrayRef<unsigned> Indices) const {
+  // TODO: Handle complex extract indices
+  if (Indices.size() > 1)
+    return true;
+
+  const DataLayout &DL = CI->getModule()->getDataLayout();
+  const SIRegisterInfo *TRI = ST->getRegisterInfo();
+  ImmutableCallSite CS(CI);
+  TargetLowering::AsmOperandInfoVector TargetConstraints
+    = TLI->ParseConstraints(DL, ST->getRegisterInfo(), CS);
+
+  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
+
+  int OutputIdx = 0;
+  for (auto &TC : TargetConstraints) {
+    if (TC.Type != InlineAsm::isOutput)
+      continue;
+
+    // Skip outputs we don't care about.
+    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
+      continue;
+
+    TLI->ComputeConstraintToUse(TC, SDValue());
+
+    Register AssignedReg;
+    const TargetRegisterClass *RC;
+    std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
+      TRI, TC.ConstraintCode, TC.ConstraintVT);
+    if (AssignedReg) {
+      // FIXME: This is a workaround for getRegForInlineAsmConstraint
+      // returning VS_32
+      RC = TRI->getPhysRegClass(AssignedReg);
+    }
+
+    // For AGPR constraints null is returned on subtargets without AGPRs, so
+    // assume divergent for null.
+    if (!RC || !TRI->isSGPRClass(RC))
+      return true;
+  }
+
+  return false;
+}
+
 /// \returns true if the new GPU divergence analysis is enabled.
 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
   return !UseLegacyDA;
@@ -638,7 +684,14 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
 
   // Assume all function calls are a source of divergence.
-  if (isa<CallInst>(V) || isa<InvokeInst>(V))
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (isa<InlineAsm>(CI->getCalledValue()))
+      return isInlineAsmSourceOfDivergence(CI);
+    return true;
+  }
+
+  // Assume all function calls are a source of divergence.
+  if (isa<InvokeInst>(V))
     return true;
 
   return false;
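For illustration only (not part of the diff; the same cases are covered by the test added later in this commit): with the call handling above, a call to inline asm is no longer unconditionally divergent, and the verdict follows the register class of the output constraint:

  %uni = call i32 asm "s_mov_b32 $0, 0", "=s"()   ; "=s" resolves to an SGPR class -> uniform
  %div = call i32 asm "v_mov_b32 $0, 0", "=v"()   ; "=v" resolves to a VGPR class -> divergent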
@@ -656,6 +709,19 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
       return true;
     }
   }
+
+  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
+  if (!ExtValue)
+    return false;
+
+  if (const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0))) {
+    // If we have inline asm returning mixed SGPR and VGPR results, we inferred
+    // divergent for the overall struct return. We need to override it in the
+    // case we're extracting an SGPR component here.
+    if (isa<InlineAsm>(CI->getCalledValue()))
+      return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
+  }
+
   return false;
 }
 
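Illustration only (not part of the diff; exercised by inline_asm_sgpr_vgpr_virtreg_output in the new test below): for multi-result asm with mixed constraints, the aggregate return stays conservatively divergent, while the isAlwaysUniform change above reports an extractvalue of an SGPR-constrained element as uniform:

  %asm  = call { i32, i32 } asm "; def $0, $1", "=s,=v"()  ; aggregate: divergent
  %sgpr = extractvalue { i32, i32 } %asm, 0                ; "=s" element: uniform
  %vgpr = extractvalue { i32, i32 } %asm, 1                ; "=v" element: divergent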
@@ -70,7 +70,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   friend BaseT;
 
   const GCNSubtarget *ST;
-  const AMDGPUTargetLowering *TLI;
+  const SITargetLowering *TLI;
   AMDGPUTTIImpl CommonTTI;
   bool IsGraphicsShader;
   bool HasFP32Denormals;
@@ -183,6 +183,9 @@ public:
 
   unsigned getCFInstrCost(unsigned Opcode);
 
+  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
+                                     ArrayRef<unsigned> Indices = {}) const;
+
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
   bool isSourceOfDivergence(const Value *V) const;
   bool isAlwaysUniform(const Value *V) const;
@@ -10586,6 +10586,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
         return std::make_pair(RC->getRegister(Idx), RC);
     }
   }
+
+  // FIXME: Returns VS_32 for physical SGPR constraints
   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
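Illustration only (not part of the diff): this FIXME is why the new analysis re-derives the class from the assigned physical register via getPhysRegClass. Without that workaround, a physical SGPR constraint would resolve to VS_32, which is not an SGPR class, and the result would be conservatively reported as divergent; with it, the following stays uniform (covered by inline_asm_1_sgpr_physreg_output in the new test below):

  %sgpr = call i32 asm "s_mov_b32 s0, 0", "={s0}"()   ; physical SGPR output: uniform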
@@ -0,0 +1,108 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx908 -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+; Make sure nothing crashes on targets with or without AGPRs
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_virtreg_output':
+; CHECK-NOT: DIVERGENT
+define i32 @inline_asm_1_sgpr_virtreg_output() {
+  %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
+  ret i32 %sgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_physreg_output':
+; CHECK-NOT: DIVERGENT
+define i32 @inline_asm_1_sgpr_physreg_output() {
+  %sgpr = call i32 asm "s_mov_b32 s0, 0", "={s0}"()
+  ret i32 %sgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_virtreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
+define i32 @inline_asm_1_vgpr_virtreg_output() {
+  %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_physreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
+define i32 @inline_asm_1_vgpr_physreg_output() {
+  %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_virtreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "; def $0", "=a"()
+define i32 @inline_asm_1_agpr_virtreg_output() {
+  %vgpr = call i32 asm "; def $0", "=a"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_physreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "; def a0", "={a0}"()
+define i32 @inline_asm_1_agpr_physreg_output() {
+  %vgpr = call i32 asm "; def a0", "={a0}"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_2_sgpr_virtreg_output':
+; CHECK-NOT: DIVERGENT
+define void @inline_asm_2_sgpr_virtreg_output() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=s,=s"()
+  %sgpr0 = extractvalue { i32, i32 } %asm, 0
+  %sgpr1 = extractvalue { i32, i32 } %asm, 1
+  store i32 %sgpr0, i32 addrspace(1)* undef
+  store i32 %sgpr1, i32 addrspace(1)* undef
+  ret void
+}
+
+; One output is SGPR, one is VGPR. Infer divergent for the aggregate, but uniform on the SGPR extract
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_sgpr_vgpr_virtreg_output':
+; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
+; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
+; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 1
+define void @inline_asm_sgpr_vgpr_virtreg_output() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
+  %sgpr = extractvalue { i32, i32 } %asm, 0
+  %vgpr = extractvalue { i32, i32 } %asm, 1
+  store i32 %sgpr, i32 addrspace(1)* undef
+  store i32 %vgpr, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output':
+; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
+; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 0
+; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
+define void @inline_asm_vgpr_sgpr_virtreg_output() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
+  %vgpr = extractvalue { i32, i32 } %asm, 0
+  %sgpr = extractvalue { i32, i32 } %asm, 1
+  store i32 %vgpr, i32 addrspace(1)* undef
+  store i32 %sgpr, i32 addrspace(1)* undef
+  ret void
+}
+
+; Have an extra output constraint
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'multi_sgpr_inline_asm_output_input_constraint':
+; CHECK-NOT: DIVERGENT
+define void @multi_sgpr_inline_asm_output_input_constraint() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=s,=s,s"(i32 1234)
+  %sgpr0 = extractvalue { i32, i32 } %asm, 0
+  %sgpr1 = extractvalue { i32, i32 } %asm, 1
+  store i32 %sgpr0, i32 addrspace(1)* undef
+  store i32 %sgpr1, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output_input_constraint':
+; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s,v"(i32 1234)
+; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 0
+; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
+define void @inline_asm_vgpr_sgpr_virtreg_output_input_constraint() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s,v"(i32 1234)
+  %vgpr = extractvalue { i32, i32 } %asm, 0
+  %sgpr = extractvalue { i32, i32 } %asm, 1
+  store i32 %vgpr, i32 addrspace(1)* undef
+  store i32 %sgpr, i32 addrspace(1)* undef
+  ret void
+}
@@ -21,11 +21,30 @@ entry:
 }
 
-; CHECK: {{^}}branch_on_asm:
-; Make sure inline assembly is treted as divergent.
-; CHECK: s_mov_b32 s{{[0-9]+}}, 0
+
+; CHECK-LABEL: {{^}}branch_on_asm_vgpr:
+; Make sure VGPR inline assembly is treated as divergent.
+; CHECK: v_mov_b32 v{{[0-9]+}}, 0
 ; CHECK: v_cmp_eq_u32
 ; CHECK: s_and_saveexec_b64
-define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @branch_on_asm_vgpr(i32 addrspace(1)* %out) {
+  %zero = call i32 asm "v_mov_b32 $0, 0", "=v"()
+  %cmp = icmp eq i32 %zero, 0
+  br i1 %cmp, label %if, label %endif
+
+if:
+  store i32 0, i32 addrspace(1)* %out
+  br label %endif
+
+endif:
+  ret void
+}
+
+; CHECK-LABEL: {{^}}branch_on_asm_sgpr:
+; Make sure SGPR inline assembly is treated as uniform
+; CHECK: s_mov_b32 s{{[0-9]+}}, 0
+; CHECK: s_cmp_lg_u32
+; CHECK: s_cbranch_scc0
+define amdgpu_kernel void @branch_on_asm_sgpr(i32 addrspace(1)* %out) {
   %zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
   %cmp = icmp eq i32 %zero, 0
   br i1 %cmp, label %if, label %endif