AMDGPU: Analyze divergence of inline asm

Matt Arsenault 2020-02-03 12:33:43 -05:00
parent 0d6fccb460
commit cb7b661d3d
5 changed files with 206 additions and 8 deletions

@@ -578,8 +578,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();
@@ -606,6 +604,54 @@ static bool isArgPassedInSGPR(const Argument *A) {
  }
}

/// Analyze whether the results of inline asm are divergent. If \p Indices is
/// empty, this analyzes the collective result of all output registers.
/// Otherwise, it only queries the specific result index, for asm that returns
/// multiple registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  ImmutableCallSite CS(CI);
  TargetLowering::AsmOperandInfoVector TargetConstraints
    = TLI->ParseConstraints(DL, ST->getRegisterInfo(), CS);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    Register AssignedReg;
    const TargetRegisterClass *RC;
    std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT);
    if (AssignedReg) {
      // FIXME: This is a workaround for getRegForInlineAsmConstraint
      // returning VS_32
      RC = TRI->getPhysRegClass(AssignedReg);
    }

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}
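
The helper above supports two query shapes. A minimal sketch of how they might be exercised follows; the wrapper names are hypothetical and assume the GCNTTIImpl declarations from this file remain in scope.

// Editor sketch, not part of the patch: hypothetical wrappers over the two
// query shapes described in the comment on isInlineAsmSourceOfDivergence.
static bool asmStructFieldIsDivergent(const GCNTTIImpl &TTI,
                                      const CallInst *CI, unsigned Field) {
  // A single index asks about one member of a struct-returning asm.
  return TTI.isInlineAsmSourceOfDivergence(CI, {Field});
}

static bool asmHasDivergentOutput(const GCNTTIImpl &TTI, const CallInst *CI) {
  // An empty index list asks about the collective result of all outputs.
  return TTI.isInlineAsmSourceOfDivergence(CI);
}
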
/// \returns true if the new GPU divergence analysis is enabled.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
@@ -638,7 +684,14 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (isa<InlineAsm>(CI->getCalledValue()))
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
@@ -656,6 +709,19 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
      return true;
    }
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  if (const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0))) {
    // If we have inline asm returning mixed SGPR and VGPR results, we inferred
    // divergent for the overall struct return. We need to override it in the
    // case where we're extracting an SGPR component here.
    if (isa<InlineAsm>(CI->getCalledValue()))
      return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
  }

  return false;
}
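
Taken together, the two hooks give generic passes a per-field answer through TargetTransformInfo. Below is a minimal sketch (assumed, not part of the patch) of what that looks like for the mixed "=s,=v" case the new test later in this change exercises.

// Editor sketch, not part of the patch.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void classifyMixedAsm(const TargetTransformInfo &TTI,
                             const CallInst *Asm,
                             const ExtractValueInst *SgprField,
                             const ExtractValueInst *VgprField) {
  bool AsmDivergent = TTI.isSourceOfDivergence(Asm);  // true: a "=v" output is present
  bool SgprUniform = TTI.isAlwaysUniform(SgprField);  // true: the "=s" field is overridden
  bool VgprUniform = TTI.isAlwaysUniform(VgprField);  // false: stays divergent
  (void)AsmDivergent;
  (void)SgprUniform;
  (void)VgprUniform;
}
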

@@ -70,7 +70,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  friend BaseT;

  const GCNSubtarget *ST;
  const AMDGPUTargetLowering *TLI;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphicsShader;
  bool HasFP32Denormals;
@@ -183,6 +183,9 @@ public:
  unsigned getCFInstrCost(unsigned Opcode);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

@@ -10586,6 +10586,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
      return std::make_pair(RC->getRegister(Idx), RC);
    }
  }

  // FIXME: Returns VS_32 for physical SGPR constraints
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
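
The FIXME above is why the divergence analysis re-derives the register class from the assigned physical register before checking for SGPRs. A minimal sketch of that resolution path, with a hypothetical helper name and assuming the SITargetLowering/SIRegisterInfo declarations are in scope:

// Editor sketch, not part of the patch.
static bool physRegConstraintIsUniform(const SITargetLowering &TLI,
                                       const SIRegisterInfo &TRI,
                                       StringRef Constraint, MVT VT) {
  Register Reg;
  const TargetRegisterClass *RC;
  std::tie(Reg, RC) = TLI.getRegForInlineAsmConstraint(&TRI, Constraint, VT);
  if (Reg) {
    // A constraint such as "={s0}" may come back with the wider VS_32 class,
    // so classify the physical register itself instead.
    RC = TRI.getPhysRegClass(Reg);
  }
  return RC && TRI.isSGPRClass(RC);
}
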

@@ -0,0 +1,108 @@
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx908 -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
; Make sure nothing crashes on targets with or without AGPRs
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_virtreg_output':
; CHECK-NOT: DIVERGENT
define i32 @inline_asm_1_sgpr_virtreg_output() {
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
ret i32 %sgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_physreg_output':
; CHECK-NOT: DIVERGENT
define i32 @inline_asm_1_sgpr_physreg_output() {
%sgpr = call i32 asm "s_mov_b32 s0, 0", "={s0}"()
ret i32 %sgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_virtreg_output':
; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
define i32 @inline_asm_1_vgpr_virtreg_output() {
%vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
ret i32 %vgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_physreg_output':
; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
define i32 @inline_asm_1_vgpr_physreg_output() {
%vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
ret i32 %vgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_virtreg_output':
; CHECK: DIVERGENT: %vgpr = call i32 asm "; def $0", "=a"()
define i32 @inline_asm_1_agpr_virtreg_output() {
%vgpr = call i32 asm "; def $0", "=a"()
ret i32 %vgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_physreg_output':
; CHECK: DIVERGENT: %vgpr = call i32 asm "; def a0", "={a0}"()
define i32 @inline_asm_1_agpr_physreg_output() {
%vgpr = call i32 asm "; def a0", "={a0}"()
ret i32 %vgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_2_sgpr_virtreg_output':
; CHECK-NOT: DIVERGENT
define void @inline_asm_2_sgpr_virtreg_output() {
%asm = call { i32, i32 } asm "; def $0, $1", "=s,=s"()
%sgpr0 = extractvalue { i32, i32 } %asm, 0
%sgpr1 = extractvalue { i32, i32 } %asm, 1
store i32 %sgpr0, i32 addrspace(1)* undef
store i32 %sgpr1, i32 addrspace(1)* undef
ret void
}
; One output is SGPR, one is VGPR. Infer divergent for the aggregate, but uniform on the SGPR extract
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_sgpr_vgpr_virtreg_output':
; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 1
define void @inline_asm_sgpr_vgpr_virtreg_output() {
%asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
%sgpr = extractvalue { i32, i32 } %asm, 0
%vgpr = extractvalue { i32, i32 } %asm, 1
store i32 %sgpr, i32 addrspace(1)* undef
store i32 %vgpr, i32 addrspace(1)* undef
ret void
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output':
; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 0
; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
define void @inline_asm_vgpr_sgpr_virtreg_output() {
%asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
%vgpr = extractvalue { i32, i32 } %asm, 0
%sgpr = extractvalue { i32, i32 } %asm, 1
store i32 %vgpr, i32 addrspace(1)* undef
store i32 %sgpr, i32 addrspace(1)* undef
ret void
}
; Have an extra input constraint in addition to the SGPR outputs
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'multi_sgpr_inline_asm_output_input_constraint':
; CHECK-NOT: DIVERGENT
define void @multi_sgpr_inline_asm_output_input_constraint() {
%asm = call { i32, i32 } asm "; def $0, $1", "=s,=s,s"(i32 1234)
%sgpr0 = extractvalue { i32, i32 } %asm, 0
%sgpr1 = extractvalue { i32, i32 } %asm, 1
store i32 %sgpr0, i32 addrspace(1)* undef
store i32 %sgpr1, i32 addrspace(1)* undef
ret void
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output_input_constraint':
; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s,v"(i32 1234)
; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 0
; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
define void @inline_asm_vgpr_sgpr_virtreg_output_input_constraint() {
%asm = call { i32, i32 } asm "; def $0, $1", "=v,=s,v"(i32 1234)
%vgpr = extractvalue { i32, i32 } %asm, 0
%sgpr = extractvalue { i32, i32 } %asm, 1
store i32 %vgpr, i32 addrspace(1)* undef
store i32 %sgpr, i32 addrspace(1)* undef
ret void
}

@@ -21,11 +21,30 @@ entry:
}
; CHECK: {{^}}branch_on_asm:
; Make sure inline assembly is treated as divergent.
; CHECK: s_mov_b32 s{{[0-9]+}}, 0
; CHECK-LABEL: {{^}}branch_on_asm_vgpr:
; Make sure VGPR inline assembly is treated as divergent.
; CHECK: v_mov_b32 v{{[0-9]+}}, 0
; CHECK: v_cmp_eq_u32
; CHECK: s_and_saveexec_b64
define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) {
define amdgpu_kernel void @branch_on_asm_vgpr(i32 addrspace(1)* %out) {
%zero = call i32 asm "v_mov_b32 $0, 0", "=v"()
%cmp = icmp eq i32 %zero, 0
br i1 %cmp, label %if, label %endif
if:
store i32 0, i32 addrspace(1)* %out
br label %endif
endif:
ret void
}
; CHECK-LABEL: {{^}}branch_on_asm_sgpr:
; Make sure SGPR inline assembly is treated as uniform
; CHECK: s_mov_b32 s{{[0-9]+}}, 0
; CHECK: s_cmp_lg_u32
; CHECK: s_cbranch_scc0
define amdgpu_kernel void @branch_on_asm_sgpr(i32 addrspace(1)* %out) {
%zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
%cmp = icmp eq i32 %zero, 0
br i1 %cmp, label %if, label %endif