AMDGPU: Analyze divergence of inline asm

Matt Arsenault 2020-02-03 12:33:43 -05:00
parent 0d6fccb460
commit cb7b661d3d
5 changed files with 206 additions and 8 deletions

@@ -578,8 +578,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();
@@ -606,6 +604,54 @@ static bool isArgPassedInSGPR(const Argument *A) {
  }
}

/// Analyze whether the results of inline asm are divergent. If \p Indices is
/// empty, this analyzes the collective result of all output registers.
/// Otherwise, it only queries the specific result index, for asm that returns
/// multiple registers in a struct.
bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
    const CallInst *CI, ArrayRef<unsigned> Indices) const {
  // TODO: Handle complex extract indices
  if (Indices.size() > 1)
    return true;

  const DataLayout &DL = CI->getModule()->getDataLayout();
  const SIRegisterInfo *TRI = ST->getRegisterInfo();
  ImmutableCallSite CS(CI);
  TargetLowering::AsmOperandInfoVector TargetConstraints
    = TLI->ParseConstraints(DL, ST->getRegisterInfo(), CS);

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  int OutputIdx = 0;
  for (auto &TC : TargetConstraints) {
    if (TC.Type != InlineAsm::isOutput)
      continue;

    // Skip outputs we don't care about.
    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
      continue;

    TLI->ComputeConstraintToUse(TC, SDValue());

    Register AssignedReg;
    const TargetRegisterClass *RC;
    std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
        TRI, TC.ConstraintCode, TC.ConstraintVT);
    if (AssignedReg) {
      // FIXME: This is a workaround for getRegForInlineAsmConstraint
      // returning VS_32
      RC = TRI->getPhysRegClass(AssignedReg);
    }

    // For AGPR constraints null is returned on subtargets without AGPRs, so
    // assume divergent for null.
    if (!RC || !TRI->isSGPRClass(RC))
      return true;
  }

  return false;
}
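
The helper above supports two query shapes. A minimal sketch of how they might be exercised follows; the wrapper names are hypothetical and assume the GCNTTIImpl declarations from this file remain in scope.

// Editor sketch, not part of the patch: hypothetical wrappers over the two
// query shapes described in the comment on isInlineAsmSourceOfDivergence.
static bool asmStructFieldIsDivergent(const GCNTTIImpl &TTI,
                                      const CallInst *CI, unsigned Field) {
  // A single index asks about one member of a struct-returning asm.
  return TTI.isInlineAsmSourceOfDivergence(CI, {Field});
}

static bool asmHasDivergentOutput(const GCNTTIImpl &TTI, const CallInst *CI) {
  // An empty index list asks about the collective result of all outputs.
  return TTI.isInlineAsmSourceOfDivergence(CI);
}
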
/// \returns true if the new GPU divergence analysis is enabled.
bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
  return !UseLegacyDA;
@@ -638,7 +684,14 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (isa<InlineAsm>(CI->getCalledValue()))
      return isInlineAsmSourceOfDivergence(CI);
    return true;
  }

  // Assume all function calls are a source of divergence.
  if (isa<InvokeInst>(V))
    return true;

  return false;
@@ -656,6 +709,19 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
      return true;
    }
  }

  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
  if (!ExtValue)
    return false;

  if (const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0))) {
    // If we have inline asm returning mixed SGPR and VGPR results, we inferred
    // divergent for the overall struct return. We need to override it in the
    // case where we're extracting an SGPR component here.
    if (isa<InlineAsm>(CI->getCalledValue()))
      return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
  }

  return false;
}
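
Taken together, the two hooks give generic passes a per-field answer through TargetTransformInfo. Below is a minimal sketch (assumed, not part of the patch) of what that looks like for the mixed "=s,=v" case the new test later in this change exercises.

// Editor sketch, not part of the patch.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void classifyMixedAsm(const TargetTransformInfo &TTI,
                             const CallInst *Asm,
                             const ExtractValueInst *SgprField,
                             const ExtractValueInst *VgprField) {
  bool AsmDivergent = TTI.isSourceOfDivergence(Asm);  // true: a "=v" output is present
  bool SgprUniform = TTI.isAlwaysUniform(SgprField);  // true: the "=s" field is overridden
  bool VgprUniform = TTI.isAlwaysUniform(VgprField);  // false: stays divergent
  (void)AsmDivergent;
  (void)SgprUniform;
  (void)VgprUniform;
}
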

@@ -70,7 +70,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  friend BaseT;

  const GCNSubtarget *ST;
  const AMDGPUTargetLowering *TLI;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphicsShader;
  bool HasFP32Denormals;
@@ -183,6 +183,9 @@ public:
  unsigned getCFInstrCost(unsigned Opcode);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

@@ -10586,6 +10586,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
      return std::make_pair(RC->getRegister(Idx), RC);
    }
  }

  // FIXME: Returns VS_32 for physical SGPR constraints
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
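
The FIXME above is why the divergence analysis re-derives the register class from the assigned physical register before checking for SGPRs. A minimal sketch of that resolution path, with a hypothetical helper name and assuming the SITargetLowering/SIRegisterInfo declarations are in scope:

// Editor sketch, not part of the patch.
static bool physRegConstraintIsUniform(const SITargetLowering &TLI,
                                       const SIRegisterInfo &TRI,
                                       StringRef Constraint, MVT VT) {
  Register Reg;
  const TargetRegisterClass *RC;
  std::tie(Reg, RC) = TLI.getRegForInlineAsmConstraint(&TRI, Constraint, VT);
  if (Reg) {
    // A constraint such as "={s0}" may come back with the wider VS_32 class,
    // so classify the physical register itself instead.
    RC = TRI.getPhysRegClass(Reg);
  }
  return RC && TRI.isSGPRClass(RC);
}
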

@@ -0,0 +1,108 @@
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx908 -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
; Make sure nothing crashes on targets with or without AGPRs
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_virtreg_output':
; CHECK-NOT: DIVERGENT
define i32 @inline_asm_1_sgpr_virtreg_output() {
%sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
ret i32 %sgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_physreg_output':
; CHECK-NOT: DIVERGENT
define i32 @inline_asm_1_sgpr_physreg_output() {
%sgpr = call i32 asm "s_mov_b32 s0, 0", "={s0}"()
ret i32 %sgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_virtreg_output':
; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
define i32 @inline_asm_1_vgpr_virtreg_output() {
%vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
ret i32 %vgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_physreg_output':
; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
define i32 @inline_asm_1_vgpr_physreg_output() {
%vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
ret i32 %vgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_virtreg_output':
; CHECK: DIVERGENT: %vgpr = call i32 asm "; def $0", "=a"()
define i32 @inline_asm_1_agpr_virtreg_output() {
%vgpr = call i32 asm "; def $0", "=a"()
ret i32 %vgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_physreg_output':
; CHECK: DIVERGENT: %vgpr = call i32 asm "; def a0", "={a0}"()
define i32 @inline_asm_1_agpr_physreg_output() {
%vgpr = call i32 asm "; def a0", "={a0}"()
ret i32 %vgpr
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_2_sgpr_virtreg_output':
; CHECK-NOT: DIVERGENT
define void @inline_asm_2_sgpr_virtreg_output() {
%asm = call { i32, i32 } asm "; def $0, $1", "=s,=s"()
%sgpr0 = extractvalue { i32, i32 } %asm, 0
%sgpr1 = extractvalue { i32, i32 } %asm, 1
store i32 %sgpr0, i32 addrspace(1)* undef
store i32 %sgpr1, i32 addrspace(1)* undef
ret void
}
; One output is SGPR, one is VGPR. Infer divergent for the aggregate, but uniform on the SGPR extract
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_sgpr_vgpr_virtreg_output':
; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 1
define void @inline_asm_sgpr_vgpr_virtreg_output() {
%asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
%sgpr = extractvalue { i32, i32 } %asm, 0
%vgpr = extractvalue { i32, i32 } %asm, 1
store i32 %sgpr, i32 addrspace(1)* undef
store i32 %vgpr, i32 addrspace(1)* undef
ret void
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output':
; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 0
; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
define void @inline_asm_vgpr_sgpr_virtreg_output() {
%asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
%vgpr = extractvalue { i32, i32 } %asm, 0
%sgpr = extractvalue { i32, i32 } %asm, 1
store i32 %vgpr, i32 addrspace(1)* undef
store i32 %sgpr, i32 addrspace(1)* undef
ret void
}
; Have an extra input constraint in addition to the SGPR outputs
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'multi_sgpr_inline_asm_output_input_constraint':
; CHECK-NOT: DIVERGENT
define void @multi_sgpr_inline_asm_output_input_constraint() {
%asm = call { i32, i32 } asm "; def $0, $1", "=s,=s,s"(i32 1234)
%sgpr0 = extractvalue { i32, i32 } %asm, 0
%sgpr1 = extractvalue { i32, i32 } %asm, 1
store i32 %sgpr0, i32 addrspace(1)* undef
store i32 %sgpr1, i32 addrspace(1)* undef
ret void
}
; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output_input_constraint':
; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s,v"(i32 1234)
; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 0
; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
define void @inline_asm_vgpr_sgpr_virtreg_output_input_constraint() {
%asm = call { i32, i32 } asm "; def $0, $1", "=v,=s,v"(i32 1234)
%vgpr = extractvalue { i32, i32 } %asm, 0
%sgpr = extractvalue { i32, i32 } %asm, 1
store i32 %vgpr, i32 addrspace(1)* undef
store i32 %sgpr, i32 addrspace(1)* undef
ret void
}

@@ -21,11 +21,30 @@ entry:
}
; CHECK: {{^}}branch_on_asm:
; Make sure inline assembly is treated as divergent.
; CHECK: s_mov_b32 s{{[0-9]+}}, 0
; CHECK-LABEL: {{^}}branch_on_asm_vgpr:
; Make sure VGPR inline assembly is treated as divergent.
; CHECK: v_mov_b32 v{{[0-9]+}}, 0
; CHECK: v_cmp_eq_u32
; CHECK: s_and_saveexec_b64
define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) {
define amdgpu_kernel void @branch_on_asm_vgpr(i32 addrspace(1)* %out) {
%zero = call i32 asm "v_mov_b32 $0, 0", "=v"()
%cmp = icmp eq i32 %zero, 0
br i1 %cmp, label %if, label %endif
if:
store i32 0, i32 addrspace(1)* %out
br label %endif
endif:
ret void
}
; CHECK-LABEL: {{^}}branch_on_asm_sgpr:
; Make sure SGPR inline assembly is treated as uniform
; CHECK: s_mov_b32 s{{[0-9]+}}, 0
; CHECK: s_cmp_lg_u32
; CHECK: s_cbranch_scc0
define amdgpu_kernel void @branch_on_asm_sgpr(i32 addrspace(1)* %out) {
%zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
%cmp = icmp eq i32 %zero, 0
br i1 %cmp, label %if, label %endif