[AMDGPU] Consider loads from flat addrspace to be potentially divergent

In general, we cannot assume that loads from the flat address space are uniform. Cases where a flat load can be proven uniform should instead be handled through infer-address-spaces.

Differential Revision: https://reviews.llvm.org/D50991
llvm-svn: 340343
2018-08-21 21:24:31 +00:00 · 2018-08-21 21:24:31 +00:00 · 72855e36c5
parent faef7d034a
commit 72855e36c5
2 changed files with 21 additions and 4 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@ -545,14 +545,16 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

-  // Loads from the private address space are divergent, because threads
-  // can execute the load instruction with the same inputs and get different
-  // results.
+  // Loads from the private and flat address spaces are treated as divergent,
+  // because threads can execute the load instruction with the same inputs and
+  // get different results (a flat pointer may refer to private memory).
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
-    return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
+    return Load->getPointerAddressSpace() ==
+               ST->getAMDGPUAS().PRIVATE_ADDRESS ||
+           Load->getPointerAddressSpace() == ST->getAMDGPUAS().FLAT_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/loads.ll
@ -0,0 +1,15 @@
+; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
+
+; Test that we consider loads from flat and private addrspaces to be divergent.
+
+; CHECK: DIVERGENT: %val = load i32, i32* %flat, align 4
+; Kernel whose only work is a load through %flat, a pointer with no addrspace
+; qualifier (the flat address space in this test). The loaded value is unused;
+; the test only expects the analysis to report the load as DIVERGENT.
+define amdgpu_kernel void @flat_load(i32* %flat) {
+  %val = load i32, i32* %flat, align 4
+  ret void
+}
+
+; CHECK: DIVERGENT: %val = load i32, i32 addrspace(5)* %priv, align 4
+; Kernel whose only work is a load through %priv, an addrspace(5) pointer (the
+; private address space in this test). The loaded value is unused; the test
+; only expects the analysis to report the load as DIVERGENT.
+define amdgpu_kernel void @private_load(i32 addrspace(5)* %priv) {
+  %val = load i32, i32 addrspace(5)* %priv, align 4
+  ret void
+}