[AMDGPU] restore r342722 which was reverted with r342743

[AMDGPU] lower-switch in preISel as a workaround for legacy DA

Summary:
The default target of the switch instruction may sometimes be an
"unreachable" block, when it is guaranteed that one of the cases is
always taken. The post-dominator tree concludes that such a switch
instruction does not have an immediate post dominator. This confuses
divergence analysis, which is unable to propagate sync dependence to
the targets of the switch instruction.

As a workaround, the AMDGPU target now invokes lower-switch as a
preISel pass. LowerSwitch is designed to handle the unreachable
default target correctly, allowing the divergence analysis to locate
the correct immediate post dominator of the now-lowered switch.

llvm-svn: 342956
This commit is contained in:
Sameer Sahasrabuddhe 2018-09-25 09:39:21 +00:00
parent 6d92c198ac
commit b4f2d1cb68
2 changed files with 66 additions and 0 deletions

View File

@ -45,6 +45,7 @@
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>
@ -678,6 +679,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
}
bool AMDGPUPassConfig::addPreISel() {
// Lower switch instructions before ISel. This is a workaround for the legacy
// divergence analysis: a switch whose default target is "unreachable" has no
// immediate post dominator, which prevents sync-dependence propagation to the
// switch's targets. LowerSwitch handles the unreachable default correctly, so
// the lowered control flow has a well-defined immediate post dominator.
// NOTE: this must run before FlattenCFG so the flattened CFG is switch-free.
addPass(createLowerSwitchPass());
addPass(createFlattenCFGPass());
return false;
}

View File

@ -0,0 +1,64 @@
; Regression test: a switch whose default target is "unreachable" used to
; confuse the (legacy) divergence analysis, because such a switch has no
; immediate post dominator. The AMDGPU backend now runs lower-switch in
; addPreISel as a workaround; this test checks the output of
; si-annotate-control-flow to confirm the divergent loop exit below is
; annotated with the if.break/loop intrinsics.
; RUN: llc -march=amdgcn -mcpu=gfx900 -print-after=si-annotate-control-flow %s -o /dev/null 2>&1 | FileCheck %s
; CHECK-LABEL: @switch_unreachable_default
define amdgpu_kernel void @switch_unreachable_default(i32 addrspace(1)* %out, i8 addrspace(1)* %in0, i8 addrspace(1)* %in1) #0 {
centry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Divergent switch (keyed on the workitem id) whose default block is
  ; unreachable: one of the two cases is always taken.
  switch i32 %tid, label %sw.default [
    i32 0, label %sw.bb0
    i32 1, label %sw.bb1
  ]
sw.bb0:
  br label %sw.epilog
sw.bb1:
  br label %sw.epilog
sw.default:
  unreachable
sw.epilog:
  ; %ptr is divergent (selected by the divergent switch above), which makes
  ; the loop exit condition in sw.while divergent as well.
  %ptr = phi i8 addrspace(1)* [%in0, %sw.bb0], [%in1, %sw.bb1]
  %gep_in = getelementptr inbounds i8, i8 addrspace(1)* %ptr, i64 0
  br label %sw.while
; The loop below is necessary to preserve the effect of the
; unreachable default on divergence analysis in the presence of other
; optimizations. The loop consists of a single block where the loop
; exit is divergent because it depends on the divergent phi at the
; start of the block. The checks below ensure that the loop exit is
; handled correctly as divergent. But the data-flow within the block
; is sensitive to optimizations; so we just ensure that the relevant
; operations in the block body are indeed in the same block.
; CHECK: [[PHI:%[a-zA-Z0-9._]+]] = phi i64
; CHECK-NOT: {{ br }}
; CHECK: load i8
; CHECK-NOT: {{ br }}
; CHECK: [[ICMP:%[a-zA-Z0-9._]+]] = icmp eq
; CHECK: [[IF:%[a-zA-Z0-9._]+]] = call i64 @llvm.amdgcn.if.break(i1 [[ICMP]], i64 [[PHI]])
; CHECK: [[LOOP:%[a-zA-Z0-9._]+]] = call i1 @llvm.amdgcn.loop(i64 [[IF]])
; CHECK: br i1 [[LOOP]]
sw.while:
  %p = phi i8 addrspace(1)* [ %gep_in, %sw.epilog ], [ %incdec.ptr, %sw.while ]
  %count = phi i32 [ 0, %sw.epilog ], [ %count.inc, %sw.while ]
  %char = load i8, i8 addrspace(1)* %p, align 1
  ; Divergent exit condition: %p descends from the divergent phi %ptr.
  %tobool = icmp eq i8 %char, 0
  %incdec.ptr = getelementptr inbounds i8, i8 addrspace(1)* %p, i64 1
  %count.inc = add i32 %count, 1
  br i1 %tobool, label %sw.exit, label %sw.while
sw.exit:
  %tid64 = zext i32 %tid to i64
  %gep_out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid64
  store i32 %count, i32 addrspace(1)* %gep_out, align 4
  ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { nounwind readnone }
; NOTE(review): attribute group #1 is not referenced by any function in this
; file — confirm whether it was meant to be attached to the kernel.
attributes #1 = { convergent noinline optnone }