[AMDGPU] restore r342722 which was reverted with r342743

[AMDGPU] lower-switch in preISel as a workaround for legacy DA

Summary:
The default target of the switch instruction may sometimes be an
"unreachable" block, when it is guaranteed that one of the cases is
always taken. The post-dominator tree concludes that such a switch
instruction does not have an immediate post dominator. This confuses
divergence analysis, which is unable to propagate sync dependence to
the targets of the switch instruction.

As a workaround, the AMDGPU target now invokes lower-switch as a
preISel pass. LowerSwitch is designed to handle the unreachable
default target correctly, allowing the divergence analysis to locate
the correct immediate post dominator of the now-lowered switch.

llvm-svn: 342956
This commit is contained in:
Sameer Sahasrabuddhe 2018-09-25 09:39:21 +00:00
parent 6d92c198ac
commit b4f2d1cb68
2 changed files with 66 additions and 0 deletions

View File

@ -45,6 +45,7 @@
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>
@ -678,6 +679,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
}
bool AMDGPUPassConfig::addPreISel() {
// Lower switch instructions before ISel. This is a workaround for the legacy
// divergence analysis: a switch whose default target is "unreachable" has no
// immediate post dominator, which prevents sync-dependence propagation to the
// switch's targets. LowerSwitch handles the unreachable default correctly, so
// the lowered control flow has a well-defined immediate post dominator.
// NOTE: this must run before FlattenCFG so the flattened CFG is switch-free.
addPass(createLowerSwitchPass());
addPass(createFlattenCFGPass());
return false;
}

View File

@ -0,0 +1,64 @@
; Regression test: a switch whose default target is "unreachable" used to
; confuse the (legacy) divergence analysis, because such a switch has no
; immediate post dominator. The AMDGPU backend now runs lower-switch in
; addPreISel as a workaround; this test checks the output of
; si-annotate-control-flow to confirm the divergent loop exit below is
; annotated with the if.break/loop intrinsics.
; RUN: llc -march=amdgcn -mcpu=gfx900 -print-after=si-annotate-control-flow %s -o /dev/null 2>&1 | FileCheck %s
; CHECK-LABEL: @switch_unreachable_default
define amdgpu_kernel void @switch_unreachable_default(i32 addrspace(1)* %out, i8 addrspace(1)* %in0, i8 addrspace(1)* %in1) #0 {
centry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Divergent switch (keyed on the workitem id) whose default block is
  ; unreachable: one of the two cases is always taken.
  switch i32 %tid, label %sw.default [
    i32 0, label %sw.bb0
    i32 1, label %sw.bb1
  ]
sw.bb0:
  br label %sw.epilog
sw.bb1:
  br label %sw.epilog
sw.default:
  unreachable
sw.epilog:
  ; %ptr is divergent (selected by the divergent switch above), which makes
  ; the loop exit condition in sw.while divergent as well.
  %ptr = phi i8 addrspace(1)* [%in0, %sw.bb0], [%in1, %sw.bb1]
  %gep_in = getelementptr inbounds i8, i8 addrspace(1)* %ptr, i64 0
  br label %sw.while
; The loop below is necessary to preserve the effect of the
; unreachable default on divergence analysis in the presence of other
; optimizations. The loop consists of a single block where the loop
; exit is divergent because it depends on the divergent phi at the
; start of the block. The checks below ensure that the loop exit is
; handled correctly as divergent. But the data-flow within the block
; is sensitive to optimizations; so we just ensure that the relevant
; operations in the block body are indeed in the same block.
; CHECK: [[PHI:%[a-zA-Z0-9._]+]] = phi i64
; CHECK-NOT: {{ br }}
; CHECK: load i8
; CHECK-NOT: {{ br }}
; CHECK: [[ICMP:%[a-zA-Z0-9._]+]] = icmp eq
; CHECK: [[IF:%[a-zA-Z0-9._]+]] = call i64 @llvm.amdgcn.if.break(i1 [[ICMP]], i64 [[PHI]])
; CHECK: [[LOOP:%[a-zA-Z0-9._]+]] = call i1 @llvm.amdgcn.loop(i64 [[IF]])
; CHECK: br i1 [[LOOP]]
sw.while:
  %p = phi i8 addrspace(1)* [ %gep_in, %sw.epilog ], [ %incdec.ptr, %sw.while ]
  %count = phi i32 [ 0, %sw.epilog ], [ %count.inc, %sw.while ]
  %char = load i8, i8 addrspace(1)* %p, align 1
  ; Divergent exit condition: %p descends from the divergent phi %ptr.
  %tobool = icmp eq i8 %char, 0
  %incdec.ptr = getelementptr inbounds i8, i8 addrspace(1)* %p, i64 1
  %count.inc = add i32 %count, 1
  br i1 %tobool, label %sw.exit, label %sw.while
sw.exit:
  %tid64 = zext i32 %tid to i64
  %gep_out = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid64
  store i32 %count, i32 addrspace(1)* %gep_out, align 4
  ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #0
attributes #0 = { nounwind readnone }
; NOTE(review): attribute group #1 is not referenced by any function in this
; file — confirm whether it was meant to be attached to the kernel.
attributes #1 = { convergent noinline optnone }