llvm-project/llvm/test/CodeGen/AMDGPU/diverge-interp-mov-lower.ll

; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,VI,PREGFX9 %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX81,PREGFX9 %s
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX9 %s

; Testing for failures in divergence calculations when divergent intrinsic is lowered during instruction selection

@0 = external dso_local addrspace(4) constant [4 x <4 x float>]

; GCN-LABEL: {{^}}_amdgpu_ps_main:
; GCN-NOT: v_readfirstlane
; PRE-GFX9: flat_load_dword
; GFX9: global_load 
define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %arg) local_unnamed_addr #0 {
.entry:
  %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg) #1
  %tmp1 = bitcast float %tmp to i32
  %tmp2 = srem i32 %tmp1, 4
  %tmp3 = select i1 false, i32 undef, i32 %tmp2
  %tmp4 = sext i32 %tmp3 to i64
  %tmp5 = getelementptr [4 x <4 x float>], [4 x <4 x float>] addrspace(4)* @0, i64 0, i64 %tmp4
  %tmp6 = load <4 x float>, <4 x float> addrspace(4)* %tmp5, align 16
  %tmp7 = extractelement <4 x float> %tmp6, i32 3
  %tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7) #1
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> undef, <2 x half> %tmp8, i1 true, i1 true) #2
  ret void
}

declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #2

attributes #0 = { nounwind "InitialPSInputAddr"="0" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
[AMDGPU] Fix issues for backend divergence tracking Summary: A change to use divergence analysis in the AMDGPU backend was getting formal arguments incorrect (not tagged as divergent) unless they were VGPR0, VGPR1 or VGPR2 For graphics shaders it is possible to have more than these passed in as VGPR Modified the checking code to check for any VGPR registers passed in as formal arguments. Also, some intrinsics that are sources of divergence may have been lowered during instruction selection and are missed on subsequent calls to isSDNodeSourceOfDivergence - added the relevant AMDGPUISD checks as well. Finally, the FunctionLoweringInfo tracks virtual registers that are live across basic block boundaries. This is used to check for divergence of CopyFromRegister registers using the DivergenceAnalysis analysis. For multiple blocks the lazily evaluated inverted map VirtReg2Value was not cleared when the ValueMap map was. Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D45372 Change-Id: I112f3bd6dfe0f62e63ce9b43b893982778e4bee3 llvm-svn: 330257 2018-04-18 21:53:31 +08:00			`; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs \| FileCheck -check-prefixes=GCN,VI,PREGFX9 %s`
			`; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx810 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,GFX81,PREGFX9 %s`
			`; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,GFX9 %s`

			`; Testing for failures in divergence calculations when divergent intrinsic is lowered during instruction selection`

			`@0 = external dso_local addrspace(4) constant [4 x <4 x float>]`

			`; GCN-LABEL: {{^}}_amdgpu_ps_main:`
			`; GCN-NOT: v_readfirstlane`
			`; PRE-GFX9: flat_load_dword`
			`; GFX9: global_load`
			`define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %arg) local_unnamed_addr #0 {`
			`.entry:`
			`%tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg) #1`
			`%tmp1 = bitcast float %tmp to i32`
			`%tmp2 = srem i32 %tmp1, 4`
			`%tmp3 = select i1 false, i32 undef, i32 %tmp2`
			`%tmp4 = sext i32 %tmp3 to i64`
			`%tmp5 = getelementptr [4 x <4 x float>], [4 x <4 x float>] addrspace(4)* @0, i64 0, i64 %tmp4`
			`%tmp6 = load <4 x float>, <4 x float> addrspace(4)* %tmp5, align 16`
			`%tmp7 = extractelement <4 x float> %tmp6, i32 3`
			`%tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7) #1`
			`call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> undef, <2 x half> %tmp8, i1 true, i1 true) #2`
			`ret void`
			`}`

			`declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1`
			`declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1`
			`declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #2`

			`attributes #0 = { nounwind "InitialPSInputAddr"="0" }`
			`attributes #1 = { nounwind readnone speculatable }`
			`attributes #2 = { nounwind }`