[AMDGPU] Prepare for introduction of v3 and v5 MVTs

AMDGPU would like to have MVTs for v3i32, v3f32, v5i32, v5f32. This
commit does not add them, but makes preparatory changes:

* Fixed assumptions that vector types have a power-of-2 element count in
  kernel arg handling, and added v5 kernel arg tests and v3/v5 shader arg
  tests.

* Added v5 tests for cost analysis.

* Added vec3/vec5 arg test cases.

Some of this patch is from Matt Arsenault, also of AMD.

Differential Revision: https://reviews.llvm.org/D58928

Change-Id: I7279d6b4841464d2080eb255ef3c589e268eabcd
llvm-svn: 356342
Tim Renouf 2019-03-17 21:04:16 +00:00
parent d1477e989c
commit e30aa6a136
13 changed files with 526 additions and 14 deletions


@@ -1008,9 +1008,10 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
MemVT = MemVT.getScalarType();
if (MemVT.isExtended()) {
// This should really only happen if we have vec3 arguments
assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
// Round up vec3/vec5 argument.
if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
assert(MemVT.getVectorNumElements() == 3 ||
MemVT.getVectorNumElements() == 5);
MemVT = MemVT.getPow2VectorType(State.getContext());
}
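The effect of the rounding above is easy to see in isolation: a non-power-of-2 element count is widened to the next power of 2, so a vec3 kernel argument is laid out as a vec4 and a vec5 as an vec8. The standalone C++ sketch below mirrors that behaviour for illustration only; roundUpToPow2 is a stand-in for the element-count rounding done by EVT::getPow2VectorType, not LLVM code.

#include <cstdio>

// Stand-in for the element-count rounding performed by getPow2VectorType:
// widen a non-power-of-2 count to the next power of 2.
static unsigned roundUpToPow2(unsigned NumElts) {
  unsigned Pow2 = 1;
  while (Pow2 < NumElts)
    Pow2 *= 2;
  return Pow2;
}

int main() {
  // Matches the assert above: only 3 and 5 are expected, giving 4 and 8.
  const unsigned Counts[] = {2, 3, 4, 5};
  for (unsigned N : Counts)
    std::printf("<%u x i32> argument is laid out as <%u x i32>\n", N,
                roundUpToPow2(N));
  return 0;
}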


@@ -20,7 +20,9 @@ define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
}
; CHECK: 'add_v3i32'
; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening,
; and 3 when it is legal.
; CHECK: estimated cost of {{[34]}} for {{.*}} add <3 x i32>
define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
%add = add <3 x i32> %vec, %b
@@ -37,6 +39,17 @@ define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
ret void
}
; CHECK: 'add_v5i32'
; Allow for 8 when v5i32 is illegal and TargetLowering thinks it needs widening,
; and 5 when it is legal.
; CHECK: estimated cost of {{[58]}} for {{.*}} add <5 x i32>
define amdgpu_kernel void @add_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
%add = add <5 x i32> %vec, %b
store <5 x i32> %add, <5 x i32> addrspace(1)* %out
ret void
}
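The relaxed cost checks here, and the matching ones in the fadd/fsub/fmul/mul/fdiv tests below, all come from the same arithmetic: the expected cost is roughly the per-lane cost multiplied by the number of lanes after legalization, and an illegal v3 type widens to v4 while an illegal v5 type widens to v8. The sketch below reproduces the alternative numbers under that simplifying assumption; it is a back-of-the-envelope model, not the actual TargetTransformInfo cost code, and the per-lane costs used (1 for add-like ops, 3 for mul, 12 for fdiv without fp32 denormals) are read off the existing checks.

#include <cstdio>

static unsigned roundUpToPow2(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P *= 2;
  return P;
}

// Simplified model: cost ~= per-lane cost * number of lanes after
// legalization, where an illegal vector is widened to a power-of-2 width.
static unsigned cost(unsigned NumElts, unsigned PerLane, bool Legal) {
  return (Legal ? NumElts : roundUpToPow2(NumElts)) * PerLane;
}

int main() {
  std::printf("add  v3: %u (legal) or %u (widened)\n", cost(3, 1, true),
              cost(3, 1, false)); // 3 or 4
  std::printf("add  v5: %u (legal) or %u (widened)\n", cost(5, 1, true),
              cost(5, 1, false)); // 5 or 8
  std::printf("mul  v5: %u (legal) or %u (widened)\n", cost(5, 3, true),
              cost(5, 3, false)); // 15 or 24
  std::printf("fdiv v5: %u (legal) or %u (widened)\n", cost(5, 12, true),
              cost(5, 12, false)); // 60 or 96 (no fp32 denormals)
  return 0;
}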
; CHECK: 'add_i64'
; CHECK: estimated cost of 2 for {{.*}} add i64
define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {


@@ -38,6 +38,15 @@ define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32
ret void
}
; GCN: 'extractelement_v5i32'
; GCN: estimated cost of 0 for {{.*}} extractelement <5 x i32>
define amdgpu_kernel void @extractelement_v5i32(i32 addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr) {
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
%elt = extractelement <5 x i32> %vec, i32 1
store i32 %elt, i32 addrspace(1)* %out
ret void
}
; GCN: 'extractelement_v8i32'
; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32>
define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {


@@ -27,6 +27,15 @@ define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float
ret void
}
; CHECK: 'fabs_v5f32'
; CHECK: estimated cost of 0 for {{.*}} call <5 x float> @llvm.fabs.v5f32
define amdgpu_kernel void @fabs_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%fabs = call <5 x float> @llvm.fabs.v5f32(<5 x float> %vec) #1
store <5 x float> %fabs, <5 x float> addrspace(1)* %out
ret void
}
; CHECK: 'fabs_f64'
; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
@@ -84,6 +93,7 @@ define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half>
declare float @llvm.fabs.f32(float) #1
declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #1
declare <3 x float> @llvm.fabs.v3f32(<3 x float>) #1
declare <5 x float> @llvm.fabs.v5f32(<5 x float>) #1
declare double @llvm.fabs.f64(double) #1
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #1


@@ -20,7 +20,9 @@ define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float
}
; ALL: 'fadd_v3f32'
; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 3 when it is legal.
; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fadd <3 x float> %vec, %b
@@ -28,6 +30,17 @@ define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float
ret void
}
; ALL: 'fadd_v5f32'
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 5 when it is legal.
; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fadd <5 x float> %vec, %b
store <5 x float> %add, <5 x float> addrspace(1)* %out
ret void
}
; ALL: 'fadd_f64'
; FASTF64: estimated cost of 2 for {{.*}} fadd double
; SLOWF64: estimated cost of 3 for {{.*}} fadd double


@@ -26,8 +26,10 @@ define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float
}
; ALL: 'fdiv_v3f32'
; NOFP32DENORM: estimated cost of 36 for {{.*}} fdiv <3 x float>
; FP32DENORMS: estimated cost of 30 for {{.*}} fdiv <3 x float>
; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 36/30 when it is legal.
; NOFP32DENORM: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float>
; FP32DENORMS: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float>
define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fdiv <3 x float> %vec, %b
@@ -35,6 +37,18 @@ define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float
ret void
}
; ALL: 'fdiv_v5f32'
; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 60/50 when it is legal.
; NOFP32DENORM: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float>
; FP32DENORMS: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float>
define amdgpu_kernel void @fdiv_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fdiv <5 x float> %vec, %b
store <5 x float> %add, <5 x float> addrspace(1)* %out
ret void
}
; ALL: 'fdiv_f64'
; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double


@@ -20,7 +20,9 @@ define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float
}
; ALL: 'fmul_v3f32'
; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 3 when it is legal.
; ALL: estimated cost of {{[34]}} for {{.*}} fmul <3 x float>
define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fmul <3 x float> %vec, %b
@@ -28,6 +30,17 @@ define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float
ret void
}
; ALL: 'fmul_v5f32'
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 5 when it is legal.
; ALL: estimated cost of {{[58]}} for {{.*}} fmul <5 x float>
define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fmul <5 x float> %vec, %b
store <5 x float> %add, <5 x float> addrspace(1)* %out
ret void
}
; ALL: 'fmul_f64'
; FASTF64: estimated cost of 2 for {{.*}} fmul double
; SLOWF64: estimated cost of 3 for {{.*}} fmul double


@@ -20,7 +20,9 @@ define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float
}
; ALL: 'fsub_v3f32'
; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 3 when it is legal.
; ALL: estimated cost of {{[34]}} for {{.*}} fsub <3 x float>
define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fsub <3 x float> %vec, %b
@@ -28,6 +30,17 @@ define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float
ret void
}
; ALL: 'fsub_v5f32'
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 5 when it is legal.
; ALL: estimated cost of {{[58]}} for {{.*}} fsub <5 x float>
define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fsub <5 x float> %vec, %b
store <5 x float> %add, <5 x float> addrspace(1)* %out
ret void
}
; ALL: 'fsub_f64'
; FASTF64: estimated cost of 2 for {{.*}} fsub double
; SLOWF64: estimated cost of 3 for {{.*}} fsub double


@@ -19,7 +19,9 @@ define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
}
; CHECK: 'mul_v3i32'
; CHECK: estimated cost of 9 for {{.*}} mul <3 x i32>
; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
; and 9 when it is legal.
; CHECK: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
%mul = mul <3 x i32> %vec, %b
@@ -27,6 +29,17 @@ define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> add
ret void
}
; CHECK: 'mul_v5i32'
; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
; and 15 when it is legal.
; CHECK: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
%mul = mul <5 x i32> %vec, %b
store <5 x i32> %mul, <5 x i32> addrspace(1)* %out
ret void
}
; CHECK: 'mul_v4i32'
; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {


@@ -26,7 +26,9 @@ declare hidden void @external_void_func_f32(float) #0
declare hidden void @external_void_func_f64(double) #0
declare hidden void @external_void_func_v2f32(<2 x float>) #0
declare hidden void @external_void_func_v2f64(<2 x double>) #0
declare hidden void @external_void_func_v3f32(<3 x float>) #0
declare hidden void @external_void_func_v3f64(<3 x double>) #0
declare hidden void @external_void_func_v5f32(<5 x float>) #0
declare hidden void @external_void_func_v2i16(<2 x i16>) #0
declare hidden void @external_void_func_v2f16(<2 x half>) #0
@@ -39,6 +41,7 @@ declare hidden void @external_void_func_v2i32(<2 x i32>) #0
declare hidden void @external_void_func_v3i32(<3 x i32>) #0
declare hidden void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
declare hidden void @external_void_func_v4i32(<4 x i32>) #0
declare hidden void @external_void_func_v5i32(<5 x i32>) #0
declare hidden void @external_void_func_v8i32(<8 x i32>) #0
declare hidden void @external_void_func_v16i32(<16 x i32>) #0
declare hidden void @external_void_func_v32i32(<32 x i32>) #0
@@ -341,6 +344,30 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v3f32_imm:
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
; GCN-NOT: v3,
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v5f32_imm:
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
; GCN-DAG: v_mov_b32_e32 v3, -1.0
; GCN-DAG: v_mov_b32_e32 v4, 0.5
; GCN-NOT: v5,
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
; GCN: v_mov_b32_e32 v0, 0{{$}}
; GCN: v_mov_b32_e32 v1, 0x40100000
@@ -519,6 +546,19 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v5i32_imm:
; GCN-DAG: v_mov_b32_e32 v0, 1
; GCN-DAG: v_mov_b32_e32 v1, 2
; GCN-DAG: v_mov_b32_e32 v2, 3
; GCN-DAG: v_mov_b32_e32 v3, 4
; GCN-DAG: v_mov_b32_e32 v4, 5
; GCN-NOT: v5,
; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
ret void
}
; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
@@ -764,9 +804,140 @@ entry:
ret void
}
; GCN-LABEL: {{^}}stack_12xv3i32:
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
; GCN: v_mov_b32_e32 v31, 11
; GCN: s_getpc
define void @stack_12xv3i32() #0 {
entry:
call void @external_void_func_12xv3i32(
<3 x i32><i32 0, i32 0, i32 0>,
<3 x i32><i32 1, i32 1, i32 1>,
<3 x i32><i32 2, i32 2, i32 2>,
<3 x i32><i32 3, i32 3, i32 3>,
<3 x i32><i32 4, i32 4, i32 4>,
<3 x i32><i32 5, i32 5, i32 5>,
<3 x i32><i32 6, i32 6, i32 6>,
<3 x i32><i32 7, i32 7, i32 7>,
<3 x i32><i32 8, i32 8, i32 8>,
<3 x i32><i32 9, i32 9, i32 9>,
<3 x i32><i32 10, i32 11, i32 12>,
<3 x i32><i32 13, i32 14, i32 15>)
ret void
}
; GCN-LABEL: {{^}}stack_12xv3f32:
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
; GCN: v_mov_b32_e32 v31, 0x41300000
; GCN: s_getpc
define void @stack_12xv3f32() #0 {
entry:
call void @external_void_func_12xv3f32(
<3 x float><float 0.0, float 0.0, float 0.0>,
<3 x float><float 1.0, float 1.0, float 1.0>,
<3 x float><float 2.0, float 2.0, float 2.0>,
<3 x float><float 3.0, float 3.0, float 3.0>,
<3 x float><float 4.0, float 4.0, float 4.0>,
<3 x float><float 5.0, float 5.0, float 5.0>,
<3 x float><float 6.0, float 6.0, float 6.0>,
<3 x float><float 7.0, float 7.0, float 7.0>,
<3 x float><float 8.0, float 8.0, float 8.0>,
<3 x float><float 9.0, float 9.0, float 9.0>,
<3 x float><float 10.0, float 11.0, float 12.0>,
<3 x float><float 13.0, float 14.0, float 15.0>)
ret void
}
; GCN-LABEL: {{^}}stack_8xv5i32:
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
; GCN: v_mov_b32_e32 v31, 7
; GCN: s_getpc
define void @stack_8xv5i32() #0 {
entry:
call void @external_void_func_8xv5i32(
<5 x i32><i32 0, i32 0, i32 0, i32 0, i32 0>,
<5 x i32><i32 1, i32 1, i32 1, i32 1, i32 1>,
<5 x i32><i32 2, i32 2, i32 2, i32 2, i32 2>,
<5 x i32><i32 3, i32 3, i32 3, i32 3, i32 3>,
<5 x i32><i32 4, i32 4, i32 4, i32 4, i32 4>,
<5 x i32><i32 5, i32 5, i32 5, i32 5, i32 5>,
<5 x i32><i32 6, i32 7, i32 8, i32 9, i32 10>,
<5 x i32><i32 11, i32 12, i32 13, i32 14, i32 15>)
ret void
}
; GCN-LABEL: {{^}}stack_8xv5f32:
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
; GCN: v_mov_b32_e32 v31, 0x40e00000
; GCN: s_getpc
define void @stack_8xv5f32() #0 {
entry:
call void @external_void_func_8xv5f32(
<5 x float><float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,
<5 x float><float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>,
<5 x float><float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>,
<5 x float><float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>,
<5 x float><float 4.0, float 4.0, float 4.0, float 4.0, float 4.0>,
<5 x float><float 5.0, float 5.0, float 5.0, float 5.0, float 5.0>,
<5 x float><float 6.0, float 7.0, float 8.0, float 9.0, float 10.0>,
<5 x float><float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>)
ret void
}
declare hidden void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0
declare hidden void @stack_passed_f64_arg(<32 x i32>, double) #0
declare hidden void @external_void_func_12xv3i32(<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>,
<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>) #0
declare hidden void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>,
<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) #0
declare hidden void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x float>, <3 x float>,
<3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0
declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>,
<5 x float>, <5 x float>, <5 x float>, <5 x float>) #0
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }
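The expected register and stack placements in the stack_12xv3i32 / stack_12xv3f32 / stack_8xv5i32 / stack_8xv5f32 tests above follow from flattening every vector argument into 32-bit pieces: the checks are consistent with the first 32 dwords occupying v0-v31 and the remainder being stored to the stack in 4-byte slots starting at offset 4. The sketch below reproduces that bookkeeping for the two integer cases; the 32-register limit and the slot numbering are assumptions made for illustration, not the real calling-convention implementation.

#include <cstdio>
#include <vector>

// Flatten the vector arguments of the stack_* tests into dwords and assign
// them to v0..v31 first, spilling the rest to 4-byte stack slots.
// Assumed model for illustration only.
static void assignDwords(const char *Name, const std::vector<int> &Dwords) {
  const unsigned NumArgVGPRs = 32; // assumption: v0..v31 carry arguments
  std::printf("%s:\n", Name);
  for (unsigned I = 0; I < Dwords.size(); ++I) {
    if (I < NumArgVGPRs)
      std::printf("  v%u <- %d\n", I, Dwords[I]);
    else
      std::printf("  stack offset %u <- %d\n", 4 * (I - NumArgVGPRs + 1),
                  Dwords[I]);
  }
}

int main() {
  // 12 x <3 x i32>: splats of 0..9, then {10,11,12} and {13,14,15}.
  std::vector<int> V3;
  for (int I = 0; I < 10; ++I)
    V3.insert(V3.end(), 3, I);
  for (int I = 10; I <= 15; ++I)
    V3.push_back(I);
  assignDwords("stack_12xv3i32", V3); // v31 <- 11; 12..15 at offsets 4..16

  // 8 x <5 x i32>: splats of 0..5, then {6..10} and {11..15}.
  std::vector<int> V5;
  for (int I = 0; I < 6; ++I)
    V5.insert(V5.end(), 5, I);
  for (int I = 6; I <= 15; ++I)
    V5.push_back(I);
  assignDwords("stack_8xv5i32", V5); // v31 <- 7; 8..15 at offsets 4..32
  return 0;
}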


@@ -1,6 +1,6 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s
declare void @external_void_func_void() #0
@@ -26,6 +26,8 @@ declare double @external_f64_func_void() #0
declare <2 x half> @external_v2f16_func_void() #0
declare <4 x half> @external_v4f16_func_void() #0
declare <3 x float> @external_v3f32_func_void() #0
declare <5 x float> @external_v5f32_func_void() #0
declare <2 x double> @external_v2f64_func_void() #0
declare <2 x i32> @external_v2i32_func_void() #0
@@ -171,6 +173,11 @@ define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 {
}
; GCN-LABEL: {{^}}test_call_external_v3i32_func_void:
; GCN: s_swappc
; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1]
; GFX7-DAG: flat_store_dword {{.*}}, v2
; GFX89-DAG: buffer_store_dwordx2 v[0:1]
; GFX89-DAG: buffer_store_dword v2
define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 {
%val = call <3 x i32> @external_v3i32_func_void()
store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8
@@ -185,6 +192,11 @@ define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 {
}
; GCN-LABEL: {{^}}test_call_external_v5i32_func_void:
; GCN: s_swappc
; GFX7-DAG: flat_store_dwordx4 {{.*}}, v[0:3]
; GFX7-DAG: flat_store_dword {{.*}}, v4
; GFX89-DAG: buffer_store_dwordx4 v[0:3]
; GFX89-DAG: buffer_store_dword v4
define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 {
%val = call <5 x i32> @external_v5i32_func_void()
store volatile <5 x i32> %val, <5 x i32> addrspace(1)* undef, align 8
@@ -240,6 +252,30 @@ define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 {
ret void
}
; GCN-LABEL: {{^}}test_call_external_v3f32_func_void:
; GCN: s_swappc
; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1]
; GFX7-DAG: flat_store_dword {{.*}}, v2
; GFX89-DAG: buffer_store_dwordx2 v[0:1]
; GFX89-DAG: buffer_store_dword v2
define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 {
%val = call <3 x float> @external_v3f32_func_void()
store volatile <3 x float> %val, <3 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}test_call_external_v5f32_func_void:
; GCN: s_swappc
; GFX7-DAG: flat_store_dwordx4 {{.*}}, v[0:3]
; GFX7-DAG: flat_store_dword {{.*}}, v4
; GFX89-DAG: buffer_store_dwordx4 v[0:3]
; GFX89-DAG: buffer_store_dword v4
define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 {
%val = call <5 x float> @external_v5f32_func_void()
store volatile <5 x float> %val, <5 x float> addrspace(1)* undef
ret void
}
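The store patterns checked for the v3 and v5 return values above come from splitting an N-dword result into power-of-2 pieces, largest first: 3 dwords become a dwordx2 store of v[0:1] plus a dword store of v2, and 5 dwords become a dwordx4 store of v[0:3] plus a dword store of v4. The sketch below reproduces that greedy split; it is only an illustration of the expected check lines, not the DAG store-legalization code itself.

#include <cstdio>

// Greedy power-of-2 split of an N-dword store, matching the dwordx2+dword
// and dwordx4+dword patterns checked above. Illustration only.
static void splitStore(unsigned NumDwords) {
  std::printf("%u-dword result:", NumDwords);
  unsigned First = 0;
  while (NumDwords) {
    unsigned Chunk = 1;
    while (Chunk * 2 <= NumDwords)
      Chunk *= 2;
    if (Chunk == 1)
      std::printf(" store_dword v%u", First);
    else
      std::printf(" store_dwordx%u v[%u:%u]", Chunk, First, First + Chunk - 1);
    First += Chunk;
    NumDwords -= Chunk;
  }
  std::printf("\n");
}

int main() {
  splitStore(3); // store_dwordx2 v[0:1], store_dword v2
  splitStore(5); // store_dwordx4 v[0:3], store_dword v4
  return 0;
}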
; GCN-LABEL: {{^}}test_call_external_i32_i64_func_void:
define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
%val = call { i32, i64 } @external_i32_i64_func_void()


@@ -200,4 +200,94 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
ret void
}
; GCN-LABEL: {{^}}ps_mesa_inreg_v3i32:
; GCN-DAG: s_add_i32 s0, s0, 1
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3
define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
%add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
store <3 x i32> %add, <3 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}ps_mesa_inreg_v3f32:
; GCN-DAG: v_add_f32{{.*}}, s0, 1.0
; GCN-DAG: v_add_f32{{.*}}, s1, 2.0
; GCN-DAG: v_add_f32{{.*}}, s2, 4.0
define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
%add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
store <3 x float> %add, <3 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}ps_mesa_inreg_v5i32:
; GCN-DAG: s_add_i32 s0, s0, 1
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s3, 4
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s4, 5
define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
%add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
store <5 x i32> %add, <5 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}ps_mesa_inreg_v5f32:
; GCN-DAG: v_add_f32{{.*}}, s0, 1.0
; GCN-DAG: v_add_f32{{.*}}, s1, 2.0
; GCN-DAG: v_add_f32{{.*}}, s2, 4.0
; GCN-DAG: v_add_f32{{.*}}, s3, -1.0
; GCN-DAG: v_add_f32{{.*}}, s4, 0.5
define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
%add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
store <5 x float> %add, <5 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}ps_mesa_v3i32:
; GCN-DAG: v_add_{{.*}}, 1, v0
; GCN-DAG: v_add_{{.*}}, 2, v1
; GCN-DAG: v_add_{{.*}}, 3, v2
define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) {
%add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
store <3 x i32> %add, <3 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}ps_mesa_v3f32:
; GCN-DAG: v_add_{{.*}}, 1.0, v0
; GCN-DAG: v_add_{{.*}}, 2.0, v1
; GCN-DAG: v_add_{{.*}}, 4.0, v2
define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) {
%add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
store <3 x float> %add, <3 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}ps_mesa_v5i32:
; GCN-DAG: v_add_{{.*}}, 1, v0
; GCN-DAG: v_add_{{.*}}, 2, v1
; GCN-DAG: v_add_{{.*}}, 3, v2
; GCN-DAG: v_add_{{.*}}, 4, v3
; GCN-DAG: v_add_{{.*}}, 5, v4
define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
%add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
store <5 x i32> %add, <5 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}ps_mesa_v5f32:
; GCN-DAG: v_add_f32{{.*}}, 1.0, v0
; GCN-DAG: v_add_f32{{.*}}, 2.0, v1
; GCN-DAG: v_add_f32{{.*}}, 4.0, v2
; GCN-DAG: v_add_f32{{.*}}, -1.0, v3
; GCN-DAG: v_add_f32{{.*}}, 0.5, v4
define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
%add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
store <5 x float> %add, <5 x float> addrspace(1)* undef
ret void
}
attributes #0 = { nounwind noinline }
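One point these shader-argument tests make, in contrast to the kernel-argument path patched above, is that no power-of-2 rounding shows up in the register assignment: each 32-bit lane simply takes the next SGPR for inreg arguments or the next VGPR otherwise, so a <5 x i32> inreg argument occupies exactly s0-s4 and a plain <5 x float> argument exactly v0-v4. A trivial sketch of that assumed lane-to-register mapping (illustration only, not the actual calling-convention lowering):

#include <cstdio>

// Assumed mapping for the amdgpu_ps tests above: one 32-bit register per
// lane, SGPRs for inreg arguments, VGPRs otherwise. Illustration only.
static void assignShaderArg(const char *Arg, unsigned Lanes, bool InReg) {
  char C = InReg ? 's' : 'v';
  std::printf("%-18s -> %c0..%c%u\n", Arg, C, C, Lanes - 1);
}

int main() {
  assignShaderArg("<3 x i32> inreg", 3, true);  // s0..s2
  assignShaderArg("<5 x i32> inreg", 5, true);  // s0..s4
  assignShaderArg("<3 x float>", 3, false);     // v0..v2
  assignShaderArg("<5 x float>", 5, false);     // v0..v4
  return 0;
}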


@@ -375,6 +375,122 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}v5i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 16
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind {
entry:
store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v5i16_arg:
; HSA-GFX9: kernarg_segment_byte_size = 32
; HSA-GFX9: kernarg_segment_alignment = 4
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind {
entry:
store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v5i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind {
entry:
store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v5f32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind {
entry:
store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v5i64_arg:
; HSA-GFX9: kernarg_segment_byte_size = 128
; HSA-GFX9: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
entry:
store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}v5f64_arg:
; HSA-GFX9: kernarg_segment_byte_size = 128
; HSA-GFX9: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
entry:
store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
ret void
}
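The kernarg_segment_byte_size and kernarg_segment_alignment values checked in the v5 tests above are consistent with the rounding introduced in this patch: the <5 x T> argument is padded to <8 x T>, placed after the 8-byte output pointer at an offset aligned to the padded size, and the segment alignment is reported as the log2 of the largest argument alignment with a floor of 4 (16 bytes). The sketch below reproduces those numbers under that assumed layout; it is an illustration, not the actual kernel-argument lowering.

#include <cstdio>

static unsigned alignTo(unsigned Offset, unsigned Align) {
  return (Offset + Align - 1) / Align * Align;
}

// Assumed layout: 8-byte output pointer, then the <5 x T> argument padded to
// <8 x T> and aligned to its padded size; the segment itself is padded to a
// 16-byte multiple. Illustration only.
static void kernargLayout(const char *Name, unsigned EltBytes) {
  const unsigned PtrBytes = 8;
  const unsigned PaddedBytes = 8 * EltBytes; // <5 x T> rounded up to <8 x T>
  unsigned Offset = alignTo(PtrBytes, PaddedBytes);
  unsigned Size = alignTo(Offset + PaddedBytes, 16);
  unsigned Log2Align = 4; // floor of 2^4 = 16 bytes
  for (unsigned A = PaddedBytes; A > 16; A /= 2)
    ++Log2Align;
  std::printf("%s: kernarg_segment_byte_size = %u, alignment = %u\n", Name,
              Size, Log2Align);
}

int main() {
  kernargLayout("v5i8_arg", 1);  // 16, 4
  kernargLayout("v5i16_arg", 2); // 32, 4
  kernargLayout("v5i32_arg", 4); // 64, 5
  kernargLayout("v5i64_arg", 8); // 128, 6
  return 0;
}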
; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 16