forked from OSchip/llvm-project
[AMDGPU] Prepare for introduction of v3 and v5 MVTs
AMDGPU would like to have MVTs for v3i32, v3f32, v5i32, v5f32. This commit does not add them, but makes preparatory changes: * Fixed assumptions of power-of-2 vector type in kernel arg handling, and added v5 kernel arg tests and v3/v5 shader arg tests. * Added v5 tests for cost analysis. * Added vec3/vec5 arg test cases. Some of this patch is from Matt Arsenault, also of AMD. Differential Revision: https://reviews.llvm.org/D58928 Change-Id: I7279d6b4841464d2080eb255ef3c589e268eabcd llvm-svn: 356342
This commit is contained in:
parent
d1477e989c
commit
e30aa6a136
|
@ -1008,9 +1008,10 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
|
|||
if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
|
||||
MemVT = MemVT.getScalarType();
|
||||
|
||||
if (MemVT.isExtended()) {
|
||||
// This should really only happen if we have vec3 arguments
|
||||
assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
|
||||
// Round up vec3/vec5 argument.
|
||||
if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
|
||||
assert(MemVT.getVectorNumElements() == 3 ||
|
||||
MemVT.getVectorNumElements() == 5);
|
||||
MemVT = MemVT.getPow2VectorType(State.getContext());
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,9 @@ define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
|
|||
}
|
||||
|
||||
; CHECK: 'add_v3i32'
|
||||
; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
|
||||
; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 3 when it is legal.
|
||||
; CHECK: estimated cost of {{[34]}} for {{.*}} add <3 x i32>
|
||||
define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
|
||||
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
|
||||
%add = add <3 x i32> %vec, %b
|
||||
|
@ -37,6 +39,17 @@ define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
|
|||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_v5i32'
|
||||
; Allow for 8 when v5i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 5 when it is legal.
|
||||
; CHECK: estimated cost of {{[58]}} for {{.*}} add <5 x i32>
|
||||
define amdgpu_kernel void @add_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
|
||||
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
|
||||
%add = add <5 x i32> %vec, %b
|
||||
store <5 x i32> %add, <5 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'add_i64'
|
||||
; CHECK: estimated cost of 2 for {{.*}} add i64
|
||||
define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
|
|
|
@ -38,6 +38,15 @@ define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v5i32'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <5 x i32>
|
||||
define amdgpu_kernel void @extractelement_v5i32(i32 addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr) {
|
||||
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
|
||||
%elt = extractelement <5 x i32> %vec, i32 1
|
||||
store i32 %elt, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v8i32'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32>
|
||||
define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
|
||||
|
|
|
@ -27,6 +27,15 @@ define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float
|
|||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'fabs_v5f32'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call <5 x float> @llvm.fabs.v5f32
|
||||
define amdgpu_kernel void @fabs_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%fabs = call <5 x float> @llvm.fabs.v5f32(<5 x float> %vec) #1
|
||||
store <5 x float> %fabs, <5 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'fabs_f64'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
|
||||
define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
|
||||
|
@ -84,6 +93,7 @@ define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half>
|
|||
declare float @llvm.fabs.f32(float) #1
|
||||
declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #1
|
||||
declare <3 x float> @llvm.fabs.v3f32(<3 x float>) #1
|
||||
declare <5 x float> @llvm.fabs.v5f32(<5 x float>) #1
|
||||
|
||||
declare double @llvm.fabs.f64(double) #1
|
||||
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #1
|
||||
|
|
|
@ -20,7 +20,9 @@ define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float
|
|||
}
|
||||
|
||||
; ALL: 'fadd_v3f32'
|
||||
; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
|
||||
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 3 when it is legal.
|
||||
; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
|
||||
define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%add = fadd <3 x float> %vec, %b
|
||||
|
@ -28,6 +30,17 @@ define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float
|
|||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fadd_v5f32'
|
||||
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 5 when it is legal.
|
||||
; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
|
||||
define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%add = fadd <5 x float> %vec, %b
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fadd_f64'
|
||||
; FASTF64: estimated cost of 2 for {{.*}} fadd double
|
||||
; SLOWF64: estimated cost of 3 for {{.*}} fadd double
|
||||
|
|
|
@ -26,8 +26,10 @@ define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float
|
|||
}
|
||||
|
||||
; ALL: 'fdiv_v3f32'
|
||||
; NOFP32DENORM: estimated cost of 36 for {{.*}} fdiv <3 x float>
|
||||
; FP32DENORMS: estimated cost of 30 for {{.*}} fdiv <3 x float>
|
||||
; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 36/30 when it is legal.
|
||||
; NOFP32DENORM: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float>
|
||||
; FP32DENORMS: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float>
|
||||
define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <3 x float> %vec, %b
|
||||
|
@ -35,6 +37,18 @@ define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float
|
|||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fdiv_v5f32'
|
||||
; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 60/50 when it is legal.
|
||||
; NOFP32DENORM: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float>
|
||||
; FP32DENORMS: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float>
|
||||
define amdgpu_kernel void @fdiv_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <5 x float> %vec, %b
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fdiv_f64'
|
||||
; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
|
||||
; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
|
||||
|
|
|
@ -20,7 +20,9 @@ define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float
|
|||
}
|
||||
|
||||
; ALL: 'fmul_v3f32'
|
||||
; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
|
||||
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 3 when it is legal.
|
||||
; ALL: estimated cost of {{[34]}} for {{.*}} fmul <3 x float>
|
||||
define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%add = fmul <3 x float> %vec, %b
|
||||
|
@ -28,6 +30,17 @@ define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float
|
|||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fmul_v5f32'
|
||||
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 5 when it is legal.
|
||||
; ALL: estimated cost of {{[58]}} for {{.*}} fmul <5 x float>
|
||||
define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%add = fmul <5 x float> %vec, %b
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fmul_f64'
|
||||
; FASTF64: estimated cost of 2 for {{.*}} fmul double
|
||||
; SLOWF64: estimated cost of 3 for {{.*}} fmul double
|
||||
|
|
|
@ -20,7 +20,9 @@ define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float
|
|||
}
|
||||
|
||||
; ALL: 'fsub_v3f32'
|
||||
; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
|
||||
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 3 when it is legal.
|
||||
; ALL: estimated cost of {{[34]}} for {{.*}} fsub <3 x float>
|
||||
define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%add = fsub <3 x float> %vec, %b
|
||||
|
@ -28,6 +30,17 @@ define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float
|
|||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fsub_v5f32'
|
||||
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 5 when it is legal.
|
||||
; ALL: estimated cost of {{[58]}} for {{.*}} fsub <5 x float>
|
||||
define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%add = fsub <5 x float> %vec, %b
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'fsub_f64'
|
||||
; FASTF64: estimated cost of 2 for {{.*}} fsub double
|
||||
; SLOWF64: estimated cost of 3 for {{.*}} fsub double
|
||||
|
|
|
@ -19,7 +19,9 @@ define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
|
|||
}
|
||||
|
||||
; CHECK: 'mul_v3i32'
|
||||
; CHECK: estimated cost of 9 for {{.*}} mul <3 x i32>
|
||||
; Allow for 12 when v3i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 9 when it is legal.
|
||||
; CHECK: estimated cost of {{9|12}} for {{.*}} mul <3 x i32>
|
||||
define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
|
||||
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <3 x i32> %vec, %b
|
||||
|
@ -27,6 +29,17 @@ define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> add
|
|||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_v5i32'
|
||||
; Allow for 24 when v5i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 15 when it is legal.
|
||||
; CHECK: estimated cost of {{15|24}} for {{.*}} mul <5 x i32>
|
||||
define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
|
||||
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <5 x i32> %vec, %b
|
||||
store <5 x i32> %mul, <5 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: 'mul_v4i32'
|
||||
; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
|
||||
define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
|
||||
|
|
|
@ -26,7 +26,9 @@ declare hidden void @external_void_func_f32(float) #0
|
|||
declare hidden void @external_void_func_f64(double) #0
|
||||
declare hidden void @external_void_func_v2f32(<2 x float>) #0
|
||||
declare hidden void @external_void_func_v2f64(<2 x double>) #0
|
||||
declare hidden void @external_void_func_v3f32(<3 x float>) #0
|
||||
declare hidden void @external_void_func_v3f64(<3 x double>) #0
|
||||
declare hidden void @external_void_func_v5f32(<5 x float>) #0
|
||||
|
||||
declare hidden void @external_void_func_v2i16(<2 x i16>) #0
|
||||
declare hidden void @external_void_func_v2f16(<2 x half>) #0
|
||||
|
@ -39,6 +41,7 @@ declare hidden void @external_void_func_v2i32(<2 x i32>) #0
|
|||
declare hidden void @external_void_func_v3i32(<3 x i32>) #0
|
||||
declare hidden void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
|
||||
declare hidden void @external_void_func_v4i32(<4 x i32>) #0
|
||||
declare hidden void @external_void_func_v5i32(<5 x i32>) #0
|
||||
declare hidden void @external_void_func_v8i32(<8 x i32>) #0
|
||||
declare hidden void @external_void_func_v16i32(<16 x i32>) #0
|
||||
declare hidden void @external_void_func_v32i32(<32 x i32>) #0
|
||||
|
@ -341,6 +344,30 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_v3f32_imm:
|
||||
; GCN-DAG: v_mov_b32_e32 v0, 1.0
|
||||
; GCN-DAG: v_mov_b32_e32 v1, 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 v2, 4.0
|
||||
; GCN-NOT: v3,
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
|
||||
call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_v5f32_imm:
|
||||
; GCN-DAG: v_mov_b32_e32 v0, 1.0
|
||||
; GCN-DAG: v_mov_b32_e32 v1, 2.0
|
||||
; GCN-DAG: v_mov_b32_e32 v2, 4.0
|
||||
; GCN-DAG: v_mov_b32_e32 v3, -1.0
|
||||
; GCN-DAG: v_mov_b32_e32 v4, 0.5
|
||||
; GCN-NOT: v5,
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
|
||||
call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
|
||||
; GCN: v_mov_b32_e32 v0, 0{{$}}
|
||||
; GCN: v_mov_b32_e32 v1, 0x40100000
|
||||
|
@ -519,6 +546,19 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_v5i32_imm:
|
||||
; GCN-DAG: v_mov_b32_e32 v0, 1
|
||||
; GCN-DAG: v_mov_b32_e32 v1, 2
|
||||
; GCN-DAG: v_mov_b32_e32 v2, 3
|
||||
; GCN-DAG: v_mov_b32_e32 v3, 4
|
||||
; GCN-DAG: v_mov_b32_e32 v4, 5
|
||||
; GCN-NOT: v5,
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
|
||||
call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
|
||||
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
|
||||
|
@ -764,9 +804,140 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}stack_12xv3i32:
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 v31, 11
|
||||
; GCN: s_getpc
|
||||
define void @stack_12xv3i32() #0 {
|
||||
entry:
|
||||
call void @external_void_func_12xv3i32(
|
||||
<3 x i32><i32 0, i32 0, i32 0>,
|
||||
<3 x i32><i32 1, i32 1, i32 1>,
|
||||
<3 x i32><i32 2, i32 2, i32 2>,
|
||||
<3 x i32><i32 3, i32 3, i32 3>,
|
||||
<3 x i32><i32 4, i32 4, i32 4>,
|
||||
<3 x i32><i32 5, i32 5, i32 5>,
|
||||
<3 x i32><i32 6, i32 6, i32 6>,
|
||||
<3 x i32><i32 7, i32 7, i32 7>,
|
||||
<3 x i32><i32 8, i32 8, i32 8>,
|
||||
<3 x i32><i32 9, i32 9, i32 9>,
|
||||
<3 x i32><i32 10, i32 11, i32 12>,
|
||||
<3 x i32><i32 13, i32 14, i32 15>)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}stack_12xv3f32:
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 v31, 0x41300000
|
||||
; GCN: s_getpc
|
||||
define void @stack_12xv3f32() #0 {
|
||||
entry:
|
||||
call void @external_void_func_12xv3f32(
|
||||
<3 x float><float 0.0, float 0.0, float 0.0>,
|
||||
<3 x float><float 1.0, float 1.0, float 1.0>,
|
||||
<3 x float><float 2.0, float 2.0, float 2.0>,
|
||||
<3 x float><float 3.0, float 3.0, float 3.0>,
|
||||
<3 x float><float 4.0, float 4.0, float 4.0>,
|
||||
<3 x float><float 5.0, float 5.0, float 5.0>,
|
||||
<3 x float><float 6.0, float 6.0, float 6.0>,
|
||||
<3 x float><float 7.0, float 7.0, float 7.0>,
|
||||
<3 x float><float 8.0, float 8.0, float 8.0>,
|
||||
<3 x float><float 9.0, float 9.0, float 9.0>,
|
||||
<3 x float><float 10.0, float 11.0, float 12.0>,
|
||||
<3 x float><float 13.0, float 14.0, float 15.0>)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}stack_8xv5i32:
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
|
||||
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
|
||||
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
|
||||
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
|
||||
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
|
||||
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
|
||||
; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 v31, 7
|
||||
; GCN: s_getpc
|
||||
define void @stack_8xv5i32() #0 {
|
||||
entry:
|
||||
call void @external_void_func_8xv5i32(
|
||||
<5 x i32><i32 0, i32 0, i32 0, i32 0, i32 0>,
|
||||
<5 x i32><i32 1, i32 1, i32 1, i32 1, i32 1>,
|
||||
<5 x i32><i32 2, i32 2, i32 2, i32 2, i32 2>,
|
||||
<5 x i32><i32 3, i32 3, i32 3, i32 3, i32 3>,
|
||||
<5 x i32><i32 4, i32 4, i32 4, i32 4, i32 4>,
|
||||
<5 x i32><i32 5, i32 5, i32 5, i32 5, i32 5>,
|
||||
<5 x i32><i32 6, i32 7, i32 8, i32 9, i32 10>,
|
||||
<5 x i32><i32 11, i32 12, i32 13, i32 14, i32 15>)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}stack_8xv5f32:
|
||||
; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
|
||||
; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
|
||||
; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
|
||||
; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
|
||||
; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
|
||||
; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
|
||||
; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
|
||||
; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
|
||||
; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
|
||||
; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
|
||||
; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
|
||||
; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
|
||||
; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
|
||||
; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
|
||||
; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
|
||||
; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
|
||||
; GCN: v_mov_b32_e32 v31, 0x40e00000
|
||||
; GCN: s_getpc
|
||||
define void @stack_8xv5f32() #0 {
|
||||
entry:
|
||||
call void @external_void_func_8xv5f32(
|
||||
<5 x float><float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,
|
||||
<5 x float><float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>,
|
||||
<5 x float><float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>,
|
||||
<5 x float><float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>,
|
||||
<5 x float><float 4.0, float 4.0, float 4.0, float 4.0, float 4.0>,
|
||||
<5 x float><float 5.0, float 5.0, float 5.0, float 5.0, float 5.0>,
|
||||
<5 x float><float 6.0, float 7.0, float 8.0, float 9.0, float 10.0>,
|
||||
<5 x float><float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare hidden void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0
|
||||
declare hidden void @stack_passed_f64_arg(<32 x i32>, double) #0
|
||||
|
||||
declare hidden void @external_void_func_12xv3i32(<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>,
|
||||
<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>) #0
|
||||
declare hidden void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>,
|
||||
<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) #0
|
||||
declare hidden void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x float>, <3 x float>,
|
||||
<3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0
|
||||
declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>,
|
||||
<5 x float>, <5 x float>, <5 x float>, <5 x float>) #0
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { nounwind noinline }
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s
|
||||
|
||||
declare void @external_void_func_void() #0
|
||||
|
||||
|
@ -26,6 +26,8 @@ declare double @external_f64_func_void() #0
|
|||
|
||||
declare <2 x half> @external_v2f16_func_void() #0
|
||||
declare <4 x half> @external_v4f16_func_void() #0
|
||||
declare <3 x float> @external_v3f32_func_void() #0
|
||||
declare <5 x float> @external_v5f32_func_void() #0
|
||||
declare <2 x double> @external_v2f64_func_void() #0
|
||||
|
||||
declare <2 x i32> @external_v2i32_func_void() #0
|
||||
|
@ -171,6 +173,11 @@ define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_v3i32_func_void:
|
||||
; GCN: s_swappc
|
||||
; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1]
|
||||
; GFX7-DAG: flat_store_dword {{.*}}, v2
|
||||
; GFX89-DAG: buffer_store_dwordx2 v[0:1]
|
||||
; GFX89-DAG: buffer_store_dword v2
|
||||
define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 {
|
||||
%val = call <3 x i32> @external_v3i32_func_void()
|
||||
store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8
|
||||
|
@ -185,6 +192,11 @@ define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_v5i32_func_void:
|
||||
; GCN: s_swappc
|
||||
; GFX7-DAG: flat_store_dwordx4 {{.*}}, v[0:3]
|
||||
; GFX7-DAG: flat_store_dword {{.*}}, v4
|
||||
; GFX89-DAG: buffer_store_dwordx4 v[0:3]
|
||||
; GFX89-DAG: buffer_store_dword v4
|
||||
define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 {
|
||||
%val = call <5 x i32> @external_v5i32_func_void()
|
||||
store volatile <5 x i32> %val, <5 x i32> addrspace(1)* undef, align 8
|
||||
|
@ -240,6 +252,30 @@ define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 {
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_v3f32_func_void:
|
||||
; GCN: s_swappc
|
||||
; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1]
|
||||
; GFX7-DAG: flat_store_dword {{.*}}, v2
|
||||
; GFX89-DAG: buffer_store_dwordx2 v[0:1]
|
||||
; GFX89-DAG: buffer_store_dword v2
|
||||
define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 {
|
||||
%val = call <3 x float> @external_v3f32_func_void()
|
||||
store volatile <3 x float> %val, <3 x float> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_v5f32_func_void:
|
||||
; GCN: s_swappc
|
||||
; GFX7-DAG: flat_store_dwordx4 {{.*}}, v[0:3]
|
||||
; GFX7-DAG: flat_store_dword {{.*}}, v4
|
||||
; GFX89-DAG: buffer_store_dwordx4 v[0:3]
|
||||
; GFX89-DAG: buffer_store_dword v4
|
||||
define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 {
|
||||
%val = call <5 x float> @external_v5f32_func_void()
|
||||
store volatile <5 x float> %val, <5 x float> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_call_external_i32_i64_func_void:
|
||||
define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 {
|
||||
%val = call { i32, i64 } @external_i32_i64_func_void()
|
||||
|
|
|
@ -200,4 +200,94 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_inreg_v3i32:
|
||||
; GCN-DAG: s_add_i32 s0, s0, 1
|
||||
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2
|
||||
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3
|
||||
define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
|
||||
%add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
|
||||
store <3 x i32> %add, <3 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_inreg_v3f32:
|
||||
; GCN-DAG: v_add_f32{{.*}}, s0, 1.0
|
||||
; GCN-DAG: v_add_f32{{.*}}, s1, 2.0
|
||||
; GCN-DAG: v_add_f32{{.*}}, s2, 4.0
|
||||
define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
|
||||
%add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
|
||||
store <3 x float> %add, <3 x float> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_inreg_v5i32:
|
||||
; GCN-DAG: s_add_i32 s0, s0, 1
|
||||
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2
|
||||
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3
|
||||
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s3, 4
|
||||
; GCN-DAG: s_add_i32 s{{[0-9]*}}, s4, 5
|
||||
define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
|
||||
%add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
|
||||
store <5 x i32> %add, <5 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_inreg_v5f32:
|
||||
; GCN-DAG: v_add_f32{{.*}}, s0, 1.0
|
||||
; GCN-DAG: v_add_f32{{.*}}, s1, 2.0
|
||||
; GCN-DAG: v_add_f32{{.*}}, s2, 4.0
|
||||
; GCN-DAG: v_add_f32{{.*}}, s3, -1.0
|
||||
; GCN-DAG: v_add_f32{{.*}}, s4, 0.5
|
||||
define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
|
||||
%add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_v3i32:
|
||||
; GCN-DAG: v_add_{{.*}}, 1, v0
|
||||
; GCN-DAG: v_add_{{.*}}, 2, v1
|
||||
; GCN-DAG: v_add_{{.*}}, 3, v2
|
||||
define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) {
|
||||
%add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
|
||||
store <3 x i32> %add, <3 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_v3f32:
|
||||
; GCN-DAG: v_add_{{.*}}, 1.0, v0
|
||||
; GCN-DAG: v_add_{{.*}}, 2.0, v1
|
||||
; GCN-DAG: v_add_{{.*}}, 4.0, v2
|
||||
define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) {
|
||||
%add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
|
||||
store <3 x float> %add, <3 x float> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_v5i32:
|
||||
; GCN-DAG: v_add_{{.*}}, 1, v0
|
||||
; GCN-DAG: v_add_{{.*}}, 2, v1
|
||||
; GCN-DAG: v_add_{{.*}}, 3, v2
|
||||
; GCN-DAG: v_add_{{.*}}, 4, v3
|
||||
; GCN-DAG: v_add_{{.*}}, 5, v4
|
||||
define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
|
||||
%add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
|
||||
store <5 x i32> %add, <5 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}ps_mesa_v5f32:
|
||||
; GCN-DAG: v_add_f32{{.*}}, 1.0, v0
|
||||
; GCN-DAG: v_add_f32{{.*}}, 2.0, v1
|
||||
; GCN-DAG: v_add_f32{{.*}}, 4.0, v2
|
||||
; GCN-DAG: v_add_f32{{.*}}, -1.0, v3
|
||||
; GCN-DAG: v_add_f32{{.*}}, 0.5, v4
|
||||
define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
|
||||
%add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
|
||||
attributes #0 = { nounwind noinline }
|
||||
|
|
|
@ -375,6 +375,122 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}v5i8_arg:
; HSA-GFX9: kernarg_segment_byte_size = 16
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 46

; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind {
entry:
  store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v5i16_arg:
; HSA-GFX9: kernarg_segment_byte_size = 32
; HSA-GFX9: kernarg_segment_alignment = 4

; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58
; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 58

; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd

; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind {
entry:
  store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v5i32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind {
entry:
  store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v5f32_arg:
; HSA-GFX9: kernarg_segment_byte_size = 64
; HSA-GFX9: kernarg_segment_alignment = 5
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
; HSA-GFX9: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind {
entry:
  store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v5i64_arg:
; HSA-GFX9: kernarg_segment_byte_size = 128
; HSA-GFX9: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
entry:
  store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}v5f64_arg:
; HSA-GFX9: kernarg_segment_byte_size = 128
; HSA-GFX9: kernarg_segment_alignment = 6
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; SI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
; SI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x21
; MESA-VI-DAG: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x84
; HSA-GFX9-DAG: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
; HSA-GFX9-DAG: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x60
define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
entry:
  store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
  ret void
}

; FIXME: Lots of unpack and re-pack junk on VI
|
||||
; FUNC-LABEL: {{^}}v8i8_arg:
|
||||
; HSA-GFX9: kernarg_segment_byte_size = 16
|
||||
|
|
Loading…
Reference in New Issue