forked from OSchip/llvm-project
parent
248a13057a
commit
46165b2409
|
@ -1,16 +1,56 @@
|
|||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,GCN %s
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI
|
||||
|
||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16:
|
||||
; GFX89: {{flat|global}}_load_dword
|
||||
; GFX89: {{flat|global}}_load_dword
|
||||
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_dword v1, v[2:3], off
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_sub_u16_e32 v2, v0, v1
|
||||
; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
@ -22,15 +62,46 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_test_sub_v2i16:
|
||||
; GFX9: s_load_dword [[VAL0:s[0-9]+]]
|
||||
; GFX9: s_load_dword [[VAL1:s[0-9]+]]
|
||||
; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]]
|
||||
|
||||
; VI: s_sub_i32
|
||||
; VI: s_sub_i32
|
||||
define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
|
||||
; GFX9-LABEL: s_test_sub_v2i16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
|
||||
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, s6, v0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_test_sub_v2i16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_load_dword s6, s[6:7], 0x0
|
||||
; VI-NEXT: s_load_dword s7, s[0:1], 0x0
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s4, s6, 16
|
||||
; VI-NEXT: s_lshr_b32 s5, s7, 16
|
||||
; VI-NEXT: s_sub_i32 s4, s4, s5
|
||||
; VI-NEXT: s_sub_i32 s6, s6, s7
|
||||
; VI-NEXT: s_and_b32 s5, s6, 0xffff
|
||||
; VI-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; VI-NEXT: s_or_b32 s4, s5, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
|
||||
%b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
|
||||
%add = sub <2 x i16> %a, %b
|
||||
|
@ -38,10 +109,16 @@ define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_test_sub_self_v2i16:
|
||||
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]]
|
||||
; GCN: buffer_store_dword [[ZERO]]
|
||||
define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
|
||||
; GCN-LABEL: s_test_sub_self_v2i16:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
|
||||
; GCN-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s2, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GCN-NEXT: s_endpgm
|
||||
%a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
|
||||
%add = sub <2 x i16> %a, %a
|
||||
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
|
||||
|
@ -49,33 +126,83 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <
|
|||
}
|
||||
|
||||
; FIXME: VI should not scalarize arg access.
|
||||
; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
|
||||
; GCN: s_load_dword s
|
||||
; GCN: s_load_dword s
|
||||
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; VI: s_sub_i32
|
||||
; VI: s_sub_i32
|
||||
; VI: s_lshl_b32
|
||||
; VI: s_and_b32
|
||||
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
|
||||
; GFX9-LABEL: s_test_sub_v2i16_kernarg:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: s_test_sub_v2i16_kernarg:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
|
||||
; VI-NEXT: s_load_dword s0, s[0:1], 0x30
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; VI-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; VI-NEXT: s_sub_i32 s1, s1, s3
|
||||
; VI-NEXT: s_sub_i32 s0, s2, s0
|
||||
; VI-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; VI-NEXT: s_and_b32 s0, s0, 0xffff
|
||||
; VI-NEXT: s_or_b32 s0, s0, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%add = sub <2 x i16> %a, %b
|
||||
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
|
||||
; GFX89-DAG: {{flat|global}}_load_dword
|
||||
|
||||
; GFX9-DAG: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
|
||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI: v_or_b32
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16_constant:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x1c8007b
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: s_mov_b32 s4, s0
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16_constant:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0
|
||||
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
@ -86,14 +213,46 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
|
|||
}
|
||||
|
||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant:
|
||||
; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3df
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
|
||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0xfc21fcb3
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: s_mov_b32 s4, s0
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16_neg_constant:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 0x3df
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0
|
||||
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
@ -103,15 +262,45 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}}
|
||||
|
||||
; VI-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
||||
; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
|
||||
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD]]
|
||||
; VI: v_or_b32_e32
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: s_mov_b32 s4, s0
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 1
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u16_e32 v1, 1, v0
|
||||
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
@ -121,14 +310,44 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}}
|
||||
|
||||
; VI: flat_load_dword [[LOAD:v[0-9]+]]
|
||||
; VI-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, [[LOAD]]
|
||||
; VI-DAG: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffe0, [[LOAD]]
|
||||
; VI: v_or_b32_e32 v{{[0-9]+}}, [[ADD]], [[AND]]
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: s_mov_b32 s4, s0
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
|
||||
; VI-NEXT: v_add_u16_e32 v0, 0xffffffe0, v0
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
@ -139,17 +358,45 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
|
|||
}
|
||||
|
||||
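; The high element of the constant is 0x3f80, so the packed 32-bit value has the bit pattern of f32 1.0 (an inline fp immediate).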
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split:
|
||||
; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
|
||||
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
|
||||
|
||||
; VI-NOT: v_subrev_i16
|
||||
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
|
||||
; VI: flat_load_dword
|
||||
; VI: v_add_u16_sdwa [[ADD:v[0-9]+]], v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NOT: v_subrev_i16
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s8, 1.0
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s6, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: s_mov_b32 s4, s0
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8
|
||||
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 0xffffc080
|
||||
; VI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s6, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s4, s0
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
@ -160,22 +407,55 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
|
|||
}
|
||||
|
||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i32:
|
||||
; GFX9: global_load_dword [[A:v[0-9]+]]
|
||||
; GFX9: global_load_dword [[B:v[0-9]+]]
|
||||
|
||||
; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
|
||||
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
||||
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
||||
|
||||
; VI: flat_load_dword v[[A:[0-9]+]]
|
||||
; VI: flat_load_dword v[[B:[0-9]+]]
|
||||
|
||||
; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]]
|
||||
; VI-NEXT: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_dword v1, v[2:3], off
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v1, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3]
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_sub_u16_e32 v0, v1, v2
|
||||
; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
@ -189,21 +469,59 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
|
|||
}
|
||||
|
||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64:
|
||||
; GFX9: global_load_dword [[A:v[0-9]+]]
|
||||
; GFX9: global_load_dword [[B:v[0-9]+]]
|
||||
|
||||
; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
|
||||
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
||||
; GFX9: buffer_store_dwordx4
|
||||
|
||||
; VI: flat_load_dword [[A:v[0-9]+]]
|
||||
; VI: flat_load_dword [[B:v[0-9]+]]
|
||||
; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], [[A]], [[B]]
|
||||
; VI: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], [[A]], [[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: buffer_store_dwordx4
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_dword v1, v[2:3], off
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v0
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v4, v[4:5]
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v3, v1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_sub_u16_e32 v0, v2, v4
|
||||
; VI-NEXT: v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
@ -217,22 +535,57 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
|
|||
}
|
||||
|
||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i32:
|
||||
; GFX9: global_load_dword [[A:v[0-9]+]]
|
||||
; GFX9: global_load_dword [[B:v[0-9]+]]
|
||||
|
||||
; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
|
||||
; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
|
||||
; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
|
||||
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
|
||||
|
||||
; VI: flat_load_dword
|
||||
; VI: flat_load_dword
|
||||
; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; VI: buffer_store_dwordx2
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: global_load_dword v1, v[2:3], off
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
|
||||
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
|
||||
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
|
||||
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: flat_load_dword v1, v[2:3]
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-NEXT: v_sub_u16_e32 v0, v0, v1
|
||||
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
||||
; VI-NEXT: v_bfe_i32 v1, v2, 0, 16
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
@ -246,21 +599,62 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
|
|||
}
|
||||
|
||||
; FIXME: Need to handle non-uniform case for function below (load without gep).
|
||||
; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i64:
|
||||
; GCN: {{flat|global}}_load_dword
|
||||
; GCN: {{flat|global}}_load_dword
|
||||
|
||||
; GFX9: v_pk_sub_i16
|
||||
; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
|
||||
|
||||
; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_sub_u16_e32
|
||||
|
||||
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
|
||||
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
|
||||
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
|
||||
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
|
||||
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX9-NEXT: s_mov_b32 s2, -1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX9-NEXT: global_load_dword v2, v[2:3], off
|
||||
; GFX9-NEXT: global_load_dword v0, v[0:1], off
|
||||
; GFX9-NEXT: s_mov_b32 s0, s4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_sub_i16 v1, v0, v2
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
||||
; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
|
||||
; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
|
||||
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
||||
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
||||
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; VI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; VI-NEXT: s_mov_b32 s2, -1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_load_dword v2, v[2:3]
|
||||
; VI-NEXT: flat_load_dword v0, v[0:1]
|
||||
; VI-NEXT: s_mov_b32 s0, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_sub_u16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-NEXT: v_sub_u16_e32 v0, v0, v2
|
||||
; VI-NEXT: v_bfe_i32 v2, v1, 0, 16
|
||||
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
||||
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
||||
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
||||
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
|
||||
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
|
||||
|
|
Loading…
Reference in New Issue