; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s

; Loads an i16 from local memory (addrspace(3)) and inserts it into element 0
; of a <2 x i16> whose other element is undef, then returns the vector.
; The gfx900 checks expect a d16 load (ds_read_u16_d16); the gfx906 (+sram-ecc)
; and fiji checks expect a plain ds_read_u16.
define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
; GFX900-LABEL: load_local_lo_v2i16_undeflo:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_undeflo:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_undeflo:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  ret <2 x i16> %build
}

; Builds a <2 x i16> from scratch: %reg goes into element 1 of an undef vector
; and an i16 loaded from local memory (addrspace(3)) goes into element 0.
; All three targets are checked for a plain ds_read_u16 followed by ALU
; packing (and + lshl_or on gfx900/gfx906, lshlrev + or on fiji).
define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
; GFX900-LABEL: load_local_lo_v2i16_reglo:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_reglo:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_reglo:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
  ret <2 x i16> %build1
}

; Show that we get reasonable regalloc without physreg constraints.
; Same vector construction as load_local_lo_v2i16_reglo, but the result is
; stored to an undef global (addrspace(1)) pointer instead of being returned,
; so the packed value is not tied to a return register.
define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Inserts an i16 loaded from local memory (addrspace(3)) into element 0 of a
; zeroinitializer <2 x i16>, so element 1 of the result must be zero.
; The gfx900 checks expect the destination to be zeroed and then filled by a
; d16 load; the other two targets are checked for a plain ds_read_u16.
define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
; GFX900-LABEL: load_local_lo_v2i16_zerolo:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: ds_read_u16_d16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_zerolo:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_zerolo:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
  ret <2 x i16> %build
}

; Inserts a half loaded from local memory (addrspace(3)) into element 0 of the
; constant vector <half 0.0, half 2.0>, leaving 2.0 in element 1.
; The gfx900 checks expect the constant to be materialized first and the low
; half then overwritten by a d16 load; gfx906 and fiji are checked for a plain
; ds_read_u16 combined with the constant by ALU instructions.
define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
; GFX900-LABEL: load_local_lo_v2f16_fpimm:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v1, 2.0
; GFX900-NEXT: ds_read_u16_d16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2f16_fpimm:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_movk_i32 s4, 0x4000
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, s4, 16, v0
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2f16_fpimm:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, 2.0, v0
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load half, half addrspace(3)* %in
  %build = insertelement <2 x half> <half 0.0, half 2.0>, half %load, i32 0
  ret <2 x half> %build
}

; Reinterprets the i32 argument %reg as <2 x half>, overwrites element 0 with
; a half loaded from local memory (addrspace(3)), and stores the result to an
; undef global (addrspace(1)) pointer.
; The gfx900 checks expect a single d16 load directly into the register that
; holds %reg; gfx906 and fiji are checked for a plain load plus ALU merging.
define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_local_lo_v2f16_reghi_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2f16_reghi_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2f16_reghi_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load half, half addrspace(3)* %in
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Builds a <2 x half> with the scalar half argument %reg in element 1 and a
; half loaded from local memory (addrspace(3)) in element 0, then stores it to
; an undef global (addrspace(1)) pointer.
; All three targets are checked for a plain ds_read_u16 followed by ALU
; packing.
define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load half, half addrspace(3)* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 1
  %build1 = insertelement <2 x half> %build0, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Loads an i8 from local memory (addrspace(3)), zero-extends it to i16, and
; writes it into element 0 of %reg reinterpreted as <2 x i16>; the result is
; stored to an undef global (addrspace(1)) pointer.
; The gfx900 checks expect the unsigned byte d16 load (ds_read_u8_d16); gfx906
; and fiji are checked for ds_read_u8 plus a bit-merge (v_bfi / v_perm).
define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u8_d16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u8 v0, v0
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u8 v0, v0
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Loads an i8 from local memory (addrspace(3)), zero-extends it to i16, and
; places it in element 0 of a <2 x i16> whose element 1 is the scalar %reg;
; the result is stored to an undef global (addrspace(1)) pointer.
; All three targets are checked for a plain ds_read_u8 followed by ALU
; packing.
define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u8 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u8 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u8 v0, v0
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Loads an i8 from local memory (addrspace(3)), sign-extends it to i16, and
; writes it into element 0 of %reg reinterpreted as <2 x i16>; the result is
; stored to an undef global (addrspace(1)) pointer.
; The gfx900 checks expect the signed byte d16 load (ds_read_i8_d16); gfx906
; and fiji are checked for ds_read_i8 plus ALU merging.
define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_i8_d16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_i8 v0, v0
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_i8 v0, v0
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Loads an i8 from local memory (addrspace(3)), sign-extends it to i16, and
; places it in element 0 of a <2 x i16> whose element 1 is the scalar %reg;
; the result is stored to an undef global (addrspace(1)) pointer.
; All three targets are checked for a plain ds_read_i8 followed by ALU
; packing.
define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
; GFX900-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_i8 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_i8 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_i8 v0, v0
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Half-typed variant of the zext-i8 case: the i8 loaded from local memory
; (addrspace(3)) is zero-extended to i16, bitcast to half, and inserted into
; element 0 of a <2 x half> whose element 1 is %reg; the result is stored to
; an undef global (addrspace(1)) pointer.
; All three targets are checked for a plain ds_read_u8 followed by ALU
; packing.
define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u8 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u8 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u8 v0, v0
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 1
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Half-typed variant of the sext-i8 case: the i8 loaded from local memory
; (addrspace(3)) is sign-extended to i16, bitcast to half, and inserted into
; element 0 of a <2 x half> whose element 1 is %reg; the result is stored to
; an undef global (addrspace(1)) pointer.
; All three targets are checked for a plain ds_read_i8 followed by ALU
; packing.
define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
; GFX900-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_i8 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_i8 v0, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_i8 v0, v0
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 1
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; The loaded i16 has two uses here: it is stored back to the null local
; (addrspace(3)) pointer as a scalar, and it is also inserted into element 0
; of %reg before the vector is stored to an undef global (addrspace(1))
; pointer.
; All three targets, including gfx900, are checked for a plain ds_read_u16
; rather than a d16 load.
; NOTE(review): %elt1 is extracted but never used in this function.
define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_write_b16 v2, v0
; GFX900-NEXT: v_bfi_b32 v0, v3, v0, v1
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: v_mov_b32_e32 v2, 0
; GFX906-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: ds_write_b16 v2, v0
; GFX906-NEXT: v_bfi_b32 v0, v3, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_mov_b32_e32 v2, 0
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: ds_write_b16 v2, v0
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %load = load i16, i16 addrspace(3)* %in
  %elt1 = extractelement <2 x i16> %reg, i32 1
  store i16 %load, i16 addrspace(3)* null
  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Low-half insert of a local (addrspace(3)) i16 load into %reg, where the high
; element of %reg has a second use: it is extracted and stored separately to
; addrspace(3) null before the rebuilt vector is stored.
;
; Per the assertions below:
;   * gfx900 shifts the high half out to a scratch VGPR first, then loads
;     straight into the register holding %reg with ds_read_u16_d16.
;   * gfx906 (run with +sram-ecc, see the RUN lines) uses a plain ds_read_u16
;     and merges with v_bfi_b32 and a 0xffff mask.
;   * fiji (gfx803) uses ds_read_u16 and rebuilds the dword with shift + or;
;     it also initialises m0 before the DS access.
define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX900-NEXT:    ds_read_u16_d16 v1, v0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    ds_write_b16 v0, v2
; GFX900-NEXT:    s_waitcnt lgkmcnt(1)
; GFX900-NEXT:    global_store_dword v[0:1], v1, off
; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    ds_read_u16 v0, v0
; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX906-NEXT:    v_mov_b32_e32 v3, 0
; GFX906-NEXT:    ds_write_b16 v3, v2
; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT:    s_waitcnt lgkmcnt(1)
; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_mov_b32 m0, -1
; GFX803-NEXT:    ds_read_u16 v0, v0
; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX803-NEXT:    v_mov_b32_e32 v2, 0
; GFX803-NEXT:    ds_write_b16 v2, v1
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %load = load i16, i16 addrspace(3)* %in
  %elt1 = extractelement <2 x i16> %reg, i32 1
  ; Second use of the high element: store it on its own.
  store i16 %elt1, i16 addrspace(3)* null
  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
|
|
|
; Low-half insert of a local (addrspace(3)) i16 load into %reg, where BOTH the
; loaded value and the high element of %reg have extra uses: the load is also
; stored to %out0 and the extracted high element to %out1. All three pointer
; arguments are noalias.
;
; Per the assertions below, no target selects a d16 load here: all three use a
; plain ds_read_u16. gfx900 and gfx906 merge with v_bfi_b32 and a 0xffff mask;
; fiji (gfx803) rebuilds the dword with a shift and an SDWA v_or_b32.
define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ds_read_u16 v0, v0
; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
; GFX900-NEXT:    ds_write_b16 v2, v0
; GFX900-NEXT:    ds_write_b16 v3, v4
; GFX900-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX900-NEXT:    v_bfi_b32 v0, v2, v0, v1
; GFX900-NEXT:    global_store_dword v[0:1], v0, off
; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    ds_read_u16 v0, v0
; GFX906-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
; GFX906-NEXT:    ds_write_b16 v2, v0
; GFX906-NEXT:    ds_write_b16 v3, v4
; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_mov_b32 m0, -1
; GFX803-NEXT:    ds_read_u16 v0, v0
; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
; GFX803-NEXT:    ds_write_b16 v2, v0
; GFX803-NEXT:    ds_write_b16 v3, v1
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %load = load i16, i16 addrspace(3)* %in
  %elt1 = extractelement <2 x i16> %reg, i32 1
  ; Extra uses of both halves, in addition to the vector rebuild below.
  store i16 %load, i16 addrspace(3)* %out0
  store i16 %elt1, i16 addrspace(3)* %out1
  %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2017-11-13 08:22:09 +08:00
|
|
|
; Low-half insert of a global (addrspace(1)) i16 load into a <2 x i16> that is
; bitcast from the i32 argument %reg. The load address is %in minus 2047 i16
; elements, i.e. a byte offset of -4094.
;
; Per the assertions below:
;   * gfx900 folds the offset into global_load_short_d16 (offset:-4094) and
;     loads directly into the register holding %reg.
;   * gfx906 uses global_load_ushort with the same folded offset and merges
;     with v_bfi_b32 and a 0xffff mask.
;   * fiji (gfx803) materialises the address with a 64-bit add of 0xfffff002
;     (-4094 as a 32-bit pattern, with -1 added to the high half), then uses
;     flat_load_ushort and and/or to merge.
define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
  %load = load i16, i16 addrspace(1)* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
|
|
|
; Half-precision variant of the global low-half insert: a half is loaded from
; %in minus 2047 elements (byte offset -4094) and inserted into element 0 of
; a <2 x half> bitcast from the i32 argument %reg.
;
; Per the assertions below:
;   * gfx900 selects global_load_short_d16 with the offset folded.
;   * gfx906 uses global_load_ushort, then shifts the old high half down,
;     masks the loaded value to 16 bits and recombines with v_lshl_or_b32.
;   * fiji (gfx803) computes the address with a 64-bit add, then uses
;     flat_load_ushort and and/or to merge.
define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
  %load = load half, half addrspace(1)* %gep
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
|
|
|
; Low-half insert of a zero-extended global i8 load: the byte at %in - 4095 is
; loaded, zero-extended to i16 and inserted into element 0 of a <2 x i16>
; bitcast from the i32 argument %reg.
;
; Per the assertions below:
;   * gfx900 selects global_load_ubyte_d16 with the -4095 offset folded.
;   * gfx906 uses global_load_ubyte and merges with v_bfi_b32 and a 0xffff
;     mask.
;   * fiji (gfx803) computes the address with a 64-bit add of 0xfffff001
;     (-4095 as a 32-bit pattern), uses flat_load_ubyte, and combines the
;     halves with v_perm_b32 using the selector 0x5040c00.
define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
|
|
|
; Low-half insert of a sign-extended global i8 load: the byte at %in - 4095 is
; loaded, sign-extended to i16 and inserted into element 0 of a <2 x i16>
; bitcast from the i32 argument %reg.
;
; Per the assertions below:
;   * gfx900 selects global_load_sbyte_d16 with the -4095 offset folded.
;   * gfx906 uses global_load_sbyte and merges with v_bfi_b32 and a 0xffff
;     mask.
;   * fiji (gfx803) computes the address with a 64-bit add, uses
;     flat_load_sbyte, and merges with an SDWA v_or_b32 that selects only
;     WORD_0 of the loaded value.
define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2019-03-09 04:30:51 +08:00
|
|
|
; Half-typed variant of the zero-extended byte insert: the byte at %in - 4095
; is loaded, zero-extended to i16, bitcast to half and inserted into element 0
; of a <2 x half> bitcast from the i32 argument %reg.
;
; Per the assertions below:
;   * gfx900 selects global_load_ubyte_d16 with the -4095 offset folded.
;   * gfx906 uses global_load_ubyte, then shifts the old high half down,
;     masks the loaded value to 16 bits and recombines with v_lshl_or_b32.
;   * fiji (gfx803) computes the address with a 64-bit add, uses
;     flat_load_ubyte, and combines the halves with v_perm_b32 using the
;     selector 0x5040c00.
define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  ; Reinterpret the extended integer bits as a half; no numeric conversion.
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
|
|
|
; Half-typed variant of the sign-extended byte insert: the byte at %in - 4095
; is loaded, sign-extended to i16, bitcast to half and inserted into element 0
; of a <2 x half> bitcast from the i32 argument %reg.
;
; Per the assertions below:
;   * gfx900 selects global_load_sbyte_d16 with the -4095 offset folded.
;   * gfx906 uses global_load_sbyte, then shifts the old high half down,
;     masks the loaded value to 16 bits and recombines with v_lshl_or_b32.
;   * fiji (gfx803) computes the address with a 64-bit add, uses
;     flat_load_sbyte, and merges with an SDWA v_or_b32 that selects only
;     WORD_0 of the loaded value.
define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  ; Reinterpret the extended integer bits as a half; no numeric conversion.
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
2017-11-13 08:22:09 +08:00
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Low-half insert of an i16 loaded through an unqualified (flat) pointer into
; element 0 of a <2 x i16> bitcast from the i32 argument %reg. There is no
; address offset in this test.
;
; Per the assertions below:
;   * gfx900 selects flat_load_short_d16, loading directly into the register
;     holding %reg.
;   * gfx906 uses flat_load_ushort and merges with v_bfi_b32 and a 0xffff
;     mask.
;   * fiji (gfx803) uses flat_load_ushort and and/or to merge.
; Every target waits on both vmcnt and lgkmcnt after the flat load.
define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
; GFX900-LABEL: load_flat_lo_v2i16_reghi_vreg:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    flat_load_short_d16 v2, v[0:1]
; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_lo_v2i16_reghi_vreg:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    flat_load_ushort v0, v[0:1]
; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_lo_v2i16_reghi_vreg:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i16, i16* %in
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Load a half through the flat pointer %in and insert it into element 0 of the
; <2 x half> obtained by bitcasting %reg; element 1 is left as it was. The
; resulting vector is stored to an undef addrspace(1) pointer.
; The GFX900 checks expect a d16 load (flat_load_short_d16). The GFX906 checks
; expect flat_load_ushort plus shift/and/v_lshl_or_b32; the GFX803 checks
; expect flat_load_ushort plus v_and_b32 + v_or_b32.
define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
; GFX900-LABEL: load_flat_lo_v2f16_reghi_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_short_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_lo_v2f16_reghi_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_ushort v0, v[0:1]
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_lo_v2f16_reghi_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ushort v0, v[0:1]
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
; FIXME: the and above should be removable
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load half, half* %in
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Load an i8 through the flat pointer %in, zero-extend it to i16 and insert it
; into element 0 of the <2 x i16> obtained by bitcasting %reg. The resulting
; vector is stored to an undef addrspace(1) pointer.
; The GFX900 checks expect a d16 load (flat_load_ubyte_d16). The GFX906 checks
; expect flat_load_ubyte plus v_bfi_b32; the GFX803 checks expect
; flat_load_ubyte plus a v_perm_b32 with selector 0x5040c00.
define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_ubyte v0, v[0:1]
; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Load an i8 through the flat pointer %in, sign-extend it to i16 and insert it
; into element 0 of the <2 x i16> obtained by bitcasting %reg. The resulting
; vector is stored to an undef addrspace(1) pointer.
; The GFX900 checks expect a d16 load (flat_load_sbyte_d16). The GFX906 checks
; expect flat_load_sbyte plus v_bfi_b32; the GFX803 checks expect
; flat_load_sbyte plus v_and_b32 and an SDWA v_or_b32 selecting WORD_0.
define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
; GFX900-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_sbyte v0, v[0:1]
; GFX906-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2019-03-09 04:30:51 +08:00
|
|
|
; Load an i8 through the flat pointer %in, zero-extend it to i16, bitcast that
; to half and insert it into element 0 of the <2 x half> obtained by bitcasting
; %reg. The resulting vector is stored to an undef addrspace(1) pointer.
; The GFX900 checks expect a d16 load (flat_load_ubyte_d16). The GFX906 checks
; expect flat_load_ubyte plus shift/and/v_lshl_or_b32; the GFX803 checks
; expect flat_load_ubyte plus a v_perm_b32 with selector 0x5040c00.
define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_ubyte v0, v[0:1]
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
|
|
|
; Load an i8 through the flat pointer %in, sign-extend it to i16, bitcast that
; to half and insert it into element 0 of the <2 x half> obtained by bitcasting
; %reg. The resulting vector is stored to an undef addrspace(1) pointer.
; The GFX900 checks expect a d16 load (flat_load_sbyte_d16). The GFX906 checks
; expect flat_load_sbyte plus shift/and/v_lshl_or_b32; the GFX803 checks
; expect flat_load_sbyte plus v_and_b32 and an SDWA v_or_b32 selecting WORD_0.
define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
; GFX900-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1]
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v2, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: flat_load_sbyte v0, v[0:1]
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_sbyte v0, v[0:1]
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Load an i16 from the byval addrspace(5) argument %in at element index 2047
; (the checks show this as offset:4094 from s32) and insert it into element 0
; of the <2 x i16> obtained by bitcasting %reg. The resulting vector is stored
; to an undef addrspace(1) pointer.
; The GFX900 checks expect a d16 load (buffer_load_short_d16). The GFX906 and
; GFX803 checks expect buffer_load_ushort plus an explicit merge (v_bfi_b32,
; or v_and_b32 + v_or_b32).
define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
  %load = load i16, i16 addrspace(5)* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Build a <2 x i16> whose element 1 is the i16 argument %reg and whose
; element 0 is an i16 loaded from the byval addrspace(5) argument %in at
; element index 2047 (offset:4094 in the checks). The vector is stored to an
; undef addrspace(1) pointer.
; None of the three check prefixes expects a d16 load here: each expects
; buffer_load_ushort followed by an explicit shift and or.
define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Load a half from the byval addrspace(5) argument %in at element index 2047
; (offset:4094 in the checks) and insert it into element 0 of the <2 x half>
; obtained by bitcasting %reg. The resulting vector is stored to an undef
; addrspace(1) pointer.
; The GFX900 checks expect a d16 load (buffer_load_short_d16). The GFX906
; checks expect buffer_load_ushort plus shift/and/v_lshl_or_b32; the GFX803
; checks expect buffer_load_ushort plus v_and_b32 + v_or_b32.
define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v1, v0
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
  %load = load half, half addrspace(5)* %gep
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Volatile-load an i16 from the constant addrspace(5) address 4094 (built with
; inttoptr; the %in argument is not used by the body) and insert it into
; element 0 of the <2 x i16> obtained by bitcasting %reg. The resulting vector
; is stored to an undef addrspace(1) pointer.
; All three check prefixes expect a buffer load with soffset 0 and
; offset:4094. The GFX900 checks expect the d16 form (buffer_load_short_d16);
; the GFX906 and GFX803 checks expect buffer_load_ushort plus an explicit
; merge (v_bfi_b32, or v_and_b32 + v_or_b32).
define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Volatile-load an i16 from the constant addrspace(5) address 4094 (built with
; inttoptr; the %in argument is not used by the body) and insert it into
; element 0 of the <2 x i16> obtained by bitcasting the i32 argument %reg. The
; resulting vector is stored to an undef addrspace(1) pointer.
; All three check prefixes expect a buffer load with soffset 0 and
; offset:4094. The GFX900 checks expect the d16 form (buffer_load_short_d16);
; the GFX906 and GFX803 checks expect buffer_load_ushort plus an explicit
; merge (v_bfi_b32, or v_and_b32 + v_or_b32).
define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Volatile-load a half from the constant addrspace(5) address 4094 (built with
; inttoptr; the %in argument is not used by the body) and insert it into
; element 0 of the <2 x half> obtained by bitcasting %reg. The resulting
; vector is stored to an undef addrspace(1) pointer.
; All three check prefixes expect a buffer load with soffset 0 and
; offset:4094. The GFX900 checks expect the d16 form (buffer_load_short_d16);
; the GFX906 checks expect buffer_load_ushort plus shift/and/v_lshl_or_b32;
; the GFX803 checks expect buffer_load_ushort plus v_and_b32 + v_or_b32.
define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v1, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Load an i8 from the byval addrspace(5) argument %in at byte index 4095
; (offset:4095 from s32 in the checks), zero-extend it to i16 and insert it
; into element 0 of the <2 x i16> obtained by bitcasting %reg. The resulting
; vector is stored to an undef addrspace(1) pointer.
; The GFX900 checks expect a d16 load (buffer_load_ubyte_d16). The GFX906
; checks expect buffer_load_ubyte plus v_bfi_b32; the GFX803 checks expect
; buffer_load_ubyte plus a v_perm_b32 with selector 0x5040c00.
define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Low-half insert of a sign-extended byte loaded from private memory.
; An i8 is loaded from byte offset 4095 past the byval pointer %in,
; sign-extended to i16 and written into element 0 of %reg reinterpreted as
; <2 x i16>; element 1 of %reg is left untouched. The gfx900 checks expect a
; single buffer_load_sbyte_d16; the gfx906 checks expect a byte load merged by
; v_bfi_b32 and the fiji checks a mask followed by an SDWA v_or_b32.
define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v0, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Low-half insert of a zero-extended byte loaded from a constant private
; address. The volatile i8 load uses the literal address 4094 (inttoptr), so
; %in is not referenced by the body. The loaded byte is zero-extended to i16
; and written into element 0 of %reg reinterpreted as <2 x i16>. All three
; check groups expect the access to use an soffset of 0 with offset:4094;
; only gfx900 is expected to use the d16 form of the load.
define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v1, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; Low-half insert of a sign-extended byte loaded from a constant private
; address. The volatile i8 load uses the literal address 4094 (inttoptr), so
; %in is not referenced by the body. The loaded byte is sign-extended to i16
; and written into element 0 of %reg reinterpreted as <2 x i16>. All three
; check groups expect the access to use an soffset of 0 with offset:4094;
; only gfx900 is expected to use the d16 form of the load.
define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v1, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 offset:4094
; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-03 00:07:16 +08:00
|
|
|
; <2 x half> variant of the constant-address zero-extended byte load.
; A volatile i8 is loaded from the literal private address 4094 (inttoptr; %in
; is not referenced by the body), zero-extended to i16, bitcast to half and
; written into element 0 of %reg reinterpreted as <2 x half>. The gfx900
; checks expect buffer_load_ubyte_d16; the gfx906 checks expect a shift/and/
; v_lshl_or_b32 merge and the fiji checks a v_perm_b32 merge.
define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v1, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:4094
; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:4094
; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %bc.ext = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-14 02:00:25 +08:00
|
|
|
; Low-half insert of an i16 loaded from the constant address space.
; The i16 is loaded from element index -2047 relative to %in (addrspace 4)
; and written into element 0 of %reg reinterpreted as <2 x i16>. The gfx900
; checks expect global_load_short_d16 with offset:-4094, the gfx906 checks a
; global_load_ushort merged by v_bfi_b32, and the fiji checks an explicit
; 64-bit address add followed by flat_load_ushort and and/or merging.
define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_constant_lo_v2i16_reglo_vreg:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_constant_lo_v2i16_reglo_vreg:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
; GFX906-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v1, v0, v2
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_constant_lo_v2i16_reglo_vreg:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  %load = load i16, i16 addrspace(4)* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2018-02-14 02:00:25 +08:00
|
|
|
; Low-half insert of a half loaded from the constant address space.
; The half is loaded from element index -2047 relative to %in (addrspace 4)
; and written into element 0 of %reg reinterpreted as <2 x half>. The gfx900
; checks expect global_load_short_d16 with offset:-4094, the gfx906 checks a
; global_load_ushort merged through v_lshl_or_b32, and the fiji checks an
; explicit 64-bit address add followed by flat_load_ushort and and/or merging.
define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_short_d16 v2, v[0:1], off offset:-4094
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_ushort v0, v[0:1], off offset:-4094
; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff002, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_ushort v0, v[0:1]
; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  %load = load half, half addrspace(4)* %gep
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2019-03-09 04:30:51 +08:00
|
|
|
; Low-half insert of a zero-extended byte loaded from the constant address
; space, in the <2 x half> form. An i8 is loaded from byte offset -4095
; relative to %in (addrspace 4), zero-extended to i16, bitcast to half and
; written into element 0 of %reg reinterpreted as <2 x half>. The gfx900
; checks expect global_load_ubyte_d16 with offset:-4095; the gfx906 and fiji
; checks expect a plain byte load merged by v_lshl_or_b32 / v_perm_b32.
define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_ubyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_ubyte v0, v[0:1]
; GFX803-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
|
|
|
; Low-half insert of a sign-extended byte loaded from the constant address
; space, in the <2 x half> form. An i8 is loaded from byte offset -4095
; relative to %in (addrspace 4), sign-extended to i16, bitcast to half and
; written into element 0 of %reg reinterpreted as <2 x half>. The gfx900
; checks expect global_load_sbyte_d16 with offset:-4095; the gfx906 checks
; expect a v_lshl_or_b32 merge and the fiji checks an SDWA v_or_b32 merge.
define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
; GFX900-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    global_load_sbyte_d16 v2, v[0:1], off offset:-4095
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v2, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    global_load_sbyte v0, v[0:1], off offset:-4095
; GFX906-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff001, v0
; GFX803-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX803-NEXT:    flat_load_sbyte v0, v[0:1]
; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2017-11-14 07:24:26 +08:00
|
|
|
; Low-half insert of an i16 loaded from a stack object at a large frame
; offset. Two private allocas are created: %obj0 ([10 x i32]), which receives
; a volatile store of 123, and %obj1 ([4096 x i16]), from which element 2027
; is loaded with a volatile load. The loaded i16 is written into element 0 of
; %reg reinterpreted as <2 x i16>. All check groups expect the load to be
; addressed as s32 offset:4094; only gfx900 is expected to use the d16 form.
define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; GFX900-NEXT:    buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v0, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; GFX906-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; GFX803-NEXT:    buffer_load_ushort v1, off, s[0:3], s32 offset:4094
; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
  %load = load volatile i16, i16 addrspace(5)* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
|
|
|
; Low-half insert of a sign-extended byte loaded from a stack object at a
; large frame offset. Two private allocas are created: %obj0 ([10 x i32]),
; which receives a volatile store of 123, and %obj1 ([4096 x i8]), from which
; element 4055 is loaded with a volatile load. The byte is sign-extended to
; i16 and written into element 0 of %reg reinterpreted as <2 x i16>. All check
; groups expect the load to be addressed as s32 offset:4095; only gfx900 is
; expected to use the d16 form.
define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; GFX900-NEXT:    buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v0, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; GFX906-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; GFX803-NEXT:    buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
; GFX803-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load volatile i8, i8 addrspace(5)* %gep
  %load.ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
|
|
|
; Low-half insert of a zero-extended byte loaded from a stack object at a
; large frame offset. Two private allocas are created: %obj0 ([10 x i32]),
; which receives a volatile store of 123, and %obj1 ([4096 x i8]), from which
; element 4055 is loaded with a volatile load. The byte is zero-extended to
; i16 and written into element 0 of %reg reinterpreted as <2 x i16>. All check
; groups expect the load to be addressed as s32 offset:4095; only gfx900 is
; expected to use the d16 form.
define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX900-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; GFX900-NEXT:    buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    global_store_dword v[0:1], v0, off
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
; GFX906:       ; %bb.0: ; %entry
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX906-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; GFX906-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
; GFX906-NEXT:    v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    v_bfi_b32 v0, v2, v1, v0
; GFX906-NEXT:    global_store_dword v[0:1], v0, off
; GFX906-NEXT:    s_waitcnt vmcnt(0)
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX803-NEXT:    buffer_store_dword v1, off, s[0:3], s32
; GFX803-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
; GFX803-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX803-NEXT:    s_mov_b32 s4, 0x5040c00
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT:    flat_store_dword v[0:1], v0
; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT:    s_setpc_b64 s[30:31]
entry:
  %obj0 = alloca [10 x i32], align 4, addrspace(5)
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load volatile i8, i8 addrspace(5)* %gep
  %load.ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}
|
|
|
|
|
2019-03-09 04:30:51 +08:00
|
|
|
; Low-half insert of a sign-extended i8 load from an addrspace(5) alloca at a
; large constant index, with the vector typed as <2 x half>.
;
; The i32 argument is bitcast to <2 x half>; an i8 is volatile-loaded from
; index 4055 of a 4096-byte alloca, sign-extended to i16, bitcast to half, and
; inserted into element 0. The resulting vector is stored to an undef
; addrspace(1) pointer.
;
; Per the check lines below: the gfx900 run expects a single
; buffer_load_sbyte_d16 with offset 4095 writing straight into v0. The gfx906
; run expects buffer_load_sbyte followed by v_lshrrev_b32 / v_and_b32 /
; v_lshl_or_b32, and the gfx803 run expects buffer_load_sbyte followed by
; v_and_b32 with 0xffff0000 and an SDWA v_or_b32.
define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
|
2019-08-24 01:58:49 +08:00
|
|
|
; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
|
|
|
|
; GFX900: ; %bb.0: ; %entry
|
|
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
|
|
; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
|
|
; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
|
|
|
; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
|
|
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX900-NEXT: global_store_dword v[0:1], v0, off
|
|
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX900-NEXT: s_setpc_b64 s[30:31]
|
|
|
|
;
|
|
|
|
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
|
|
|
|
; GFX906: ; %bb.0: ; %entry
|
|
|
|
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
|
|
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
|
|
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
|
|
|
; GFX906-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
|
|
|
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
|
|
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
|
|
|
; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
|
|
|
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
|
|
|
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
|
|
|
;
|
|
|
|
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
|
|
|
|
; GFX803: ; %bb.0: ; %entry
|
|
|
|
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
|
|
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
|
|
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
|
|
|
; GFX803-NEXT: buffer_load_sbyte v1, off, s[0:3], s32 offset:4095
|
|
|
|
; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
|
|
|
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
|
|
|
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
|
|
|
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
|
|
; GFX803-NEXT: s_setpc_b64 s[30:31]
|
2019-03-09 04:30:51 +08:00
|
|
|
entry:
|
|
|
|
|
; 40-byte object (10 x i32). NOTE(review): presumably laid out ahead of %obj1
; in the frame, which is consistent with the checked offset 4095 = 40 + 4055.
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
|
|
|
%obj1 = alloca [4096 x i8], align 2, addrspace(5)
|
|
|
|
%reg.bc = bitcast i32 %reg to <2 x half>
|
|
|
|
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
|
|
|
|
; Volatile store of 123 to %obj0; shows up in the checks as the
; buffer_store_dword of 0x7b.
store volatile i32 123, i32 addrspace(5)* %bc
|
2019-06-06 06:37:50 +08:00
|
|
|
%gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
|
2019-03-09 04:30:51 +08:00
|
|
|
%load = load volatile i8, i8 addrspace(5)* %gep
|
|
|
|
; Sign-extend the byte to 16 bits, then reinterpret those bits as half.
%load.ext = sext i8 %load to i16
|
|
|
|
%bitcast = bitcast i16 %load.ext to half
|
|
|
|
; Overwrite only element 0; element 1 keeps the value from %reg.bc.
%build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
|
|
|
|
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; Low-half insert of a zero-extended i8 load from an addrspace(5) alloca at a
; large constant index, with the vector typed as <2 x half>.
;
; The i32 argument is bitcast to <2 x half>; an i8 is volatile-loaded from
; index 4055 of a 4096-byte alloca, zero-extended to i16, bitcast to half, and
; inserted into element 0. The resulting vector is stored to an undef
; addrspace(1) pointer.
;
; Per the check lines below: the gfx900 run expects a single
; buffer_load_ubyte_d16 with offset 4095 writing straight into v0. The gfx906
; run expects buffer_load_ubyte followed by v_lshrrev_b32 / v_and_b32 /
; v_lshl_or_b32, and the gfx803 run expects buffer_load_ubyte followed by
; v_lshrrev_b32 and a v_perm_b32 with selector 0x5040c00.
define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
|
2019-08-24 01:58:49 +08:00
|
|
|
; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
|
|
|
|
; GFX900: ; %bb.0: ; %entry
|
|
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
|
|
; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
|
|
; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
|
|
|
; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
|
|
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX900-NEXT: global_store_dword v[0:1], v0, off
|
|
|
|
; GFX900-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX900-NEXT: s_setpc_b64 s[30:31]
|
|
|
|
;
|
|
|
|
; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
|
|
|
|
; GFX906: ; %bb.0: ; %entry
|
|
|
|
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
|
|
; GFX906-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
|
|
; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
|
|
|
; GFX906-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
|
|
|
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
|
|
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
|
|
|
; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
|
|
|
; GFX906-NEXT: global_store_dword v[0:1], v0, off
|
|
|
|
; GFX906-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
|
|
|
;
|
|
|
|
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
|
|
|
|
; GFX803: ; %bb.0: ; %entry
|
|
|
|
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
|
|
; GFX803-NEXT: v_mov_b32_e32 v1, 0x7b
|
|
|
|
; GFX803-NEXT: buffer_store_dword v1, off, s[0:3], s32
|
|
|
|
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 offset:4095
|
|
|
|
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
|
|
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
|
|
|
|
; GFX803-NEXT: s_waitcnt vmcnt(0)
|
|
|
|
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
|
|
|
|
; GFX803-NEXT: flat_store_dword v[0:1], v0
|
|
|
|
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
|
|
; GFX803-NEXT: s_setpc_b64 s[30:31]
|
2019-03-09 04:30:51 +08:00
|
|
|
entry:
|
|
|
|
|
; 40-byte object (10 x i32). NOTE(review): presumably laid out ahead of %obj1
; in the frame, which is consistent with the checked offset 4095 = 40 + 4055.
%obj0 = alloca [10 x i32], align 4, addrspace(5)
|
|
|
|
%obj1 = alloca [4096 x i8], align 2, addrspace(5)
|
|
|
|
%reg.bc = bitcast i32 %reg to <2 x half>
|
|
|
|
%bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
|
|
|
|
; Volatile store of 123 to %obj0; shows up in the checks as the
; buffer_store_dword of 0x7b.
store volatile i32 123, i32 addrspace(5)* %bc
|
2019-06-06 06:37:50 +08:00
|
|
|
%gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
|
2019-03-09 04:30:51 +08:00
|
|
|
%load = load volatile i8, i8 addrspace(5)* %gep
|
|
|
|
; Zero-extend the byte to 16 bits, then reinterpret those bits as half.
%load.ext = zext i8 %load to i16
|
|
|
|
%bitcast = bitcast i16 %load.ext to half
|
|
|
|
; Overwrite only element 0; element 1 keeps the value from %reg.bc.
%build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
|
|
|
|
store <2 x half> %build1, <2 x half> addrspace(1)* undef
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-11-13 08:22:09 +08:00
|
|
|
; Attribute group referenced as `#0` by the function definitions in this file;
; it marks them nounwind.
attributes #0 = { nounwind }
|