[AMDGPU] Make bfe patterns divergence-aware
This tends to increase code size, but more importantly it reduces VGPR usage and can avoid costly v_readfirstlane instructions when the result needs to end up in an SGPR.

Differential Revision: https://reviews.llvm.org/D88580
commit 16778b19f2
parent 0d5989bb24
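For orientation, a minimal LLVM IR sketch (hypothetical kernels, not from this commit) of when these patterns should and should not fire:

declare i32 @llvm.amdgcn.workitem.id.x()

; The mask depends on the lane id, so the 'and' is divergent and
; v_bfe_u32 is still the natural selection.
define amdgpu_kernel void @bfe_divergent(i32 addrspace(1)* %out, i32 %src) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 1, %tid
  %mask = add i32 %shl, -1                ; (1 << tid) - 1
  %bfe = and i32 %src, %mask              ; x & ((1 << y) - 1)
  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  store i32 %bfe, i32 addrspace(1)* %gep
  ret void
}

; Both inputs are kernel arguments, hence uniform; after this change
; the same idiom can stay on the scalar ALU instead of v_bfe_u32.
define amdgpu_kernel void @bfe_uniform(i32 addrspace(1)* %out, i32 %src, i32 %width) {
  %shl = shl i32 1, %width
  %mask = add i32 %shl, -1
  %bfe = and i32 %src, %mask
  store i32 %bfe, i32 addrspace(1)* %out
  ret void
}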
@@ -2327,36 +2327,40 @@ def IMMPopCount : SDNodeXForm<imm, [{
 }]>;
 
 def : AMDGPUPat <
-  (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask),
+  (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)),
+                         IMMZeroBasedBitfieldMask:$mask),
   (V_BFE_U32 $src, $rshift, (i32 (IMMPopCount $mask)))
 >;
 
 // x & ((1 << y) - 1)
 def : AMDGPUPat <
-  (and i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
+  (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
   (V_BFE_U32 $src, (i32 0), $width)
 >;
 
 // x & ~(-1 << y)
 def : AMDGPUPat <
-  (and i32:$src, (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
+  (DivergentBinFrag<and> i32:$src,
+                         (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
   (V_BFE_U32 $src, (i32 0), $width)
 >;
 
 // x & (-1 >> (bitwidth - y))
 def : AMDGPUPat <
-  (and i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
+  (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
   (V_BFE_U32 $src, (i32 0), $width)
 >;
 
 // x << (bitwidth - y) >> (bitwidth - y)
 def : AMDGPUPat <
-  (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+  (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)),
+                         (sub 32, i32:$width)),
   (V_BFE_U32 $src, (i32 0), $width)
 >;
 
 def : AMDGPUPat <
-  (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+  (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)),
+                         (sub 32, i32:$width)),
   (V_BFE_I32 $src, (i32 0), $width)
 >;
 
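DivergentBinFrag<and> gates each pattern on the selection DAG's divergence information, so only divergent nodes select to the VALU BFE instructions; uniform nodes fall through to scalar selection. As a hedged illustration (hypothetical test, not from this commit), the first rewritten pattern still covers a per-lane load with a constant shift and a zero-based mask:

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @srl_and_mask(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  %src = load i32, i32 addrspace(1)* %in.gep    ; divergent value
  %srl = lshr i32 %src, 8
  %bfe = and i32 %srl, 255                      ; 0xff is a zero-based bitfield mask
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
  store i32 %bfe, i32 addrspace(1)* %out.gep    ; expect something like v_bfe_u32 ..., 8, 8
  ret void
}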
@@ -232,10 +232,9 @@ for.end:
 ; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0
 
 ; SI-PROMOTE-VECT: s_load_dword [[IDX:s[0-9]+]]
-; SI-PROMOTE-VECT: s_mov_b32 [[SREG:s[0-9]+]], 0x10000
 ; SI-PROMOTE-VECT: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 4
-; SI-PROMOTE-VECT: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SCALED_IDX]]
-; SI-PROMOTE-VECT: v_bfe_u32 v{{[0-9]+}}, [[SREG]], [[VREG]], 16
+; SI-PROMOTE-VECT: s_lshr_b32 [[SREG:s[0-9]+]], 0x10000, [[SCALED_IDX]]
+; SI-PROMOTE-VECT: s_and_b32 s{{[0-9]+}}, [[SREG]], 0xffff
 define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16], addrspace(5)
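These updated checks show the payoff for a uniform %index: the 16-bit field is now extracted with s_lshr_b32/s_and_b32 rather than copying the scaled index into a VGPR to feed v_bfe_u32. A hypothetical standalone kernel with the same shape as the promoted code:

define amdgpu_kernel void @uniform_extract(i32 addrspace(1)* %out, i32 %index) {
  %scaled = shl i32 %index, 4             ; element index -> bit offset
  %shifted = lshr i32 65536, %scaled      ; 0x10000 stands in for the promoted <2 x i16>
  %field = and i32 %shifted, 65535        ; keep the selected 16-bit field
  store i32 %field, i32 addrspace(1)* %out
  ret void
}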
@@ -49,8 +49,9 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}s_ubfe_sub_i32:
 ; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
-; GCN: v_bfe_u32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
+; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
+; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]]
+; GCN: s_lshr_b32 s{{[0-9]+}}, [[TMP]], [[SUB]]
 define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
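The s_ubfe_sub_i32 body is truncated above; judging from the checks it continues with the shl/lshr-by-(32 - width) idiom. A sketch of that shape, not the verbatim test:

define amdgpu_kernel void @ubfe_sketch(i32 addrspace(1)* %out, i32 %src, i32 %width) {
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = lshr i32 %shl, %sub              ; x << (32 - y) >> (32 - y)
  store i32 %bfe, i32 addrspace(1)* %out
  ret void
}

With %src and %width uniform, the now divergence-gated pattern no longer fires, and selection keeps the whole computation scalar (s_sub_i32, s_lshl_b32, s_lshr_b32), matching the new checks.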
@@ -125,8 +126,9 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}s_sbfe_sub_i32:
 ; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
-; GCN: v_bfe_i32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
+; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
+; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]]
+; GCN: s_ashr_i32 s{{[0-9]+}}, [[TMP]], [[SUB]]
 define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
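The v_sbfe_sub_multi_use_shl_i32 function named in the hunk header covers the other guard in these patterns: the shl_oneuse/add_oneuse/xor_oneuse fragments block the fold when the intermediate shift has additional users, since it must be materialized anyway. A hypothetical case where the BFE fold should not fire:

define amdgpu_kernel void @sbfe_multi_use(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %src, i32 %width) {
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub               ; two uses: the ashr below and the store to out1
  %sra = ashr i32 %shl, %sub
  store volatile i32 %sra, i32 addrspace(1)* %out0
  store volatile i32 %shl, i32 addrspace(1)* %out1
  ret void
}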
@@ -1622,10 +1622,11 @@ define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0
 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_bfe_u32 v0, s2, v0, 3
-; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_lshr_b32 s0, s2, s3
+; SI-NEXT: s_and_b32 s0, s0, 7
+; SI-NEXT: v_mov_b32_e32 v0, s0
 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_endpgm
 ;
@@ -1636,8 +1637,9 @@ define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0
 ; VI-NEXT: s_mov_b32 s7, 0xf000
 ; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_bfe_u32 v0, s0, v0, 3
+; VI-NEXT: s_lshr_b32 s0, s0, s1
+; VI-NEXT: s_and_b32 s0, s0, 7
+; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT: s_endpgm
   %c = lshr i32 %a, %b
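For reference, the v_lshr_and source behind these SI and VI checks presumably looks like the sketch below (the %c line appears in the context above; the and-by-7 matches the new s_and_b32). It also illustrates the code-size trade-off from the commit message: v_mov + v_bfe_u32 becomes s_lshr_b32 + s_and_b32 + v_mov, one instruction more, but entirely off the vector ALU until the final copy for the store.

define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %c = lshr i32 %a, %b
  %d = and i32 %c, 7                      ; uniform (a >> b) & 7
  store i32 %d, i32 addrspace(1)* %out
  ret void
}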