From 16778b19f2c2756a9e0dd04636fb2c269f684917 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 25 Sep 2020 16:07:27 +0100 Subject: [PATCH] [AMDGPU] Make bfe patterns divergence-aware This tends to increase code size but more importantly it reduces vgpr usage, and could avoid costly readfirstlanes if the result needs to be in an sgpr. Differential Revision: https://reviews.llvm.org/D88580 --- llvm/lib/Target/AMDGPU/SIInstructions.td | 16 ++++++++++------ .../test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 5 ++--- llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 10 ++++++---- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 12 +++++++----- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d55cf0fc65ec..7cffe615f3b3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2327,36 +2327,40 @@ def IMMPopCount : SDNodeXForm; def : AMDGPUPat < - (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask), + (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)), + IMMZeroBasedBitfieldMask:$mask), (V_BFE_U32 $src, $rshift, (i32 (IMMPopCount $mask))) >; // x & ((1 << y) - 1) def : AMDGPUPat < - (and i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)), + (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)), (V_BFE_U32 $src, (i32 0), $width) >; // x & ~(-1 << y) def : AMDGPUPat < - (and i32:$src, (xor_oneuse (shl_oneuse -1, i32:$width), -1)), + (DivergentBinFrag<and> i32:$src, + (xor_oneuse (shl_oneuse -1, i32:$width), -1)), (V_BFE_U32 $src, (i32 0), $width) >; // x & (-1 >> (bitwidth - y)) def : AMDGPUPat < - (and i32:$src, (srl_oneuse -1, (sub 32, i32:$width))), + (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))), (V_BFE_U32 $src, (i32 0), $width) >; // x << (bitwidth - y) >> (bitwidth - y) def : AMDGPUPat < - (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), + 
(DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)), + (sub 32, i32:$width)), (V_BFE_U32 $src, (i32 0), $width) >; def : AMDGPUPat < - (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), + (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)), + (sub 32, i32:$width)), (V_BFE_I32 $src, (i32 0), $width) >; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 072a76780447..cbfd6415979d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -232,10 +232,9 @@ for.end: ; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 ; SI-PROMOTE-VECT: s_load_dword [[IDX:s[0-9]+]] -; SI-PROMOTE-VECT: s_mov_b32 [[SREG:s[0-9]+]], 0x10000 ; SI-PROMOTE-VECT: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 4 -; SI-PROMOTE-VECT: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SCALED_IDX]] -; SI-PROMOTE-VECT: v_bfe_u32 v{{[0-9]+}}, [[SREG]], [[VREG]], 16 +; SI-PROMOTE-VECT: s_lshr_b32 [[SREG:s[0-9]+]], 0x10000, [[SCALED_IDX]] +; SI-PROMOTE-VECT: s_and_b32 s{{[0-9]+}}, [[SREG]], 0xffff define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i16], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index 69237cfabb85..ce6340fb3953 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -49,8 +49,9 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}s_ubfe_sub_i32: ; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]] -; GCN: v_bfe_u32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]] +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] +; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_lshr_b32 s{{[0-9]+}}, [[TMP]], [[SUB]] 
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -125,8 +126,9 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}s_sbfe_sub_i32: ; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}} -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]] -; GCN: v_bfe_i32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]] +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] +; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_ashr_i32 s{{[0-9]+}}, [[TMP]], [[SUB]] define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 5ba8edb2c1c0..805ca6f5d8af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1622,10 +1622,11 @@ define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_bfe_u32 v0, s2, v0, 3 +; SI-NEXT: s_lshr_b32 s0, s2, s3 +; SI-NEXT: s_and_b32 s0, s0, 7 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -1636,8 +1637,9 @@ define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_bfe_u32 v0, s0, v0, 3 +; VI-NEXT: s_lshr_b32 s0, s0, 
s1 +; VI-NEXT: s_and_b32 s0, s0, 7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %c = lshr i32 %a, %b