AMDGPU: Refine user-sgpr-init16-bug

It only applies to gfx1100 and gfx1102, and for wave32.
This commit is contained in:
Matt Arsenault 2022-07-20 20:30:12 -04:00
parent 1f69503107
commit 5a5439cb73
4 changed files with 54 additions and 27 deletions

View File

@ -1272,11 +1272,14 @@ def FeatureISAVersion11_Common : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard]>;
// Features for GFX 11.0.0 and 11.0.1
def FeatureISAVersion11_0 : FeatureSet<
def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureUserSGPRInit16Bug])>;
def FeatureISAVersion11_0_1 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[])>;
def FeatureISAVersion11_0_2 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureUserSGPRInit16Bug])>;

View File

@ -249,11 +249,11 @@ def : ProcessorModel<"gfx1036", GFX10SpeedModel,
//===----------------------------------------------------------------------===//
def : ProcessorModel<"gfx1100", GFX11SpeedModel,
FeatureISAVersion11_0.Features
FeatureISAVersion11_0_0.Features
>;
def : ProcessorModel<"gfx1101", GFX11SpeedModel,
FeatureISAVersion11_0.Features
FeatureISAVersion11_0_1.Features
>;
def : ProcessorModel<"gfx1102", GFX11SpeedModel,

View File

@ -935,7 +935,7 @@ public:
}
bool hasUserSGPRInit16Bug() const {
return UserSGPRInit16Bug;
return UserSGPRInit16Bug && isWave32();
}
bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }

View File

@ -1,7 +1,16 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
; Does not apply to wave64
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
; Does not apply to gfx1101
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
; There aren't any stack objects, but we still enable the
; private_segment_wavefront_offset to get to 16, and the workgroup ID
@ -10,17 +19,19 @@
; private_segment_buffer + workgroup_id_x = 5, + 11 padding
; GCN-LABEL: {{^}}minimal_kernel_inputs:
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], s15
; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s0
; GCN-NEXT: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
; GCN: .amdhsa_kernel minimal_kernel_inputs
; GCN: .amdhsa_user_sgpr_count 15
; WORKAROUND: .amdhsa_user_sgpr_count 15
; NOWORKAROUND: .amdhsa_user_sgpr_count 0
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32 1
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
@ -28,7 +39,8 @@
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @minimal_kernel_inputs() {
%id = call i32 @llvm.amdgcn.workgroup.id.x()
store volatile i32 %id, i32 addrspace(1)* undef
@ -36,17 +48,19 @@ define amdgpu_kernel void @minimal_kernel_inputs() {
}
; GCN-LABEL: {{^}}minimal_kernel_inputs_with_stack:
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], s15
; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s0
; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
; GCN: .amdhsa_kernel minimal_kernel_inputs
; GCN: .amdhsa_user_sgpr_count 15
; WORKAROUND: .amdhsa_user_sgpr_count 15
; NOWORKAROUND: .amdhsa_user_sgpr_count 0
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32 1
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
@ -54,7 +68,8 @@ define amdgpu_kernel void @minimal_kernel_inputs() {
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
%alloca = alloca i32, addrspace(5)
%id = call i32 @llvm.amdgcn.workgroup.id.x()
@ -66,17 +81,19 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
; GCN-LABEL: {{^}}queue_ptr:
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], s15
; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s2
; GCN-NEXT: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V]], off
; GCN: .amdhsa_kernel queue_ptr
; GCN: .amdhsa_user_sgpr_count 15
; WORKAROUND: .amdhsa_user_sgpr_count 15
; NOWORKAROUND: .amdhsa_user_sgpr_count 2
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32 1
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
@ -84,7 +101,8 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() {
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 15
; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 2
define amdgpu_kernel void @queue_ptr() {
%queue.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
%load = load volatile i8, i8 addrspace(4)* %queue.ptr
@ -94,9 +112,13 @@ define amdgpu_kernel void @queue_ptr() {
}
; GCN-LABEL: {{^}}all_inputs:
; GCN: v_mov_b32_e32 [[V_X:v[0-9]+]], s13
; GCN: v_mov_b32_e32 [[V_Y:v[0-9]+]], s14
; GCN: v_mov_b32_e32 [[V_Z:v[0-9]+]], s15
; WORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s13
; WORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s14
; WORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s15
; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s8
; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s9
; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s10
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[2:3]
@ -111,13 +133,14 @@ define amdgpu_kernel void @queue_ptr() {
; GCN: global_store_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[DISPATCH_LO]]:[[DISPATCH_HI]]{{\]}}, off
; GCN: .amdhsa_kernel all_inputs
; GCN: .amdhsa_user_sgpr_count 13
; WORKAROUND: .amdhsa_user_sgpr_count 13
; NOWORKAROUND: .amdhsa_user_sgpr_count 8
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32 1
; GCN-NEXT: .amdhsa_wavefront_size32
; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
; GCN-NEXT: .amdhsa_enable_private_segment 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
@ -125,7 +148,8 @@ define amdgpu_kernel void @queue_ptr() {
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
; GCN: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13
; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13
; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 8
define amdgpu_kernel void @all_inputs() {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca