forked from OSchip/llvm-project
AMDGPU: Select workitem ID intrinsics to 0 with req_work_group_size
Shockingly we weren't doing this already. We should probably have this be done earlier in the IR too, but it's still helpful to have the lowering guarantee it so that we can modify the ABI implicit inputs based on it.
This commit is contained in:
parent
a6f49423c1
commit
59994c25f9
|
@ -2888,6 +2888,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
|
|||
|
||||
Register AndMaskSrc = LiveIn;
|
||||
|
||||
// TODO: Avoid clearing the high bits if we know workitem id y/z are always
|
||||
// 0.
|
||||
if (Shift != 0) {
|
||||
auto ShiftAmt = B.buildConstant(S32, Shift);
|
||||
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
|
||||
|
@ -4966,6 +4968,12 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
|
||||
B.buildConstant(MI.getOperand(0).getReg(), C);
|
||||
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
|
||||
MachineInstr &MI) const {
|
||||
MachineIRBuilder &B = Helper.MIRBuilder;
|
||||
|
@ -5069,12 +5077,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
|
|||
case Intrinsic::amdgcn_implicitarg_ptr:
|
||||
return legalizeImplicitArgPtr(MI, MRI, B);
|
||||
case Intrinsic::amdgcn_workitem_id_x:
|
||||
if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
|
||||
return replaceWithConstant(B, MI, 0);
|
||||
return legalizePreloadedArgIntrin(MI, MRI, B,
|
||||
AMDGPUFunctionArgInfo::WORKITEM_ID_X);
|
||||
case Intrinsic::amdgcn_workitem_id_y:
|
||||
if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
|
||||
return replaceWithConstant(B, MI, 0);
|
||||
|
||||
return legalizePreloadedArgIntrin(MI, MRI, B,
|
||||
AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
|
||||
case Intrinsic::amdgcn_workitem_id_z:
|
||||
if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
|
||||
return replaceWithConstant(B, MI, 0);
|
||||
|
||||
return legalizePreloadedArgIntrin(MI, MRI, B,
|
||||
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
|
||||
case Intrinsic::amdgcn_workgroup_id_x:
|
||||
|
|
|
@ -6756,14 +6756,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
|||
return getPreloadedValue(DAG, *MFI, VT,
|
||||
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
|
||||
case Intrinsic::amdgcn_workitem_id_x:
|
||||
if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0)
|
||||
return DAG.getConstant(0, DL, MVT::i32);
|
||||
|
||||
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
|
||||
SDLoc(DAG.getEntryNode()),
|
||||
MFI->getArgInfo().WorkItemIDX);
|
||||
case Intrinsic::amdgcn_workitem_id_y:
|
||||
if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0)
|
||||
return DAG.getConstant(0, DL, MVT::i32);
|
||||
|
||||
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
|
||||
SDLoc(DAG.getEntryNode()),
|
||||
MFI->getArgInfo().WorkItemIDY);
|
||||
case Intrinsic::amdgcn_workitem_id_z:
|
||||
if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0)
|
||||
return DAG.getConstant(0, DL, MVT::i32);
|
||||
|
||||
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
|
||||
SDLoc(DAG.getEntryNode()),
|
||||
MFI->getArgInfo().WorkItemIDZ);
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2,UNPACKED %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,HSA,CO-V2,UNPACKED %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=hawaii -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mattr=+flat-for-global -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
|
||||
; RUN: llc -global-isel -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
|
@ -125,5 +125,75 @@ define void @test_workitem_id_z_func(i32 addrspace(1)* %out) #1 {
|
|||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Should be able to avoid enabling in kernel inputs
|
||||
; FIXME: Packed tid should avoid the and
|
||||
; ALL-LABEL: {{^}}test_reqd_workgroup_size_x_only:
|
||||
; CO-V2: enable_vgpr_workitem_id = 2
|
||||
|
||||
; ALL-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; UNPACKED-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
|
||||
; PACKED: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ff, v0
|
||||
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
|
||||
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
define amdgpu_kernel void @test_reqd_workgroup_size_x_only(i32* %out) !reqd_work_group_size !0 {
|
||||
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
store volatile i32 %id.x, i32* %out
|
||||
store volatile i32 %id.y, i32* %out
|
||||
store volatile i32 %id.z, i32* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: {{^}}test_reqd_workgroup_size_y_only:
|
||||
; CO-V2: enable_vgpr_workitem_id = 2
|
||||
|
||||
; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
|
||||
; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
|
||||
; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 10
|
||||
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
|
||||
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
define amdgpu_kernel void @test_reqd_workgroup_size_y_only(i32* %out) !reqd_work_group_size !1 {
|
||||
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
store volatile i32 %id.x, i32* %out
|
||||
store volatile i32 %id.y, i32* %out
|
||||
store volatile i32 %id.z, i32* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: {{^}}test_reqd_workgroup_size_z_only:
|
||||
; CO-V2: enable_vgpr_workitem_id = 2
|
||||
|
||||
; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
|
||||
; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2
|
||||
|
||||
; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20
|
||||
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
|
||||
define amdgpu_kernel void @test_reqd_workgroup_size_z_only(i32* %out) !reqd_work_group_size !2 {
|
||||
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
store volatile i32 %id.x, i32* %out
|
||||
store volatile i32 %id.y, i32* %out
|
||||
store volatile i32 %id.z, i32* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind }
|
||||
|
||||
!0 = !{i32 64, i32 1, i32 1}
|
||||
!1 = !{i32 1, i32 64, i32 1}
|
||||
!2 = !{i32 1, i32 1, i32 64}
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s
|
||||
; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
|
||||
; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2,UNPACKED %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,CO-V2,UNPACKED %s
|
||||
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA,UNPACKED %s
|
||||
; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
|
||||
; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,UNPACKED %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
|
@ -63,5 +63,75 @@ define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
|
|||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Should be able to avoid enabling in kernel inputs
|
||||
; FIXME: Packed tid should avoid the and
|
||||
; ALL-LABEL: {{^}}test_reqd_workgroup_size_x_only:
|
||||
; CO-V2: enable_vgpr_workitem_id = 2
|
||||
|
||||
; ALL-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; UNPACKED-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
|
||||
; PACKED: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x3ff, v0
|
||||
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
|
||||
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
define amdgpu_kernel void @test_reqd_workgroup_size_x_only(i32* %out) !reqd_work_group_size !0 {
|
||||
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
store volatile i32 %id.x, i32* %out
|
||||
store volatile i32 %id.y, i32* %out
|
||||
store volatile i32 %id.z, i32* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: {{^}}test_reqd_workgroup_size_y_only:
|
||||
; CO-V2: enable_vgpr_workitem_id = 2
|
||||
|
||||
; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
|
||||
; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
|
||||
; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 10
|
||||
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
|
||||
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
define amdgpu_kernel void @test_reqd_workgroup_size_y_only(i32* %out) !reqd_work_group_size !1 {
|
||||
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
store volatile i32 %id.x, i32* %out
|
||||
store volatile i32 %id.y, i32* %out
|
||||
store volatile i32 %id.z, i32* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: {{^}}test_reqd_workgroup_size_z_only:
|
||||
; CO-V2: enable_vgpr_workitem_id = 2
|
||||
|
||||
; ALL: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
; ALL: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]]
|
||||
|
||||
; UNPACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v2
|
||||
|
||||
; PACKED: v_bfe_u32 [[MASKED:v[0-9]+]], v0, 10, 20
|
||||
; PACKED: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
|
||||
define amdgpu_kernel void @test_reqd_workgroup_size_z_only(i32* %out) !reqd_work_group_size !2 {
|
||||
%id.x = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%id.y = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
%id.z = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
store volatile i32 %id.x, i32* %out
|
||||
store volatile i32 %id.y, i32* %out
|
||||
store volatile i32 %id.z, i32* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind }
|
||||
|
||||
!0 = !{i32 64, i32 1, i32 1}
|
||||
!1 = !{i32 1, i32 64, i32 1}
|
||||
!2 = !{i32 1, i32 1, i32 64}
|
||||
|
|
Loading…
Reference in New Issue