forked from OSchip/llvm-project
[AMDGPU] Packed thread ids in function call ABI
Differential Revision: https://reviews.llvm.org/D63851 llvm-svn: 364619
This commit is contained in:
parent
3018d1845b
commit
07fd88d735
|
@ -9,6 +9,7 @@
|
|||
#include "AMDGPU.h"
|
||||
#include "AMDGPUArgumentUsageInfo.h"
|
||||
#include "SIRegisterInfo.h"
|
||||
#include "llvm/Support/NativeFormatting.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
@ -26,9 +27,16 @@ void ArgDescriptor::print(raw_ostream &OS,
|
|||
}
|
||||
|
||||
if (isRegister())
|
||||
OS << "Reg " << printReg(getRegister(), TRI) << '\n';
|
||||
OS << "Reg " << printReg(getRegister(), TRI);
|
||||
else
|
||||
OS << "Stack offset " << getStackOffset() << '\n';
|
||||
OS << "Stack offset " << getStackOffset();
|
||||
|
||||
if (isMasked()) {
|
||||
OS << " & ";
|
||||
llvm::write_hex(OS, Mask, llvm::HexPrintStyle::PrefixLower);
|
||||
}
|
||||
|
||||
OS << '\n';
|
||||
}
|
||||
|
||||
char AMDGPUArgumentUsageInfo::ID = 0;
|
||||
|
|
|
@ -32,18 +32,27 @@ private:
|
|||
unsigned StackOffset;
|
||||
};
|
||||
|
||||
// Bitmask to locate argument within the register.
|
||||
unsigned Mask;
|
||||
|
||||
bool IsStack : 1;
|
||||
bool IsSet : 1;
|
||||
|
||||
ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false)
|
||||
: Register(Val), IsStack(IsStack), IsSet(IsSet) {}
|
||||
public:
|
||||
static ArgDescriptor createRegister(unsigned Reg) {
|
||||
return ArgDescriptor(Reg, false, true);
|
||||
ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
|
||||
bool IsStack = false, bool IsSet = false)
|
||||
: Register(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
|
||||
|
||||
static ArgDescriptor createRegister(unsigned Reg, unsigned Mask = ~0u) {
|
||||
return ArgDescriptor(Reg, Mask, false, true);
|
||||
}
|
||||
|
||||
static ArgDescriptor createStack(unsigned Reg) {
|
||||
return ArgDescriptor(Reg, true, true);
|
||||
static ArgDescriptor createStack(unsigned Reg, unsigned Mask = ~0u) {
|
||||
return ArgDescriptor(Reg, Mask, true, true);
|
||||
}
|
||||
|
||||
/// Build a descriptor that reuses the location of \p Arg (its Register value
/// and its IsStack / IsSet flags) but carries \p Mask as the bitmask instead
/// of the mask stored in \p Arg.
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
|
||||
return ArgDescriptor(Arg.Register, Mask, Arg.IsStack, Arg.IsSet);
|
||||
}
|
||||
|
||||
bool isSet() const {
|
||||
|
@ -68,6 +77,14 @@ public:
|
|||
return StackOffset;
|
||||
}
|
||||
|
||||
/// \returns the bitmask stored in this descriptor, i.e. the bits that locate
/// the argument within its register.
unsigned getMask() const {
|
||||
return Mask;
|
||||
}
|
||||
|
||||
/// \returns true if the stored mask is anything other than all ones (~0u),
/// meaning the argument occupies only part of the 32-bit value.
bool isMasked() const {
|
||||
return Mask != ~0u;
|
||||
}
|
||||
|
||||
void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const;
|
||||
};
|
||||
|
||||
|
|
|
@ -4233,9 +4233,19 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
|
|||
const ArgDescriptor &Arg) const {
|
||||
assert(Arg && "Attempting to load missing argument");
|
||||
|
||||
if (Arg.isRegister())
|
||||
return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
|
||||
return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
|
||||
SDValue V = Arg.isRegister() ?
|
||||
CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
|
||||
loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
|
||||
|
||||
if (!Arg.isMasked())
|
||||
return V;
|
||||
|
||||
unsigned Mask = Arg.getMask();
|
||||
unsigned Shift = countTrailingZeros<unsigned>(Mask);
|
||||
V = DAG.getNode(ISD::SRL, SL, VT, V,
|
||||
DAG.getShiftAmountConstant(Shift, VT, SL));
|
||||
return DAG.getNode(ISD::AND, SL, VT, V,
|
||||
DAG.getConstant(Mask >> Shift, SL, VT));
|
||||
}
|
||||
|
||||
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
|
||||
|
|
|
@ -1585,7 +1585,13 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
|
|||
|
||||
// Try to allocate a VGPR at the end of the argument list or, if no argument
|
||||
// VGPRs are left, allocate a stack slot.
|
||||
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
|
||||
// If \p Mask is given it indicates the bitfield position in the register.
|
||||
// If \p Arg is given use it with the new \p Mask instead of allocating a new one.
|
||||
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
|
||||
ArgDescriptor Arg = ArgDescriptor()) {
|
||||
if (Arg.isSet())
|
||||
return ArgDescriptor::createArg(Arg, Mask);
|
||||
|
||||
ArrayRef<MCPhysReg> ArgVGPRs
|
||||
= makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
|
||||
unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
|
||||
|
@ -1593,7 +1599,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
|
|||
// Spill to stack required.
|
||||
int64_t Offset = CCInfo.AllocateStack(4, 4);
|
||||
|
||||
return ArgDescriptor::createStack(Offset);
|
||||
return ArgDescriptor::createStack(Offset, Mask);
|
||||
}
|
||||
|
||||
unsigned Reg = ArgVGPRs[RegIdx];
|
||||
|
@ -1602,7 +1608,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
|
|||
|
||||
MachineFunction &MF = CCInfo.getMachineFunction();
|
||||
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
|
||||
return ArgDescriptor::createRegister(Reg);
|
||||
return ArgDescriptor::createRegister(Reg, Mask);
|
||||
}
|
||||
|
||||
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
|
||||
|
@ -1634,14 +1640,21 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
|
|||
MachineFunction &MF,
|
||||
const SIRegisterInfo &TRI,
|
||||
SIMachineFunctionInfo &Info) {
|
||||
if (Info.hasWorkItemIDX())
|
||||
Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
|
||||
const unsigned Mask = 0x3ff;
|
||||
ArgDescriptor Arg;
|
||||
|
||||
if (Info.hasWorkItemIDY())
|
||||
Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
|
||||
if (Info.hasWorkItemIDX()) {
|
||||
Arg = allocateVGPR32Input(CCInfo, Mask);
|
||||
Info.setWorkItemIDX(Arg);
|
||||
}
|
||||
|
||||
if (Info.hasWorkItemIDY()) {
|
||||
Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
|
||||
Info.setWorkItemIDY(Arg);
|
||||
}
|
||||
|
||||
if (Info.hasWorkItemIDZ())
|
||||
Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
|
||||
Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
|
||||
}
|
||||
|
||||
static void allocateSpecialInputSGPRs(CCState &CCInfo,
|
||||
|
@ -2387,9 +2400,6 @@ void SITargetLowering::passSpecialInputs(
|
|||
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
|
||||
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
|
||||
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
|
||||
AMDGPUFunctionArgInfo::WORKITEM_ID_X,
|
||||
AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
|
||||
AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
|
||||
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
|
||||
};
|
||||
|
||||
|
@ -2429,6 +2439,71 @@ void SITargetLowering::passSpecialInputs(
|
|||
MemOpChains.push_back(ArgStore);
|
||||
}
|
||||
}
|
||||
|
||||
// Pack workitem IDs into a single register or pass it as is if already
|
||||
// packed.
|
||||
const ArgDescriptor *OutgoingArg;
|
||||
const TargetRegisterClass *ArgRC;
|
||||
|
||||
std::tie(OutgoingArg, ArgRC) =
|
||||
CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
|
||||
if (!OutgoingArg)
|
||||
std::tie(OutgoingArg, ArgRC) =
|
||||
CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
|
||||
if (!OutgoingArg)
|
||||
std::tie(OutgoingArg, ArgRC) =
|
||||
CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
|
||||
if (!OutgoingArg)
|
||||
return;
|
||||
|
||||
const ArgDescriptor *IncomingArgX
|
||||
= CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
|
||||
const ArgDescriptor *IncomingArgY
|
||||
= CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
|
||||
const ArgDescriptor *IncomingArgZ
|
||||
= CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
|
||||
|
||||
SDValue InputReg;
|
||||
SDLoc SL;
|
||||
|
||||
// If incoming ids are not packed we need to pack them.
|
||||
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
|
||||
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
|
||||
|
||||
if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
|
||||
SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
|
||||
Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
|
||||
DAG.getShiftAmountConstant(10, MVT::i32, SL));
|
||||
InputReg = InputReg.getNode() ?
|
||||
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
|
||||
}
|
||||
|
||||
if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
|
||||
SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
|
||||
Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
|
||||
DAG.getShiftAmountConstant(20, MVT::i32, SL));
|
||||
InputReg = InputReg.getNode() ?
|
||||
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
|
||||
}
|
||||
|
||||
if (!InputReg.getNode()) {
|
||||
// Workitem ids are already packed; any of the present incoming arguments
|
||||
// will carry all required fields.
|
||||
ArgDescriptor IncomingArg = ArgDescriptor::createArg(
|
||||
IncomingArgX ? *IncomingArgX :
|
||||
IncomingArgY ? *IncomingArgY :
|
||||
*IncomingArgZ, ~0u);
|
||||
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
|
||||
}
|
||||
|
||||
if (OutgoingArg->isRegister()) {
|
||||
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
|
||||
} else {
|
||||
unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
|
||||
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
|
||||
SpecialArgOffset);
|
||||
MemOpChains.push_back(ArgStore);
|
||||
}
|
||||
}
|
||||
|
||||
static bool canGuaranteeTCO(CallingConv::ID CC) {
|
||||
|
|
|
@ -65,6 +65,7 @@ define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define hidden i32 @use_workitem_id_x(i32 %arg0) #0 {
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
|
||||
; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @use_workitem_id_x() #1 {
|
||||
|
@ -13,7 +14,8 @@ define void @use_workitem_id_x() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_y:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
|
||||
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @use_workitem_id_y() #1 {
|
||||
|
@ -24,7 +26,8 @@ define void @use_workitem_id_y() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_z:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
|
||||
; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @use_workitem_id_z() #1 {
|
||||
|
@ -35,8 +38,10 @@ define void @use_workitem_id_z() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_xy:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @use_workitem_id_xy() #1 {
|
||||
|
@ -49,9 +54,12 @@ define void @use_workitem_id_xy() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_xyz:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v2
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @use_workitem_id_xyz() #1 {
|
||||
|
@ -66,8 +74,10 @@ define void @use_workitem_id_xyz() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_xz:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @use_workitem_id_xz() #1 {
|
||||
|
@ -80,8 +90,10 @@ define void @use_workitem_id_xz() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}use_workitem_id_yz:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @use_workitem_id_yz() #1 {
|
||||
|
@ -108,7 +120,7 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
|
|||
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN: v_mov_b32_e32 v0, v1
|
||||
; GCN: v_lshlrev_b32_e32 v0, 10, v1
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN: s_swappc_b64
|
||||
|
@ -122,15 +134,72 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
|
|||
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v2
|
||||
; GCN: v_mov_b32_e32 v0, v2
|
||||
; GCN: v_lshlrev_b32_e32 v0, 20, v2
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v2
|
||||
; GCN-NOT: v1
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
|
||||
call void @use_workitem_id_z()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy:
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
|
||||
; GCN: v_or_b32_e32 v0, v0, [[IDY]]
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
|
||||
call void @use_workitem_id_xy()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz:
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v2
|
||||
; GCN: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 v0, v0, [[IDZ]]
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v2
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
|
||||
call void @use_workitem_id_xz()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz:
|
||||
; GCN-NOT: v1
|
||||
; GCN-NOT: v2
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
|
||||
; GCN: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
|
||||
; GCN-NOT: v1
|
||||
; GCN-NOT: v2
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
|
||||
call void @use_workitem_id_yz()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz:
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN-NOT: v2
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
|
||||
; GCN-DAG: v_or_b32_e32 v0, v0, [[IDY]]
|
||||
; GCN-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN-NOT: v2
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 {
|
||||
call void @use_workitem_id_xyz()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x:
|
||||
; GCN-NOT: v0
|
||||
; GCN: s_swappc_b64
|
||||
|
@ -160,8 +229,9 @@ define void @func_indirect_use_workitem_id_z() #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
|
||||
define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
|
||||
%val = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
store volatile i32 %arg0, i32 addrspace(1)* undef
|
||||
|
@ -171,8 +241,9 @@ define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
|
||||
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
|
||||
%val = call i32 @llvm.amdgcn.workitem.id.y()
|
||||
store volatile i32 %arg0, i32 addrspace(1)* undef
|
||||
|
@ -182,8 +253,9 @@ define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
|
|||
|
||||
; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
|
||||
; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
|
||||
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
|
||||
%val = call i32 @llvm.amdgcn.workitem.id.z()
|
||||
store volatile i32 %arg0, i32 addrspace(1)* undef
|
||||
|
@ -207,6 +279,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
|
|||
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
|
||||
; GCN: enable_vgpr_workitem_id = 1
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-NOT: v1
|
||||
; GCN: v_mov_b32_e32 v0, 0x22b
|
||||
; GCN-NOT: v1
|
||||
|
@ -221,7 +294,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
|
|||
; GCN: enable_vgpr_workitem_id = 2
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
|
||||
; GCN-DAG: v_mov_b32_e32 v1, v2
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v1, 20, v2
|
||||
; GCN: s_swappc_b64
|
||||
; GCN-NOT: v0
|
||||
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
|
||||
|
@ -232,6 +305,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
|
|||
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; GCN: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
|
||||
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
|
@ -357,6 +431,7 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
|
|||
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
|
||||
; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}}
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
|
||||
|
@ -469,15 +544,18 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
|
||||
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; GCN: v_and_b32_e32 v32, 0x3ff, v32
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}}
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; GCN: v_bfe_u32 v32, v32, 10, 10
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8{{$}}
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
|
||||
; GCN: v_bfe_u32 v32, v32, 20, 10
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
|
||||
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
|
||||
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define void @too_many_args_use_workitem_id_xyz(
|
||||
|
@ -531,19 +609,19 @@ define void @too_many_args_use_workitem_id_xyz(
|
|||
ret void
|
||||
}
|
||||
|
||||
; frame[0] = ID X
|
||||
; frame[1] = ID Y
|
||||
; frame[2] = ID Z
|
||||
; frame[0] = ID { Z, Y, X }
|
||||
|
||||
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
|
||||
; GCN: enable_vgpr_workitem_id = 2
|
||||
|
||||
; GCN: s_mov_b32 s33, s7
|
||||
; GCN: s_mov_b32 s32, s33
|
||||
; GCN-DAG: s_mov_b32 s33, s7
|
||||
; GCN-DAG: s_mov_b32 s32, s33
|
||||
|
||||
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32{{$}}
|
||||
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
|
||||
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-DAG: v_or_b32_e32 v0, v0, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-DAG: v_or_b32_e32 v0, v0, v2
|
||||
; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
|
||||
call void @too_many_args_use_workitem_id_xyz(
|
||||
|
@ -560,19 +638,19 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
|
|||
|
||||
; workitem ID X in register, yz on stack
|
||||
; v31 = workitem ID X
|
||||
; frame[0] = workitem Y
|
||||
; frame[1] = workitem Z
|
||||
; frame[0] = workitem { Z, Y, X }
|
||||
|
||||
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz:
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
|
||||
; GCN: buffer_load_dword v31, off, s[0:3], s32{{$}}
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
|
||||
; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}}
|
||||
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
|
||||
; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
|
||||
; GCN-DAG: flat_store_dword v[0:1], [[IDX]]
|
||||
; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]]
|
||||
; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
|
||||
; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]]
|
||||
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
; GCN: ScratchSize: 12
|
||||
; GCN: ScratchSize: 8
|
||||
define void @too_many_args_use_workitem_id_x_stack_yz(
|
||||
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
|
||||
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
|
||||
|
@ -623,18 +701,18 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
|
|||
ret void
|
||||
}
|
||||
|
||||
; frame[0] = ID Y
|
||||
; frame[1] = ID Z
|
||||
|
||||
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
|
||||
; GCN: enable_vgpr_workitem_id = 2
|
||||
|
||||
; GCN: s_mov_b32 s33, s7
|
||||
; GCN: s_mov_b32 s32, s33
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 v31, v0
|
||||
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32{{$}}
|
||||
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:4
|
||||
; GCN-NOT: v0
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1
|
||||
; GCN-DAG: v_or_b32_e32 v0, v0, v1
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2
|
||||
; GCN-DAG: v_or_b32_e32 v31, v0, v2
|
||||
|
||||
; GCN: s_mov_b32 s32, s33
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
|
||||
call void @too_many_args_use_workitem_id_x_stack_yz(
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
; RUN: llc -march=amdgcn < %s | FileCheck %s
|
||||
; RUN: llc -O0 -march=amdgcn < %s | FileCheck %s
|
||||
; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,O2 %s
|
||||
; RUN: llc -O0 -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s
|
||||
|
||||
; CHECK-NOT: and_b32
|
||||
; GCN-LABEL: {{^}}zext_grp_size_128:
|
||||
; GCN-NOT: and_b32
|
||||
|
||||
; OPT-LABEL: @zext_grp_size_128
|
||||
; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
|
||||
|
@ -24,6 +25,9 @@ bb:
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}zext_grp_size_32x4x1:
|
||||
; GCN-NOT: and_b32
|
||||
|
||||
; OPT-LABEL: @zext_grp_size_32x4x1
|
||||
; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
|
||||
; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !3
|
||||
|
@ -44,6 +48,9 @@ bb:
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}zext_grp_size_1x1x1:
|
||||
; GCN-NOT: and_b32
|
||||
|
||||
; When EarlyCSE is not run this call produces a range max with 0 active bits,
|
||||
; which is a special case as an AssertZext from width 0 is invalid.
|
||||
; OPT-LABEL: @zext_grp_size_1x1x1
|
||||
|
@ -55,6 +62,9 @@ define amdgpu_kernel void @zext_grp_size_1x1x1(i32 addrspace(1)* nocapture %arg)
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}zext_grp_size_512:
|
||||
; GCN-NOT: and_b32
|
||||
|
||||
; OPT-LABEL: @zext_grp_size_512
|
||||
; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !6
|
||||
; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !6
|
||||
|
@ -75,6 +85,11 @@ bb:
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}func_test_workitem_id_x_known_max_range:
|
||||
; O2-NOT: and_b32
|
||||
; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff,
|
||||
; O2-NOT: and_b32
|
||||
|
||||
; OPT-LABEL: @func_test_workitem_id_x_known_max_range(
|
||||
; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
|
||||
define void @func_test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
|
||||
|
@ -85,6 +100,11 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}func_test_workitem_id_x_default_range:
|
||||
; O2-NOT: and_b32
|
||||
; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff,
|
||||
; O2-NOT: and_b32
|
||||
|
||||
; OPT-LABEL: @func_test_workitem_id_x_default_range(
|
||||
; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !7
|
||||
define void @func_test_workitem_id_x_default_range(i32 addrspace(1)* nocapture %out) #4 {
|
||||
|
|
Loading…
Reference in New Issue