AMDGPU: Pass special input registers to functions
llvm-svn: 309998
This commit is contained in:
parent 52854dcd34
commit 8623e8d864
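At a high level, the patch replaces the register-only SIRegisterInfo::getPreloadedValue lookup with ArgDescriptor-based queries on SIMachineFunctionInfo, so each special input (dispatch pointer, workgroup IDs, work-item IDs, and so on) can be described as living either in a physical register or in a stack slot. A minimal sketch of the lookup pattern the hunks below migrate call sites to; useRegister and useStackSlot are placeholder names, not functions from this patch:

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    const ArgDescriptor *Arg;
    const TargetRegisterClass *RC;
    std::tie(Arg, RC) =
        MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

    if (Arg->isRegister())
      useRegister(Arg->getRegister());     // value arrives preloaded in a register
    else
      useStackSlot(Arg->getStackOffset()); // value was passed on the stack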
lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -41,7 +41,7 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
                                                unsigned Offset) const {
   MachineFunction &MF = MIRBuilder.getMF();
-  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = *MF.getFunction();
   const DataLayout &DL = F.getParent()->getDataLayout();
@@ -49,7 +49,7 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
   LLT PtrType = getLLTForType(*PtrTy, DL);
   unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
   unsigned KernArgSegmentPtr =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
   unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

   unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3582,6 +3582,49 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
 }

+SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
+                                                  EVT VT,
+                                                  const SDLoc &SL,
+                                                  int64_t Offset) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
+  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
+  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
+
+  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
+                     MachineMemOperand::MODereferenceable |
+                     MachineMemOperand::MOInvariant);
+}
+
+SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
+                                                   const SDLoc &SL,
+                                                   SDValue Chain,
+                                                   SDValue StackPtr,
+                                                   SDValue ArgVal,
+                                                   int64_t Offset) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
+  SDValue PtrOffset = DAG.getConstant(Offset, SL, MVT::i32);
+  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, StackPtr, PtrOffset);
+
+  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
+                               MachineMemOperand::MODereferenceable);
+  return Store;
+}
+
+SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
+                                             const TargetRegisterClass *RC,
+                                             EVT VT, const SDLoc &SL,
+                                             const ArgDescriptor &Arg) const {
+  assert(Arg && "Attempting to load missing argument");
+
+  if (Arg.isRegister())
+    return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
+  return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+}
+
 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -24,7 +24,7 @@ namespace llvm {
 class AMDGPUMachineFunction;
 class AMDGPUSubtarget;
-class MachineRegisterInfo;
+struct ArgDescriptor;

 class AMDGPUTargetLowering : public TargetLowering {
 private:
@@ -237,6 +237,25 @@ public:
     return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
   }

+  /// Similar to CreateLiveInRegister, except the value may be loaded from a
+  /// stack slot rather than passed in a register.
+  SDValue loadStackInputValue(SelectionDAG &DAG,
+                              EVT VT,
+                              const SDLoc &SL,
+                              int64_t Offset) const;
+
+  SDValue storeStackInputValue(SelectionDAG &DAG,
+                               const SDLoc &SL,
+                               SDValue Chain,
+                               SDValue StackPtr,
+                               SDValue ArgVal,
+                               int64_t Offset) const;
+
+  SDValue loadInputValue(SelectionDAG &DAG,
+                         const TargetRegisterClass *RC,
+                         EVT VT, const SDLoc &SL,
+                         const ArgDescriptor &Arg) const;
+
   enum ImplicitParameter {
     FIRST_IMPLICIT,
     GRID_DIM = FIRST_IMPLICIT,
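The declarations above all traffic in ArgDescriptor, the pivot of this patch: a single value naming either a physical register or a byte offset in the argument stack area. A self-contained toy model of the idea (illustrative only; the real class lives in AMDGPUArgumentUsageInfo.h and differs in detail):

    #include <cassert>
    #include <cstdint>

    // Toy model: one value that is either a register number or a stack offset.
    struct ArgDescriptorModel {
      uint32_t Value = 0;   // register number, or stack offset in bytes
      bool IsStack = false;

      static ArgDescriptorModel createRegister(uint32_t Reg) {
        return {Reg, false};
      }
      static ArgDescriptorModel createStack(uint32_t Offset) {
        return {Offset, true};
      }

      bool isRegister() const { return !IsStack; }
      uint32_t getRegister() const { assert(isRegister()); return Value; }
      uint32_t getStackOffset() const { assert(IsStack); return Value; }
    };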
lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -38,6 +38,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
                                           MachineBasicBlock &MBB) const {
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

   // We don't need this if we only have spills since there is no user facing
   // scratch.
@@ -55,7 +56,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
   MachineBasicBlock::iterator I = MBB.begin();

   unsigned FlatScratchInitReg
-    = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+    = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

   MachineRegisterInfo &MRI = MF.getRegInfo();
   MRI.addLiveIn(FlatScratchInitReg);
@@ -64,7 +65,6 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
   unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
   unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

   // Do a 64-bit pointer add.
@@ -283,13 +283,13 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   }

   // We need to insert initialization of the scratch resource descriptor.
-  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
-    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+  unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+    AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
   if (ST.isAmdCodeObjectV2(MF)) {
-    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
-      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
+      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
   }

   bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
lib/Target/AMDGPU/SIISelLowering.cpp
@@ -45,6 +45,7 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
@@ -895,14 +896,19 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                    uint64_t Offset) const {
   const DataLayout &DL = DAG.getDataLayout();
   MachineFunction &MF = DAG.getMachineFunction();
-  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
-  unsigned InputPtrReg = TRI->getPreloadedValue(MF,
-                                                SIRegisterInfo::KERNARG_SEGMENT_PTR);
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  const ArgDescriptor *InputPtrReg;
+  const TargetRegisterClass *RC;
+
+  std::tie(InputPtrReg, RC)
+    = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
   MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
-                                       MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
+    MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

   return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                      DAG.getConstant(Offset, SL, PtrVT));
 }
@@ -1005,6 +1011,17 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
   return ArgValue;
 }

+SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
+  const SIMachineFunctionInfo &MFI,
+  EVT VT,
+  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
+  const ArgDescriptor *Reg;
+  const TargetRegisterClass *RC;
+
+  std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
+  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
+}
+
 static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                                    CallingConv::ID CallConv,
                                    ArrayRef<ISD::InputArg> Ins,
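Note the asymmetry between this wrapper and loadInputValue added earlier in the patch: getPreloadedValue calls Reg->getRegister() unconditionally, so it is only for inputs guaranteed to be in registers, while loadInputValue also handles the stack case. Condensed from this diff for comparison:

    // Register-only (SITargetLowering::getPreloadedValue above):
    return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);

    // Register-or-stack (AMDGPUTargetLowering::loadInputValue):
    if (Arg.isRegister())
      return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
    return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());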
@@ -1055,27 +1072,129 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
 }

 // Allocate special inputs passed in VGPRs.
+static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+                                           MachineFunction &MF,
+                                           const SIRegisterInfo &TRI,
+                                           SIMachineFunctionInfo &Info) {
+  if (Info.hasWorkItemIDX()) {
+    unsigned Reg = AMDGPU::VGPR0;
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    assert(Reg == AMDGPU::VGPR0);
+
+    CCInfo.AllocateReg(Reg);
+    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
+  }
+
+  if (Info.hasWorkItemIDY()) {
+    unsigned Reg = AMDGPU::VGPR1;
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
+    assert(Reg == AMDGPU::VGPR1);
+    CCInfo.AllocateReg(Reg);
+    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+  }
+
+  if (Info.hasWorkItemIDZ()) {
+    unsigned Reg = AMDGPU::VGPR2;
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+
+    assert(Reg == AMDGPU::VGPR2);
+    CCInfo.AllocateReg(Reg);
+    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+  }
+}
+
+// Try to allocate a VGPR at the end of the argument list, or if no argument
+// VGPRs are left, allocating a stack slot.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+  ArrayRef<MCPhysReg> ArgVGPRs
+    = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
+  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
+  if (RegIdx == ArgVGPRs.size()) {
+    // Spill to stack required.
+    int64_t Offset = CCInfo.AllocateStack(4, 4);
+
+    return ArgDescriptor::createStack(Offset);
+  }
+
+  unsigned Reg = ArgVGPRs[RegIdx];
+  Reg = CCInfo.AllocateReg(Reg);
+  assert(Reg != AMDGPU::NoRegister);
+
+  MachineFunction &MF = CCInfo.getMachineFunction();
+  MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+  return ArgDescriptor::createRegister(Reg);
+}
+
+static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
+                                             const TargetRegisterClass *RC,
+                                             unsigned NumArgRegs) {
+  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
+  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
+  if (RegIdx == ArgSGPRs.size())
+    report_fatal_error("ran out of SGPRs for arguments");
+
+  unsigned Reg = ArgSGPRs[RegIdx];
+  Reg = CCInfo.AllocateReg(Reg);
+  assert(Reg != AMDGPU::NoRegister);
+
+  MachineFunction &MF = CCInfo.getMachineFunction();
+  MF.addLiveIn(Reg, RC);
+  return ArgDescriptor::createRegister(Reg);
+}
+
+static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
+  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
+}
+
+static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
+  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
+}
+
 static void allocateSpecialInputVGPRs(CCState &CCInfo,
                                       MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) {
-  if (Info.hasWorkItemIDX()) {
-    unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-    CCInfo.AllocateReg(Reg);
-  }
+  if (Info.hasWorkItemIDX())
+    Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));

-  if (Info.hasWorkItemIDY()) {
-    unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-    CCInfo.AllocateReg(Reg);
-  }
+  if (Info.hasWorkItemIDY())
+    Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));

-  if (Info.hasWorkItemIDZ()) {
-    unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-    CCInfo.AllocateReg(Reg);
-  }
+  if (Info.hasWorkItemIDZ())
+    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
 }

+static void allocateSpecialInputSGPRs(CCState &CCInfo,
+                                      MachineFunction &MF,
+                                      const SIRegisterInfo &TRI,
+                                      SIMachineFunctionInfo &Info) {
+  auto &ArgInfo = Info.getArgInfo();
+
+  // TODO: Unify handling with private memory pointers.
+
+  if (Info.hasDispatchPtr())
+    ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
+
+  if (Info.hasQueuePtr())
+    ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
+
+  if (Info.hasKernargSegmentPtr())
+    ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
+
+  if (Info.hasDispatchID())
+    ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
+
+  // flat_scratch_init is not applicable for non-kernel functions.
+
+  if (Info.hasWorkGroupIDX())
+    ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
+
+  if (Info.hasWorkGroupIDY())
+    ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
+
+  if (Info.hasWorkGroupIDZ())
+    ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
+}
+
 // Allocate special inputs passed in user SGPRs.
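Taken together, these helpers define the convention for special inputs in non-entry functions: VGPR inputs take the next free register out of the first 32 argument VGPRs and fall back to a 4-byte, 4-aligned stack slot once those are exhausted, while SGPR inputs have no stack fallback and abort compilation instead. A sketch of what a consumer sees in each case (assuming the helpers above; CCInfo is the in-flight calling-convention state):

    ArgDescriptor WorkItemIDX = allocateVGPR32Input(CCInfo);
    if (WorkItemIDX.isRegister()) {
      // Landed in one of the 32 argument VGPRs; it becomes a live-in.
      unsigned Reg = WorkItemIDX.getRegister();
      (void)Reg;
    } else {
      // All argument VGPRs were taken by user arguments: the caller will
      // store this input at a fixed offset from the outgoing stack pointer.
      int64_t Offset = WorkItemIDX.getStackOffset();
      (void)Offset;
    }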
@@ -1212,8 +1331,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
       // resource. For the Code Object V2 ABI, this will be the first 4 user
       // SGPR inputs. We can reserve those and use them directly.

-      unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue(
-        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+      unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
+        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
       Info.setScratchRSrcReg(PrivateSegmentBufferReg);

       if (MFI.hasCalls()) {
@@ -1229,8 +1348,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
           = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
         Info.setScratchWaveOffsetReg(ReservedOffsetReg);
       } else {
-        unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
-          MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+        unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
+          AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
         Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
       }
     } else {
@@ -1256,8 +1375,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
     Info.setScratchRSrcReg(ReservedBufferReg);

     if (HasStackObjects && !MFI.hasCalls()) {
-      unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
-        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+      unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
+        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
       Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
     } else {
       unsigned ReservedOffsetReg
@@ -1390,7 +1509,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   }

   if (IsEntryFunc) {
-    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
   }

@@ -1509,6 +1628,11 @@ SDValue SITargetLowering::LowerFormalArguments(
     InVals.push_back(Val);
   }

+  if (!IsEntryFunc) {
+    // Special inputs come after user arguments.
+    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+  }
+
   // Start adding system SGPRs.
   if (IsEntryFunc) {
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
@@ -1516,8 +1640,13 @@ SDValue SITargetLowering::LowerFormalArguments(
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
     CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
     CCInfo.AllocateReg(Info->getFrameOffsetReg());
+    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }

+  auto &ArgUsageInfo =
+    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+  ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());
+
   return Chains.empty() ? Chain :
     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
@@ -1741,6 +1870,81 @@ SDValue SITargetLowering::LowerCallResult(
   return Chain;
 }

+// Add code to pass special inputs required depending on used features separate
+// from the explicit user arguments present in the IR.
+void SITargetLowering::passSpecialInputs(
+    CallLoweringInfo &CLI,
+    const SIMachineFunctionInfo &Info,
+    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+    SmallVectorImpl<SDValue> &MemOpChains,
+    SDValue Chain,
+    SDValue StackPtr) const {
+  // If we don't have a call site, this was a call inserted by
+  // legalization. These can never use special inputs.
+  if (!CLI.CS)
+    return;
+
+  const Function *CalleeFunc = CLI.CS.getCalledFunction();
+  if (!CalleeFunc)
+    report_fatal_error("indirect calls not handled");
+
+  SelectionDAG &DAG = CLI.DAG;
+  const SDLoc &DL = CLI.DL;
+
+  const SISubtarget *ST = getSubtarget();
+  const SIRegisterInfo *TRI = ST->getRegisterInfo();
+
+  auto &ArgUsageInfo =
+    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+  const AMDGPUFunctionArgInfo &CalleeArgInfo
+    = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
+
+  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
+
+  // TODO: Unify with private memory register handling. This is complicated by
+  // the fact that at least in kernels, the input argument is not necessarily
+  // in the same location as the input.
+  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
+    AMDGPUFunctionArgInfo::DISPATCH_PTR,
+    AMDGPUFunctionArgInfo::QUEUE_PTR,
+    AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
+    AMDGPUFunctionArgInfo::DISPATCH_ID,
+    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
+    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
+    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
+    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
+    AMDGPUFunctionArgInfo::WORKITEM_ID_Z
+  };
+
+  for (auto InputID : InputRegs) {
+    const ArgDescriptor *OutgoingArg;
+    const TargetRegisterClass *ArgRC;
+
+    std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
+    if (!OutgoingArg)
+      continue;
+
+    const ArgDescriptor *IncomingArg;
+    const TargetRegisterClass *IncomingArgRC;
+    std::tie(IncomingArg, IncomingArgRC)
+      = CallerArgInfo.getPreloadedValue(InputID);
+    assert(IncomingArgRC == ArgRC);
+
+    // All special arguments are ints for now.
+    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
+    SDValue InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
+    if (OutgoingArg->isRegister()) {
+      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+    } else {
+      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
+                                              InputReg,
+                                              OutgoingArg->getStackOffset());
+      MemOpChains.push_back(ArgStore);
+    }
+  }
+}
+
 // The wave scratch offset register is used as the global base pointer.
 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
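A note on how the two halves of the patch meet: LowerFormalArguments (earlier hunk) publishes each function's final special-input placement through the AMDGPUArgumentUsageInfo analysis, and passSpecialInputs reads that record back for the callee, so a caller never re-derives the layout. Condensed from this diff:

    // Callee side, in LowerFormalArguments: record where the inputs landed.
    auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
    ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());

    // Caller side, in passSpecialInputs: look the callee's layout back up.
    const AMDGPUFunctionArgInfo &CalleeArgInfo =
        ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);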
@@ -1897,6 +2101,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
   }

+  // Copy special input registers after user input arguments.
+  passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
+
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

@@ -3424,7 +3631,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

   EVT VT = Op.getValueType();
   SDLoc DL(Op);
@@ -3436,10 +3642,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_implicit_buffer_ptr: {
     if (getSubtarget()->isAmdCodeObjectV2(MF))
       return emitNonHSAIntrinsicError(DAG, DL, VT);
-
-    unsigned Reg = TRI->getPreloadedValue(MF,
-                                          SIRegisterInfo::IMPLICIT_BUFFER_PTR);
-    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+    return getPreloadedValue(DAG, *MFI, VT,
+                             AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
   }
   case Intrinsic::amdgcn_dispatch_ptr:
   case Intrinsic::amdgcn_queue_ptr: {
@@ -3451,10 +3655,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return DAG.getUNDEF(VT);
     }

-    auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
-      SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
-    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
-                                TRI->getPreloadedValue(MF, Reg), VT);
+    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+      AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
+    return getPreloadedValue(DAG, *MFI, VT, RegID);
   }
   case Intrinsic::amdgcn_implicitarg_ptr: {
     if (MFI->isEntryFunction())
@@ -3462,13 +3665,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     report_fatal_error("amdgcn.implicitarg.ptr not implemented for functions");
   }
   case Intrinsic::amdgcn_kernarg_segment_ptr: {
-    unsigned Reg
-      = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
-    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+    return getPreloadedValue(DAG, *MFI, VT,
+                             AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
   }
   case Intrinsic::amdgcn_dispatch_id: {
-    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID);
-    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+    return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
   }
   case Intrinsic::amdgcn_rcp:
     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
@@ -3553,28 +3754,32 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                     SI::KernelInputOffsets::LOCAL_SIZE_Z);
   case Intrinsic::amdgcn_workgroup_id_x:
   case Intrinsic::r600_read_tgid_x:
-    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+    return getPreloadedValue(DAG, *MFI, VT,
+                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
   case Intrinsic::amdgcn_workgroup_id_y:
   case Intrinsic::r600_read_tgid_y:
-    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+    return getPreloadedValue(DAG, *MFI, VT,
+                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
   case Intrinsic::amdgcn_workgroup_id_z:
   case Intrinsic::r600_read_tgid_z:
-    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
-  case Intrinsic::amdgcn_workitem_id_x:
+    return getPreloadedValue(DAG, *MFI, VT,
+                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_workitem_id_x: {
   case Intrinsic::r600_read_tidig_x:
-    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+                          SDLoc(DAG.getEntryNode()),
+                          MFI->getArgInfo().WorkItemIDX);
+  }
   case Intrinsic::amdgcn_workitem_id_y:
   case Intrinsic::r600_read_tidig_y:
-    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+                          SDLoc(DAG.getEntryNode()),
+                          MFI->getArgInfo().WorkItemIDY);
   case Intrinsic::amdgcn_workitem_id_z:
   case Intrinsic::r600_read_tidig_z:
-    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
+    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
+                          SDLoc(DAG.getEntryNode()),
+                          MFI->getArgInfo().WorkItemIDZ);
   case AMDGPUIntrinsic::SI_load_const: {
     SDValue Ops[] = {
       Op.getOperand(1),
lib/Target/AMDGPU/SIISelLowering.h
@@ -16,6 +16,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H

 #include "AMDGPUISelLowering.h"
+#include "AMDGPUArgumentUsageInfo.h"
 #include "SIInstrInfo.h"

 namespace llvm {
@@ -32,6 +33,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                               const SDLoc &SL, SDValue Chain,
                               const ISD::InputArg &Arg) const;
+  SDValue getPreloadedValue(SelectionDAG &DAG,
+                            const SIMachineFunctionInfo &MFI,
+                            EVT VT,
+                            AMDGPUFunctionArgInfo::PreloadedValue) const;

   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                              SelectionDAG &DAG) const override;
@@ -205,6 +210,14 @@ public:
               const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
               SelectionDAG &DAG) const override;

+  void passSpecialInputs(
+    CallLoweringInfo &CLI,
+    const SIMachineFunctionInfo &Info,
+    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+    SmallVectorImpl<SDValue> &MemOpChains,
+    SDValue Chain,
+    SDValue StackPtr) const;
+
   SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                           CallingConv::ID CallConv, bool isVarArg,
                           const SmallVectorImpl<ISD::InputArg> &Ins,
lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -916,7 +916,6 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
   unsigned WavefrontSize = ST.getWavefrontSize();
@@ -936,13 +935,13 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
       WorkGroupSize > WavefrontSize) {

     unsigned TIDIGXReg
-      = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
+      = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
     unsigned TIDIGYReg
-      = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
+      = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
     unsigned TIDIGZReg
-      = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
+      = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
     unsigned InputPtrReg =
-      TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
     for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
       if (!Entry.isLiveIn(Reg))
         Entry.addLiveIn(Reg);
lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -27,24 +27,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG),
     FrameOffsetReg(AMDGPU::FP_REG),
     StackPtrOffsetReg(AMDGPU::SP_REG),
-    PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
-    DispatchPtrUserSGPR(AMDGPU::NoRegister),
-    QueuePtrUserSGPR(AMDGPU::NoRegister),
-    KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
-    DispatchIDUserSGPR(AMDGPU::NoRegister),
-    FlatScratchInitUserSGPR(AMDGPU::NoRegister),
-    PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
-    GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
-    GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
-    GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
-    WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
-    WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
-    WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
-    WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
-    PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
-    WorkItemIDXVGPR(AMDGPU::NoRegister),
-    WorkItemIDYVGPR(AMDGPU::NoRegister),
-    WorkItemIDZVGPR(AMDGPU::NoRegister),
+    ArgInfo(),
     PSInputAddr(0),
     PSInputEnable(0),
     ReturnsVoid(true),
@@ -91,8 +74,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     FrameOffsetReg = AMDGPU::SGPR5;
     StackPtrOffsetReg = AMDGPU::SGPR32;

     // FIXME: Not really a system SGPR.
-    PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg;
+    ArgInfo.PrivateSegmentBuffer =
+      ArgDescriptor::createRegister(ScratchRSrcReg);
+    ArgInfo.PrivateSegmentWaveByteOffset =
+      ArgDescriptor::createRegister(ScratchWaveOffsetReg);

     if (F->hasFnAttribute("amdgpu-implicitarg-ptr"))
       ImplicitArgPtr = true;
   } else {
@@ -151,10 +137,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     if (HasStackObjects || MaySpill) {
       PrivateSegmentWaveByteOffset = true;

-      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
-      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
-          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
-        PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5;
+      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+        ArgInfo.PrivateSegmentWaveByteOffset
+          = ArgDescriptor::createRegister(AMDGPU::SGPR5);
     }
   }
@@ -189,52 +176,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)

 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
   const SIRegisterInfo &TRI) {
-  PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
-    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+  ArgInfo.PrivateSegmentBuffer =
+    ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
   NumUserSGPRs += 4;
-  return PrivateSegmentBufferUserSGPR;
+  return ArgInfo.PrivateSegmentBuffer.getRegister();
 }

 unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
-  DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
-    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
   NumUserSGPRs += 2;
-  return DispatchPtrUserSGPR;
+  return ArgInfo.DispatchPtr.getRegister();
 }

 unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
-  QueuePtrUserSGPR = TRI.getMatchingSuperReg(
-    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
   NumUserSGPRs += 2;
-  return QueuePtrUserSGPR;
+  return ArgInfo.QueuePtr.getRegister();
 }

 unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
-  KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
-    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  ArgInfo.KernargSegmentPtr
+    = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+      getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
   NumUserSGPRs += 2;
-  return KernargSegmentPtrUserSGPR;
+  return ArgInfo.KernargSegmentPtr.getRegister();
 }

 unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
-  DispatchIDUserSGPR = TRI.getMatchingSuperReg(
-    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
   NumUserSGPRs += 2;
-  return DispatchIDUserSGPR;
+  return ArgInfo.DispatchID.getRegister();
 }

 unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
-  FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
-    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
   NumUserSGPRs += 2;
-  return FlatScratchInitUserSGPR;
+  return ArgInfo.FlatScratchInit.getRegister();
 }

 unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
-  ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg(
-    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
   NumUserSGPRs += 2;
-  return ImplicitBufferPtrUserSGPR;
+  return ArgInfo.ImplicitBufferPtr.getRegister();
 }

 static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -16,6 +16,7 @@

 #include "AMDGPUMachineFunction.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "AMDGPUArgumentUsageInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -96,33 +97,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
   unsigned StackPtrOffsetReg;

-  // Input registers for non-HSA ABI
-  unsigned ImplicitBufferPtrUserSGPR;
-
-  // Input registers setup for the HSA ABI.
-  // User SGPRs in allocation order.
-  unsigned PrivateSegmentBufferUserSGPR;
-  unsigned DispatchPtrUserSGPR;
-  unsigned QueuePtrUserSGPR;
-  unsigned KernargSegmentPtrUserSGPR;
-  unsigned DispatchIDUserSGPR;
-  unsigned FlatScratchInitUserSGPR;
-  unsigned PrivateSegmentSizeUserSGPR;
-  unsigned GridWorkGroupCountXUserSGPR;
-  unsigned GridWorkGroupCountYUserSGPR;
-  unsigned GridWorkGroupCountZUserSGPR;
-
-  // System SGPRs in allocation order.
-  unsigned WorkGroupIDXSystemSGPR;
-  unsigned WorkGroupIDYSystemSGPR;
-  unsigned WorkGroupIDZSystemSGPR;
-  unsigned WorkGroupInfoSystemSGPR;
-  unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
-
-  // VGPR inputs. These are always v0, v1 and v2 for entry functions.
-  unsigned WorkItemIDXVGPR;
-  unsigned WorkItemIDYVGPR;
-  unsigned WorkItemIDZVGPR;
+  AMDGPUFunctionArgInfo ArgInfo;

   // Graphics info.
   unsigned PSInputAddr;
@@ -235,7 +210,6 @@ private:
   SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;

 public:
-
   SIMachineFunctionInfo(const MachineFunction &MF);

   ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
@@ -266,37 +240,52 @@ public:

   // Add system SGPRs.
   unsigned addWorkGroupIDX() {
-    WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+    ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
     NumSystemSGPRs += 1;
-    return WorkGroupIDXSystemSGPR;
+    return ArgInfo.WorkGroupIDX.getRegister();
   }

   unsigned addWorkGroupIDY() {
-    WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+    ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
     NumSystemSGPRs += 1;
-    return WorkGroupIDYSystemSGPR;
+    return ArgInfo.WorkGroupIDY.getRegister();
   }

   unsigned addWorkGroupIDZ() {
-    WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+    ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
     NumSystemSGPRs += 1;
-    return WorkGroupIDZSystemSGPR;
+    return ArgInfo.WorkGroupIDZ.getRegister();
   }

   unsigned addWorkGroupInfo() {
-    WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+    ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR());
     NumSystemSGPRs += 1;
-    return WorkGroupInfoSystemSGPR;
+    return ArgInfo.WorkGroupInfo.getRegister();
   }

+  // Add special VGPR inputs
+  void setWorkItemIDX(ArgDescriptor Arg) {
+    ArgInfo.WorkItemIDX = Arg;
+  }
+
+  void setWorkItemIDY(ArgDescriptor Arg) {
+    ArgInfo.WorkItemIDY = Arg;
+  }
+
+  void setWorkItemIDZ(ArgDescriptor Arg) {
+    ArgInfo.WorkItemIDZ = Arg;
+  }
+
   unsigned addPrivateSegmentWaveByteOffset() {
-    PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+    ArgInfo.PrivateSegmentWaveByteOffset
+      = ArgDescriptor::createRegister(getNextSystemSGPR());
     NumSystemSGPRs += 1;
-    return PrivateSegmentWaveByteOffsetSystemSGPR;
+    return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
   }

   void setPrivateSegmentWaveByteOffset(unsigned Reg) {
-    PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
+    ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg);
   }

   bool hasPrivateSegmentBuffer() const {
@@ -375,6 +364,23 @@ public:
     return ImplicitBufferPtr;
   }

+  AMDGPUFunctionArgInfo &getArgInfo() {
+    return ArgInfo;
+  }
+
+  const AMDGPUFunctionArgInfo &getArgInfo() const {
+    return ArgInfo;
+  }
+
+  std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+  getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+    return ArgInfo.getPreloadedValue(Value);
+  }
+
+  unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+    return ArgInfo.getPreloadedValue(Value).first->getRegister();
+  }
+
   unsigned getNumUserSGPRs() const {
     return NumUserSGPRs;
   }
@@ -384,7 +390,7 @@ public:
   }

   unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
-    return PrivateSegmentWaveByteOffsetSystemSGPR;
+    return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
   }

   /// \brief Returns the physical register reserved for use as the resource
@@ -426,11 +432,11 @@ public:
   }

   unsigned getQueuePtrUserSGPR() const {
-    return QueuePtrUserSGPR;
+    return ArgInfo.QueuePtr.getRegister();
   }

   unsigned getImplicitBufferPtrUserSGPR() const {
-    return ImplicitBufferPtrUserSGPR;
+    return ArgInfo.ImplicitBufferPtr.getRegister();
   }

   bool hasSpilledSGPRs() const {
@@ -562,13 +568,13 @@ public:
     switch (Dim) {
     case 0:
       assert(hasWorkGroupIDX());
-      return WorkGroupIDXSystemSGPR;
+      return ArgInfo.WorkGroupIDX.getRegister();
     case 1:
       assert(hasWorkGroupIDY());
-      return WorkGroupIDYSystemSGPR;
+      return ArgInfo.WorkGroupIDY.getRegister();
     case 2:
       assert(hasWorkGroupIDZ());
-      return WorkGroupIDZSystemSGPR;
+      return ArgInfo.WorkGroupIDZ.getRegister();
     }
     llvm_unreachable("unexpected dimension");
   }
lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1338,61 +1338,6 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
   return getCommonSubClass(DefRC, SrcRC) != nullptr;
 }

-// FIXME: Most of these are flexible with HSA and we don't need to reserve them
-// as input registers if unused. Whether the dispatch ptr is necessary should be
-// easy to detect from used intrinsics. Scratch setup is harder to know.
-unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
-                                           enum PreloadedValue Value) const {
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  (void)ST;
-  switch (Value) {
-  case SIRegisterInfo::WORKGROUP_ID_X:
-    assert(MFI->hasWorkGroupIDX());
-    return MFI->WorkGroupIDXSystemSGPR;
-  case SIRegisterInfo::WORKGROUP_ID_Y:
-    assert(MFI->hasWorkGroupIDY());
-    return MFI->WorkGroupIDYSystemSGPR;
-  case SIRegisterInfo::WORKGROUP_ID_Z:
-    assert(MFI->hasWorkGroupIDZ());
-    return MFI->WorkGroupIDZSystemSGPR;
-  case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
-    return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
-  case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
-    assert(MFI->hasPrivateSegmentBuffer());
-    return MFI->PrivateSegmentBufferUserSGPR;
-  case SIRegisterInfo::IMPLICIT_BUFFER_PTR:
-    assert(MFI->hasImplicitBufferPtr());
-    return MFI->ImplicitBufferPtrUserSGPR;
-  case SIRegisterInfo::KERNARG_SEGMENT_PTR:
-    assert(MFI->hasKernargSegmentPtr());
-    return MFI->KernargSegmentPtrUserSGPR;
-  case SIRegisterInfo::DISPATCH_ID:
-    assert(MFI->hasDispatchID());
-    return MFI->DispatchIDUserSGPR;
-  case SIRegisterInfo::FLAT_SCRATCH_INIT:
-    assert(MFI->hasFlatScratchInit());
-    return MFI->FlatScratchInitUserSGPR;
-  case SIRegisterInfo::DISPATCH_PTR:
-    assert(MFI->hasDispatchPtr());
-    return MFI->DispatchPtrUserSGPR;
-  case SIRegisterInfo::QUEUE_PTR:
-    assert(MFI->hasQueuePtr());
-    return MFI->QueuePtrUserSGPR;
-  case SIRegisterInfo::WORKITEM_ID_X:
-    assert(MFI->hasWorkItemIDX());
-    return AMDGPU::VGPR0;
-  case SIRegisterInfo::WORKITEM_ID_Y:
-    assert(MFI->hasWorkItemIDY());
-    return AMDGPU::VGPR1;
-  case SIRegisterInfo::WORKITEM_ID_Z:
-    assert(MFI->hasWorkItemIDZ());
-    return AMDGPU::VGPR2;
-  }
-  llvm_unreachable("unexpected preloaded value type");
-}
-
 /// \brief Returns a register that is not used at any point in the function.
 /// If all registers are used, then this function will return
 // AMDGPU::NoRegister.
lib/Target/AMDGPU/SIRegisterInfo.h
@@ -186,31 +186,6 @@ public:
            OpType <= AMDGPU::OPERAND_SRC_LAST;
   }

-  enum PreloadedValue {
-    // SGPRS:
-    PRIVATE_SEGMENT_BUFFER = 0,
-    DISPATCH_PTR = 1,
-    QUEUE_PTR = 2,
-    KERNARG_SEGMENT_PTR = 3,
-    DISPATCH_ID = 4,
-    FLAT_SCRATCH_INIT = 5,
-    WORKGROUP_ID_X = 10,
-    WORKGROUP_ID_Y = 11,
-    WORKGROUP_ID_Z = 12,
-    PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
-    IMPLICIT_BUFFER_PTR = 15,
-
-    // VGPRS:
-    FIRST_VGPR_VALUE = 16,
-    WORKITEM_ID_X = FIRST_VGPR_VALUE,
-    WORKITEM_ID_Y = 17,
-    WORKITEM_ID_Z = 18
-  };
-
-  /// \brief Returns the physical register that \p Value is stored in.
-  unsigned getPreloadedValue(const MachineFunction &MF,
-                             enum PreloadedValue Value) const;
-
   unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass *RC,
                               const MachineFunction &MF) const;
@ -0,0 +1,612 @@
|
|||
; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
|
||||
; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
|
||||
|
||||
; GCN-LABEL: {{^}}use_dispatch_ptr:
|
||||
; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
|
||||
define void @use_dispatch_ptr() #1 {
|
||||
%dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
|
||||
%header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
|
||||
%value = load volatile i32, i32 addrspace(2)* %header_ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_dispatch_ptr:
|
||||
; GCN: enable_sgpr_dispatch_ptr = 1
|
||||
; GCN: s_mov_b64 s[6:7], s[4:5]
|
||||
define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
|
||||
call void @use_dispatch_ptr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_queue_ptr:
|
||||
; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
|
||||
define void @use_queue_ptr() #1 {
|
||||
%queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
|
||||
%header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
|
||||
%value = load volatile i32, i32 addrspace(2)* %header_ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr:
|
||||
; GCN: enable_sgpr_queue_ptr = 1
|
||||
; GCN: s_mov_b64 s[6:7], s[4:5]
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
|
||||
call void @use_queue_ptr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast:
|
||||
; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10
|
||||
; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]]
|
||||
|
||||
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
|
||||
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
|
||||
define void @use_queue_ptr_addrspacecast() #1 {
|
||||
%asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32 addrspace(4)*
|
||||
store volatile i32 0, i32 addrspace(4)* %asc
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast:
|
||||
; CIVI: enable_sgpr_queue_ptr = 1
|
||||
|
||||
; CIVI: s_mov_b64 s[6:7], s[4:5]
|
||||
; GFX9-NOT: s_mov_b64
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 {
|
||||
call void @use_queue_ptr_addrspacecast()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
|
||||
; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
|
||||
define void @use_kernarg_segment_ptr() #1 {
|
||||
%kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
|
||||
%header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
|
||||
%value = load volatile i32, i32 addrspace(2)* %header_ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr:
|
||||
; GCN: enable_sgpr_kernarg_segment_ptr = 1
|
||||
; GCN: s_mov_b64 s[6:7], s[4:5]
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 {
|
||||
call void @use_kernarg_segment_ptr()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_dispatch_id:
|
||||
; GCN: ; use s[6:7]
|
||||
define void @use_dispatch_id() #1 {
|
||||
%id = call i64 @llvm.amdgcn.dispatch.id()
|
||||
call void asm sideeffect "; use $0", "s"(i64 %id)
|
||||
ret void
|
||||
}
|
||||
|
||||
; No kernarg segment so that there is a mov to check. With kernarg
|
||||
; pointer enabled, it happens to end up in the right place anyway.
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_dispatch_id:
|
||||
; GCN: enable_sgpr_dispatch_id = 1
|
||||
|
||||
; GCN: s_mov_b64 s[6:7], s[4:5]
|
||||
define amdgpu_kernel void @kern_indirect_use_dispatch_id() #1 {
|
||||
call void @use_dispatch_id()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_workgroup_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: ; use s6
|
||||
define void @use_workgroup_id_x() #1 {
|
||||
%val = call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_stack_workgroup_id_x:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: s_mov_b32 s5, s32
|
||||
; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4
|
||||
; GCN: ; use s6
|
||||
; GCN: s_setpc_b64
|
||||
define void @use_stack_workgroup_id_x() #1 {
|
||||
%alloca = alloca i32
|
||||
store volatile i32 0, i32* %alloca
|
||||
%val = call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_workgroup_id_y:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: ; use s6
|
||||
define void @use_workgroup_id_y() #1 {
|
||||
%val = call i32 @llvm.amdgcn.workgroup.id.y()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_workgroup_id_z:
|
||||
; GCN: s_waitcnt
|
||||
; GCN: ; use s6
|
||||
define void @use_workgroup_id_z() #1 {
|
||||
%val = call i32 @llvm.amdgcn.workgroup.id.z()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_workgroup_id_xy:
|
||||
; GCN: ; use s6
|
||||
; GCN: ; use s7
|
||||
define void @use_workgroup_id_xy() #1 {
|
||||
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val0)
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val1)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_workgroup_id_xyz:
|
||||
; GCN: ; use s6
|
||||
; GCN: ; use s7
|
||||
; GCN: ; use s8
|
||||
define void @use_workgroup_id_xyz() #1 {
|
||||
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
|
||||
%val2 = call i32 @llvm.amdgcn.workgroup.id.z()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val0)
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val1)
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val2)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_workgroup_id_xz:
|
||||
; GCN: ; use s6
|
||||
; GCN: ; use s7
|
||||
define void @use_workgroup_id_xz() #1 {
|
||||
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
|
||||
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val0)
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val1)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use_workgroup_id_yz:
|
||||
; GCN: ; use s6
|
||||
; GCN: ; use s7
|
||||
define void @use_workgroup_id_yz() #1 {
|
||||
%val0 = call i32 @llvm.amdgcn.workgroup.id.y()
|
||||
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val0)
|
||||
call void asm sideeffect "; use $0", "s"(i32 %val1)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
|
||||
; GCN: enable_sgpr_workgroup_id_x = 1
|
||||
; GCN: enable_sgpr_workgroup_id_y = 0
|
||||
; GCN: enable_sgpr_workgroup_id_z = 0
|
||||
|
||||
; GCN-NOT: s6
|
||||
; GCN: s_mov_b32 s33, s7
|
||||
; GCN-NOT: s6
|
||||
; GCN: s_mov_b32 s4, s33
|
||||
; GCN-NOT: s6
|
||||
; GCN: s_mov_b32 s32, s33
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
|
||||
call void @use_workgroup_id_x()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
|
||||
; GCN: enable_sgpr_workgroup_id_x = 1
|
||||
; GCN: enable_sgpr_workgroup_id_y = 1
|
||||
; GCN: enable_sgpr_workgroup_id_z = 0
|
||||
|
||||
; GCN: s_mov_b32 s33, s8
|
||||
; GCN: s_mov_b32 s4, s33
|
||||
; GCN: s_mov_b32 s6, s7
|
||||
; GCN: s_mov_b32 s32, s33
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
|
||||
call void @use_workgroup_id_y()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
|
||||
; GCN: enable_sgpr_workgroup_id_x = 1
|
||||
; GCN: enable_sgpr_workgroup_id_y = 0
|
||||
; GCN: enable_sgpr_workgroup_id_z = 1
|
||||
|
||||
; GCN: s_mov_b32 s33, s8
|
||||
; GCN: s_mov_b32 s4, s33
|
||||
; GCN: s_mov_b32 s6, s7
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
|
||||
call void @use_workgroup_id_z()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
|
||||
; GCN: enable_sgpr_workgroup_id_x = 1
|
||||
; GCN: enable_sgpr_workgroup_id_y = 1
|
||||
; GCN: enable_sgpr_workgroup_id_z = 0
|
||||
|
||||
; GCN: s_mov_b32 s33, s8
|
||||
; GCN-NOT: s6
|
||||
; GCN-NOT: s7
|
||||
; GCN: s_mov_b32 s4, s33
|
||||
; GCN-NOT: s6
|
||||
; GCN-NOT: s7
|
||||
; GCN: s_mov_b32 s32, s33
|
||||
; GCN-NOT: s6
|
||||
; GCN-NOT: s7
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
|
||||
call void @use_workgroup_id_xy()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
|
||||
; GCN: enable_sgpr_workgroup_id_x = 1
|
||||
; GCN: enable_sgpr_workgroup_id_y = 1
|
||||
; GCN: enable_sgpr_workgroup_id_z = 1
|
||||
|
||||
; GCN: s_mov_b32 s33, s9
|
||||
|
||||
; GCN-NOT: s6
|
||||
; GCN-NOT: s7
|
||||
; GCN-NOT: s8
|
||||
|
||||
; GCN: s_mov_b32 s4, s33
|
||||
|
||||
; GCN-NOT: s6
|
||||
; GCN-NOT: s7
|
||||
; GCN-NOT: s8
|
||||
|
||||
; GCN: s_mov_b32 s32, s33
|
||||
|
||||
; GCN-NOT: s6
|
||||
; GCN-NOT: s7
|
||||
; GCN-NOT: s8
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
|
||||
call void @use_workgroup_id_xyz()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
|
||||
; GCN: enable_sgpr_workgroup_id_x = 1
|
||||
; GCN: enable_sgpr_workgroup_id_y = 0
|
||||
; GCN: enable_sgpr_workgroup_id_z = 1
|
||||
|
||||
; GCN: s_mov_b32 s33, s8
|
||||
; GCN-NOT: s6
|
||||
; GCN-NOT: s7
|
||||
|
||||
; GCN: s_mov_b32 s4, s33
|
||||
; GCN-NOT: s6
|
||||
; GCN-NOT: s7
|
||||
|
||||
; GCN: s_mov_b32 s32, s33
|
||||
; GCN-NOT: s6
|
||||
; GCN-NOT: s7
|
||||
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
|
||||
call void @use_workgroup_id_xz()
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
|
||||
; GCN: enable_sgpr_workgroup_id_x = 1
|
||||
; GCN: enable_sgpr_workgroup_id_y = 1
|
||||
; GCN: enable_sgpr_workgroup_id_z = 1
|
||||
|
||||
; GCN: s_mov_b32 s33, s9
|
||||
; GCN: s_mov_b32 s6, s7
|
||||
; GCN: s_mov_b32 s4, s33
|
||||
; GCN: s_mov_b32 s7, s8
|
||||
; GCN: s_mov_b32 s32, s33
|
||||
; GCN: s_swappc_b64
|
||||
define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 {
|
||||
call void @use_workgroup_id_yz()
|
||||
ret void
|
||||
}

; Argument is already in the right place.
; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x:
; GCN-NOT: s6
define void @func_indirect_use_workgroup_id_x() #1 {
  call void @use_workgroup_id_x()
  ret void
}

; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y:
; GCN-NOT: s6
define void @func_indirect_use_workgroup_id_y() #1 {
  call void @use_workgroup_id_y()
  ret void
}

; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z:
; GCN-NOT: s6
define void @func_indirect_use_workgroup_id_z() #1 {
  call void @use_workgroup_id_z()
  ret void
}

; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x:
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN: ; use s6
define void @other_arg_use_workgroup_id_x(i32 %arg0) #1 {
  %val = call i32 @llvm.amdgcn.workgroup.id.x()
  store volatile i32 %arg0, i32 addrspace(1)* undef
  call void asm sideeffect "; use $0", "s"(i32 %val)
  ret void
}

; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y:
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN: ; use s6
define void @other_arg_use_workgroup_id_y(i32 %arg0) #1 {
  %val = call i32 @llvm.amdgcn.workgroup.id.y()
  store volatile i32 %arg0, i32 addrspace(1)* undef
  call void asm sideeffect "; use $0", "s"(i32 %val)
  ret void
}

; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z:
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN: ; use s6
define void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
  %val = call i32 @llvm.amdgcn.workgroup.id.z()
  store volatile i32 %arg0, i32 addrspace(1)* undef
  call void asm sideeffect "; use $0", "s"(i32 %val)
  ret void
}

; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_x:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 0
; GCN: enable_sgpr_workgroup_id_z = 0

; GCN-DAG: s_mov_b32 s33, s7
; GCN-DAG: v_mov_b32_e32 v0, 0x22b

; GCN-NOT: s6
; GCN: s_mov_b32 s4, s33
; GCN-NOT: s6
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
  call void @other_arg_use_workgroup_id_x(i32 555)
  ret void
}

; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 1
; GCN: enable_sgpr_workgroup_id_z = 0

; GCN-DAG: s_mov_b32 s33, s8
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
; GCN: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s6, s7
; GCN-DAG: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
  call void @other_arg_use_workgroup_id_y(i32 555)
  ret void
}

; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 0
; GCN: enable_sgpr_workgroup_id_z = 1

; GCN: s_mov_b32 s33, s8
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
; GCN: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s6, s7

; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 {
  call void @other_arg_use_workgroup_id_z(i32 555)
  ret void
}

; GCN-LABEL: {{^}}use_every_sgpr_input:
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[10:11], 0x0
; GCN: ; use s[12:13]
; GCN: ; use s14
; GCN: ; use s15
; GCN: ; use s16
define void @use_every_sgpr_input() #1 {
  %alloca = alloca i32, align 4
  store volatile i32 0, i32* %alloca

  %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
  %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
  %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc

  %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
  %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
  %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc

  %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
  %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
  %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc

  %val3 = call i64 @llvm.amdgcn.dispatch.id()
  call void asm sideeffect "; use $0", "s"(i64 %val3)

  %val4 = call i32 @llvm.amdgcn.workgroup.id.x()
  call void asm sideeffect "; use $0", "s"(i32 %val4)

  %val5 = call i32 @llvm.amdgcn.workgroup.id.y()
  call void asm sideeffect "; use $0", "s"(i32 %val5)

  %val6 = call i32 @llvm.amdgcn.workgroup.id.z()
  call void asm sideeffect "; use $0", "s"(i32 %val6)

  ret void
}

; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input:
; GCN: enable_sgpr_workgroup_id_x = 1
; GCN: enable_sgpr_workgroup_id_y = 1
; GCN: enable_sgpr_workgroup_id_z = 1
; GCN: enable_sgpr_workgroup_info = 0

; GCN: enable_sgpr_private_segment_buffer = 1
; GCN: enable_sgpr_dispatch_ptr = 1
; GCN: enable_sgpr_queue_ptr = 1
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; GCN: enable_sgpr_dispatch_id = 1
; GCN: enable_sgpr_flat_scratch_init = 1

; GCN: s_mov_b32 s33, s17
; GCN: s_mov_b64 s[12:13], s[10:11]
; GCN: s_mov_b64 s[10:11], s[8:9]
; GCN: s_mov_b64 s[8:9], s[6:7]
; GCN: s_mov_b64 s[6:7], s[4:5]
; GCN: s_mov_b32 s4, s33
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 {
  call void @use_every_sgpr_input()
  ret void
}
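
; One reading of the pair shuffle above (an interpretation of the CHECK
; lines, not normative): the kernel's preloaded pointer pairs sit two
; SGPRs below where the callee expects them, so they are copied in
; descending destination order to avoid overwriting a value before it is
; read (destinations match the callee-side loads in use_every_sgpr_input):
;   s_mov_b64 s[12:13], s[10:11] ; dispatch id
;   s_mov_b64 s[10:11], s[8:9]   ; kernarg segment ptr
;   s_mov_b64 s[8:9],   s[6:7]   ; queue ptr
;   s_mov_b64 s[6:7],   s[4:5]   ; dispatch ptr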

; GCN-LABEL: {{^}}func_indirect_use_every_sgpr_input:
; GCN-NOT: s6
; GCN-NOT: s7
; GCN-NOT: s8
; GCN-NOT: s9
; GCN-NOT: s10
; GCN-NOT: s11
; GCN-NOT: s12
; GCN-NOT: s13
; GCN-NOT: s[6:7]
; GCN-NOT: s[8:9]
; GCN-NOT: s[10:11]
; GCN-NOT: s[12:13]
define void @func_indirect_use_every_sgpr_input() #1 {
  call void @use_every_sgpr_input()
  ret void
}

; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz:
; GCN-DAG: s_mov_b32 s6, s14
; GCN-DAG: s_mov_b32 s7, s15
; GCN-DAG: s_mov_b32 s8, s16
; GCN: s_swappc_b64
define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
  %alloca = alloca i32, align 4
  store volatile i32 0, i32* %alloca

  %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
  %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
  %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc

  %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
  %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
  %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc

  %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
  %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
  %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc

  %val3 = call i64 @llvm.amdgcn.dispatch.id()
  call void asm sideeffect "; use $0", "s"(i64 %val3)

  %val4 = call i32 @llvm.amdgcn.workgroup.id.x()
  call void asm sideeffect "; use $0", "s"(i32 %val4)

  %val5 = call i32 @llvm.amdgcn.workgroup.id.y()
  call void asm sideeffect "; use $0", "s"(i32 %val5)

  %val6 = call i32 @llvm.amdgcn.workgroup.id.z()
  call void asm sideeffect "; use $0", "s"(i32 %val6)

  call void @use_workgroup_id_xyz()
  ret void
}
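
; Note (as implied by the CHECKs above): in this caller, which also
; receives every other SGPR input, the workgroup IDs arrive in s14-s16,
; while the callee here expects them in s6-s8, hence the three copies
; before the call:
;   s_mov_b32 s6, s14  ; workgroup ID X
;   s_mov_b32 s7, s15  ; workgroup ID Y
;   s_mov_b32 s8, s16  ; workgroup ID Z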

; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill:
; GCN: s_mov_b32 s5, s32
; GCN: s_add_u32 s32, s32, 0x300

; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-9]+]], s14
; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-9]+]], s15
; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-9]+]], s16
; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[6:7]
; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[8:9]
; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[10:11]

; GCN-DAG: s_mov_b32 s6, [[SAVE_X]]
; GCN-DAG: s_mov_b32 s7, [[SAVE_Y]]
; GCN-DAG: s_mov_b32 s8, [[SAVE_Z]]
; GCN: s_swappc_b64

; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
; GCN: s_load_dword s{{[0-9]+}},
; GCN: s_load_dword s{{[0-9]+}},
; GCN: s_load_dword s{{[0-9]+}},
; GCN: ; use
; GCN: ; use [[SAVE_X]]
; GCN: ; use [[SAVE_Y]]
; GCN: ; use [[SAVE_Z]]
define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 {
  %alloca = alloca i32, align 4
  call void @use_workgroup_id_xyz()

  store volatile i32 0, i32* %alloca

  %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
  %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
  %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc

  %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
  %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
  %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc

  %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
  %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
  %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc

  %val3 = call i64 @llvm.amdgcn.dispatch.id()
  call void asm sideeffect "; use $0", "s"(i64 %val3)

  %val4 = call i32 @llvm.amdgcn.workgroup.id.x()
  call void asm sideeffect "; use $0", "s"(i32 %val4)

  %val5 = call i32 @llvm.amdgcn.workgroup.id.y()
  call void asm sideeffect "; use $0", "s"(i32 %val5)

  %val6 = call i32 @llvm.amdgcn.workgroup.id.z()
  call void asm sideeffect "; use $0", "s"(i32 %val6)

  ret void
}
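
; Note on the spill variant above (a reading of the CHECK lines): the IDs
; are still live after the call, so they are first parked in registers
; that survive it ([[SAVE_X]]/[[SAVE_Y]]/[[SAVE_Z]]), forwarded into
; s6-s8 for the callee, and then used again after s_swappc_b64; the
; 64-bit inputs are parked in SGPR pairs the same way.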

declare i32 @llvm.amdgcn.workgroup.id.x() #0
declare i32 @llvm.amdgcn.workgroup.id.y() #0
declare i32 @llvm.amdgcn.workgroup.id.z() #0
declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
declare noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
declare i64 @llvm.amdgcn.dispatch.id() #0
declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind noinline }

@@ -0,0 +1,671 @@
; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
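
; This new test file is the workitem-ID analogue of the SGPR tests above:
; the IDs travel in VGPRs rather than SGPRs (v0, v1, v2 for X, Y, Z when
; available), and the kernel header field enable_vgpr_workitem_id selects
; how many of them the hardware sets up. This summary is a reading of the
; CHECK lines that follow, not a normative statement of the ABI.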

; GCN-LABEL: {{^}}use_workitem_id_x:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_x() #1 {
  %val = call i32 @llvm.amdgcn.workitem.id.x()
  store volatile i32 %val, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}use_workitem_id_y:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_y() #1 {
  %val = call i32 @llvm.amdgcn.workitem.id.y()
  store volatile i32 %val, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}use_workitem_id_z:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_z() #1 {
  %val = call i32 @llvm.amdgcn.workitem.id.z()
  store volatile i32 %val, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}use_workitem_id_xy:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_xy() #1 {
  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
  %val1 = call i32 @llvm.amdgcn.workitem.id.y()
  store volatile i32 %val0, i32 addrspace(1)* undef
  store volatile i32 %val1, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}use_workitem_id_xyz:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v2
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_xyz() #1 {
  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
  %val1 = call i32 @llvm.amdgcn.workitem.id.y()
  %val2 = call i32 @llvm.amdgcn.workitem.id.z()
  store volatile i32 %val0, i32 addrspace(1)* undef
  store volatile i32 %val1, i32 addrspace(1)* undef
  store volatile i32 %val2, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}use_workitem_id_xz:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_xz() #1 {
  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
  %val1 = call i32 @llvm.amdgcn.workitem.id.z()
  store volatile i32 %val0, i32 addrspace(1)* undef
  store volatile i32 %val1, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}use_workitem_id_yz:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @use_workitem_id_yz() #1 {
  %val0 = call i32 @llvm.amdgcn.workitem.id.y()
  %val1 = call i32 @llvm.amdgcn.workitem.id.z()
  store volatile i32 %val0, i32 addrspace(1)* undef
  store volatile i32 %val1, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
; GCN: enable_vgpr_workitem_id = 0

; GCN-NOT: v0
; GCN: s_swappc_b64
; GCN-NOT: v0
define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
  call void @use_workitem_id_x()
  ret void
}

; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
; GCN: enable_vgpr_workitem_id = 1

; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_mov_b32_e32 v0, v1
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
  call void @use_workitem_id_y()
  ret void
}

; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2

; GCN-NOT: v0
; GCN-NOT: v2
; GCN: v_mov_b32_e32 v0, v2
; GCN-NOT: v0
; GCN-NOT: v2
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
  call void @use_workitem_id_z()
  ret void
}
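
; Reading of the three kernels above: enable_vgpr_workitem_id = 0/1/2
; requests one, two, or three workitem IDs, so Y arrives in v1 and Z in
; v2; the kernel then copies the ID the callee wants into v0, where the
; callee expects its single ID input:
;   v_mov_b32_e32 v0, v1 ; forward workitem ID Y
;   v_mov_b32_e32 v0, v2 ; forward workitem ID Z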

; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x:
; GCN-NOT: v0
; GCN: s_swappc_b64
; GCN-NOT: v0
define void @func_indirect_use_workitem_id_x() #1 {
  call void @use_workitem_id_x()
  ret void
}

; GCN-LABEL: {{^}}func_indirect_use_workitem_id_y:
; GCN-NOT: v0
; GCN: s_swappc_b64
; GCN-NOT: v0
define void @func_indirect_use_workitem_id_y() #1 {
  call void @use_workitem_id_y()
  ret void
}

; GCN-LABEL: {{^}}func_indirect_use_workitem_id_z:
; GCN-NOT: v0
; GCN: s_swappc_b64
; GCN-NOT: v0
define void @func_indirect_use_workitem_id_z() #1 {
  call void @use_workitem_id_z()
  ret void
}

; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
  %val = call i32 @llvm.amdgcn.workitem.id.x()
  store volatile i32 %arg0, i32 addrspace(1)* undef
  store volatile i32 %val, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
  %val = call i32 @llvm.amdgcn.workitem.id.y()
  store volatile i32 %arg0, i32 addrspace(1)* undef
  store volatile i32 %val, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
; GCN: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
  %val = call i32 @llvm.amdgcn.workitem.id.z()
  store volatile i32 %arg0, i32 addrspace(1)* undef
  store volatile i32 %val, i32 addrspace(1)* undef
  ret void
}


; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
; GCN: enable_vgpr_workitem_id = 0

; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v0, 0x22b
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
  call void @other_arg_use_workitem_id_x(i32 555)
  ret void
}
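
; Ordering note for the kernel above: v0 holds workitem ID X on entry, so
; it must be moved into v1 (the callee's ID slot here) before v0 is
; overwritten with the first i32 argument (0x22b == 555):
;   v_mov_b32_e32 v1, v0     ; move ID X out of the argument register
;   v_mov_b32_e32 v0, 0x22b  ; i32 555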


; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
; GCN: enable_vgpr_workitem_id = 1

; GCN-NOT: v1
; GCN: v_mov_b32_e32 v0, 0x22b
; GCN-NOT: v1
; GCN: s_swappc_b64
; GCN-NOT: v0
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
  call void @other_arg_use_workitem_id_y(i32 555)
  ret void
}

; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2

; GCN: v_mov_b32_e32 v0, 0x22b
; GCN: v_mov_b32_e32 v1, v2
; GCN: s_swappc_b64
; GCN-NOT: v0
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
  call void @other_arg_use_workitem_id_z(i32 555)
  ret void
}

; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32

; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @too_many_args_use_workitem_id_x(
  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
  %val = call i32 @llvm.amdgcn.workitem.id.x()
  store volatile i32 %val, i32 addrspace(1)* undef

  store volatile i32 %arg0, i32 addrspace(1)* undef
  store volatile i32 %arg1, i32 addrspace(1)* undef
  store volatile i32 %arg2, i32 addrspace(1)* undef
  store volatile i32 %arg3, i32 addrspace(1)* undef
  store volatile i32 %arg4, i32 addrspace(1)* undef
  store volatile i32 %arg5, i32 addrspace(1)* undef
  store volatile i32 %arg6, i32 addrspace(1)* undef
  store volatile i32 %arg7, i32 addrspace(1)* undef

  store volatile i32 %arg8, i32 addrspace(1)* undef
  store volatile i32 %arg9, i32 addrspace(1)* undef
  store volatile i32 %arg10, i32 addrspace(1)* undef
  store volatile i32 %arg11, i32 addrspace(1)* undef
  store volatile i32 %arg12, i32 addrspace(1)* undef
  store volatile i32 %arg13, i32 addrspace(1)* undef
  store volatile i32 %arg14, i32 addrspace(1)* undef
  store volatile i32 %arg15, i32 addrspace(1)* undef

  store volatile i32 %arg16, i32 addrspace(1)* undef
  store volatile i32 %arg17, i32 addrspace(1)* undef
  store volatile i32 %arg18, i32 addrspace(1)* undef
  store volatile i32 %arg19, i32 addrspace(1)* undef
  store volatile i32 %arg20, i32 addrspace(1)* undef
  store volatile i32 %arg21, i32 addrspace(1)* undef
  store volatile i32 %arg22, i32 addrspace(1)* undef
  store volatile i32 %arg23, i32 addrspace(1)* undef

  store volatile i32 %arg24, i32 addrspace(1)* undef
  store volatile i32 %arg25, i32 addrspace(1)* undef
  store volatile i32 %arg26, i32 addrspace(1)* undef
  store volatile i32 %arg27, i32 addrspace(1)* undef
  store volatile i32 %arg28, i32 addrspace(1)* undef
  store volatile i32 %arg29, i32 addrspace(1)* undef
  store volatile i32 %arg30, i32 addrspace(1)* undef
  store volatile i32 %arg31, i32 addrspace(1)* undef

  ret void
}
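
; Reading of the function above: its 32 i32 arguments occupy v0-v31, so
; the workitem ID no longer fits in an argument VGPR and is passed on the
; stack instead; the callee reloads it from its incoming frame (offset 4
; in the CHECKs) and spills/reloads v32 around that use.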

; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
; GCN: enable_vgpr_workitem_id = 0

; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN: s_mov_b32 s4, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
  call void @too_many_args_use_workitem_id_x(
    i32 10, i32 20, i32 30, i32 40,
    i32 50, i32 60, i32 70, i32 80,
    i32 90, i32 100, i32 110, i32 120,
    i32 130, i32 140, i32 150, i32 160,
    i32 170, i32 180, i32 190, i32 200,
    i32 210, i32 220, i32 230, i32 240,
    i32 250, i32 260, i32 270, i32 280,
    i32 290, i32 300, i32 310, i32 320)
  ret void
}

; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
  store volatile i32 %arg0, i32 addrspace(1)* undef
  call void @too_many_args_use_workitem_id_x(
    i32 10, i32 20, i32 30, i32 40,
    i32 50, i32 60, i32 70, i32 80,
    i32 90, i32 100, i32 110, i32 120,
    i32 130, i32 140, i32 150, i32 160,
    i32 170, i32 180, i32 190, i32 200,
    i32 210, i32 220, i32 230, i32 240,
    i32 250, i32 260, i32 270, i32 280,
    i32 290, i32 300, i32 310, i32 320)
  ret void
}

; Requires loading and storing to a stack slot.
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN: s_add_u32 s32, s32, 0x400{{$}}

; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}

; GCN: s_swappc_b64

; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
; GCN: s_sub_u32 s32, s32, 0x400{{$}}
; GCN: s_setpc_b64
define void @too_many_args_call_too_many_args_use_workitem_id_x(
  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
  call void @too_many_args_use_workitem_id_x(
    i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
    i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
    i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
    i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31)
  ret void
}

; stack layout:
; frame[0] = emergency stack slot
; frame[1] = byval arg32
; frame[2] = stack passed workitem ID x
; frame[3] = VGPR spill slot
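
; With 4-byte slots, the layout above matches the offsets the CHECK lines
; below use: the byval at offset 4, the stack-passed workitem ID at 8,
; and the VGPR spill at 12, with the emergency slot occupying offset 0.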

; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
; GCN-NEXT: s_waitcnt
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
; GCN: buffer_load_dword v0, off, s[0:3], s5 offset:4
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload
; GCN: s_setpc_b64
define void @too_many_args_use_workitem_id_x_byval(
  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32* byval %arg32) #1 {
  %val = call i32 @llvm.amdgcn.workitem.id.x()
  store volatile i32 %val, i32 addrspace(1)* undef

  store volatile i32 %arg0, i32 addrspace(1)* undef
  store volatile i32 %arg1, i32 addrspace(1)* undef
  store volatile i32 %arg2, i32 addrspace(1)* undef
  store volatile i32 %arg3, i32 addrspace(1)* undef
  store volatile i32 %arg4, i32 addrspace(1)* undef
  store volatile i32 %arg5, i32 addrspace(1)* undef
  store volatile i32 %arg6, i32 addrspace(1)* undef
  store volatile i32 %arg7, i32 addrspace(1)* undef

  store volatile i32 %arg8, i32 addrspace(1)* undef
  store volatile i32 %arg9, i32 addrspace(1)* undef
  store volatile i32 %arg10, i32 addrspace(1)* undef
  store volatile i32 %arg11, i32 addrspace(1)* undef
  store volatile i32 %arg12, i32 addrspace(1)* undef
  store volatile i32 %arg13, i32 addrspace(1)* undef
  store volatile i32 %arg14, i32 addrspace(1)* undef
  store volatile i32 %arg15, i32 addrspace(1)* undef

  store volatile i32 %arg16, i32 addrspace(1)* undef
  store volatile i32 %arg17, i32 addrspace(1)* undef
  store volatile i32 %arg18, i32 addrspace(1)* undef
  store volatile i32 %arg19, i32 addrspace(1)* undef
  store volatile i32 %arg20, i32 addrspace(1)* undef
  store volatile i32 %arg21, i32 addrspace(1)* undef
  store volatile i32 %arg22, i32 addrspace(1)* undef
  store volatile i32 %arg23, i32 addrspace(1)* undef

  store volatile i32 %arg24, i32 addrspace(1)* undef
  store volatile i32 %arg25, i32 addrspace(1)* undef
  store volatile i32 %arg26, i32 addrspace(1)* undef
  store volatile i32 %arg27, i32 addrspace(1)* undef
  store volatile i32 %arg28, i32 addrspace(1)* undef
  store volatile i32 %arg29, i32 addrspace(1)* undef
  store volatile i32 %arg30, i32 addrspace(1)* undef
  store volatile i32 %arg31, i32 addrspace(1)* undef
  %private = load volatile i32, i32* %arg32
  ret void
}

; frame[0] = emergency stack slot
; frame[1] =

; sp[0] = callee emergency stack slot reservation
; sp[1] = byval
; sp[2] = ??
; sp[3] = stack passed workitem ID x
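
; Reading of the caller-side byval handling checked below: the kernel
; materializes 0x3e7 (999) into its own frame slot, copies that slot to
; the outgoing stack before the call, and separately stores the
; stack-passed workitem ID into its outgoing area; byval arguments are
; re-copied by the caller rather than passed by reference.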

; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
; GCN: enable_vgpr_workitem_id = 0

; GCN: s_mov_b32 s33, s7
; GCN: s_add_u32 s32, s33, 0x200{{$}}

; GCN-DAG: s_add_u32 s32, s32, 0x100{{$}}
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12

; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
  %alloca = alloca i32, align 4
  store volatile i32 999, i32* %alloca
  call void @too_many_args_use_workitem_id_x_byval(
    i32 10, i32 20, i32 30, i32 40,
    i32 50, i32 60, i32 70, i32 80,
    i32 90, i32 100, i32 110, i32 120,
    i32 130, i32 140, i32 150, i32 160,
    i32 170, i32 180, i32 190, i32 200,
    i32 210, i32 220, i32 230, i32 240,
    i32 250, i32 260, i32 270, i32 280,
    i32 290, i32 300, i32 310, i32 320,
    i32* %alloca)
  ret void
}

; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12

; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
  %alloca = alloca i32, align 4
  store volatile i32 999, i32* %alloca
  call void @too_many_args_use_workitem_id_x_byval(
    i32 10, i32 20, i32 30, i32 40,
    i32 50, i32 60, i32 70, i32 80,
    i32 90, i32 100, i32 110, i32 120,
    i32 130, i32 140, i32 150, i32 160,
    i32 170, i32 180, i32 190, i32 200,
    i32 210, i32 220, i32 230, i32 240,
    i32 250, i32 260, i32 270, i32 280,
    i32 290, i32 300, i32 310, i32 320,
    i32* %alloca)
  ret void
}

; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32

; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @too_many_args_use_workitem_id_xyz(
  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 {
  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
  store volatile i32 %val0, i32 addrspace(1)* undef
  %val1 = call i32 @llvm.amdgcn.workitem.id.y()
  store volatile i32 %val1, i32 addrspace(1)* undef
  %val2 = call i32 @llvm.amdgcn.workitem.id.z()
  store volatile i32 %val2, i32 addrspace(1)* undef

  store volatile i32 %arg0, i32 addrspace(1)* undef
  store volatile i32 %arg1, i32 addrspace(1)* undef
  store volatile i32 %arg2, i32 addrspace(1)* undef
  store volatile i32 %arg3, i32 addrspace(1)* undef
  store volatile i32 %arg4, i32 addrspace(1)* undef
  store volatile i32 %arg5, i32 addrspace(1)* undef
  store volatile i32 %arg6, i32 addrspace(1)* undef
  store volatile i32 %arg7, i32 addrspace(1)* undef

  store volatile i32 %arg8, i32 addrspace(1)* undef
  store volatile i32 %arg9, i32 addrspace(1)* undef
  store volatile i32 %arg10, i32 addrspace(1)* undef
  store volatile i32 %arg11, i32 addrspace(1)* undef
  store volatile i32 %arg12, i32 addrspace(1)* undef
  store volatile i32 %arg13, i32 addrspace(1)* undef
  store volatile i32 %arg14, i32 addrspace(1)* undef
  store volatile i32 %arg15, i32 addrspace(1)* undef

  store volatile i32 %arg16, i32 addrspace(1)* undef
  store volatile i32 %arg17, i32 addrspace(1)* undef
  store volatile i32 %arg18, i32 addrspace(1)* undef
  store volatile i32 %arg19, i32 addrspace(1)* undef
  store volatile i32 %arg20, i32 addrspace(1)* undef
  store volatile i32 %arg21, i32 addrspace(1)* undef
  store volatile i32 %arg22, i32 addrspace(1)* undef
  store volatile i32 %arg23, i32 addrspace(1)* undef

  store volatile i32 %arg24, i32 addrspace(1)* undef
  store volatile i32 %arg25, i32 addrspace(1)* undef
  store volatile i32 %arg26, i32 addrspace(1)* undef
  store volatile i32 %arg27, i32 addrspace(1)* undef
  store volatile i32 %arg28, i32 addrspace(1)* undef
  store volatile i32 %arg29, i32 addrspace(1)* undef
  store volatile i32 %arg30, i32 addrspace(1)* undef
  store volatile i32 %arg31, i32 addrspace(1)* undef

  ret void
}

; frame[0] = kernel emergency stack slot
; frame[1] = callee emergency stack slot
; frame[2] = ID X
; frame[3] = ID Y
; frame[4] = ID Z
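
; As the CHECKs below encode: with all 32 argument VGPRs taken and
; enable_vgpr_workitem_id = 2, the kernel writes X, Y, and Z from v0, v1,
; and v2 into consecutive 4-byte outgoing stack slots before s_swappc_b64,
; and the callee (too_many_args_use_workitem_id_xyz above) reads them back
; relative to its own frame register.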

; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
; GCN: enable_vgpr_workitem_id = 2

; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33

; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:12
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
  call void @too_many_args_use_workitem_id_xyz(
    i32 10, i32 20, i32 30, i32 40,
    i32 50, i32 60, i32 70, i32 80,
    i32 90, i32 100, i32 110, i32 120,
    i32 130, i32 140, i32 150, i32 160,
    i32 170, i32 180, i32 190, i32 200,
    i32 210, i32 220, i32 230, i32 240,
    i32 250, i32 260, i32 270, i32 280,
    i32 290, i32 300, i32 310, i32 320)
  ret void
}

; Workitem ID X stays in a register; Y and Z are passed on the stack.
; v31 = workitem ID X
; frame[0] = emergency slot
; frame[1] = workitem Y
; frame[2] = workitem Z
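
; Reading of the variant below: 31 register arguments (arg0-arg30) fill
; v0-v30, leaving exactly one argument VGPR free, so ID X stays in v31
; while Y and Z travel on the stack (frame offsets 4 and 8); the
; ScratchSize of 12 bytes matches the emergency slot plus those two ID
; slots.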

; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz:
; GCN: s_mov_b32 s5, s32
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:4{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:8{{$}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31

; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN: ScratchSize: 12
define void @too_many_args_use_workitem_id_x_stack_yz(
  i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
  i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
  i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
  i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30) #1 {
  %val0 = call i32 @llvm.amdgcn.workitem.id.x()
  store volatile i32 %val0, i32 addrspace(1)* undef
  %val1 = call i32 @llvm.amdgcn.workitem.id.y()
  store volatile i32 %val1, i32 addrspace(1)* undef
  %val2 = call i32 @llvm.amdgcn.workitem.id.z()
  store volatile i32 %val2, i32 addrspace(1)* undef

  store volatile i32 %arg0, i32 addrspace(1)* undef
  store volatile i32 %arg1, i32 addrspace(1)* undef
  store volatile i32 %arg2, i32 addrspace(1)* undef
  store volatile i32 %arg3, i32 addrspace(1)* undef
  store volatile i32 %arg4, i32 addrspace(1)* undef
  store volatile i32 %arg5, i32 addrspace(1)* undef
  store volatile i32 %arg6, i32 addrspace(1)* undef
  store volatile i32 %arg7, i32 addrspace(1)* undef

  store volatile i32 %arg8, i32 addrspace(1)* undef
  store volatile i32 %arg9, i32 addrspace(1)* undef
  store volatile i32 %arg10, i32 addrspace(1)* undef
  store volatile i32 %arg11, i32 addrspace(1)* undef
  store volatile i32 %arg12, i32 addrspace(1)* undef
  store volatile i32 %arg13, i32 addrspace(1)* undef
  store volatile i32 %arg14, i32 addrspace(1)* undef
  store volatile i32 %arg15, i32 addrspace(1)* undef

  store volatile i32 %arg16, i32 addrspace(1)* undef
  store volatile i32 %arg17, i32 addrspace(1)* undef
  store volatile i32 %arg18, i32 addrspace(1)* undef
  store volatile i32 %arg19, i32 addrspace(1)* undef
  store volatile i32 %arg20, i32 addrspace(1)* undef
  store volatile i32 %arg21, i32 addrspace(1)* undef
  store volatile i32 %arg22, i32 addrspace(1)* undef
  store volatile i32 %arg23, i32 addrspace(1)* undef

  store volatile i32 %arg24, i32 addrspace(1)* undef
  store volatile i32 %arg25, i32 addrspace(1)* undef
  store volatile i32 %arg26, i32 addrspace(1)* undef
  store volatile i32 %arg27, i32 addrspace(1)* undef
  store volatile i32 %arg28, i32 addrspace(1)* undef
  store volatile i32 %arg29, i32 addrspace(1)* undef
  store volatile i32 %arg30, i32 addrspace(1)* undef

  ret void
}

; frame[0] = kernel emergency stack slot
; frame[1] = callee emergency stack slot
; frame[2] = ID Y
; frame[3] = ID Z

; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
; GCN: enable_vgpr_workitem_id = 2

; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33

; GCN-DAG: v_mov_b32_e32 v31, v0
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
  call void @too_many_args_use_workitem_id_x_stack_yz(
    i32 10, i32 20, i32 30, i32 40,
    i32 50, i32 60, i32 70, i32 80,
    i32 90, i32 100, i32 110, i32 120,
    i32 130, i32 140, i32 150, i32 160,
    i32 170, i32 180, i32 190, i32 200,
    i32 210, i32 220, i32 230, i32 240,
    i32 250, i32 260, i32 270, i32 280,
    i32 290, i32 300, i32 310)
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workitem.id.y() #0
declare i32 @llvm.amdgcn.workitem.id.z() #0

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind noinline }