diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 314ef721c1fc..b96c48f0561d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -452,18 +452,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
       S_00B848_PRIV(ProgInfo.Priv) |
       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
-      S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
+      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 
+  // 0 = X, 1 = XY, 2 = XYZ
+  unsigned TIDIGCompCnt = 0;
+  if (MFI->hasWorkItemIDZ())
+    TIDIGCompCnt = 2;
+  else if (MFI->hasWorkItemIDY())
+    TIDIGCompCnt = 1;
+
   ProgInfo.ComputePGMRSrc2 =
       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
-      S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
-      S_00B84C_TGID_X_EN(1) |
-      S_00B84C_TGID_Y_EN(1) |
-      S_00B84C_TGID_Z_EN(1) |
-      S_00B84C_TG_SIZE_EN(1) |
-      S_00B84C_TIDIG_COMP_CNT(2) |
-      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
+      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
+      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
+      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
+      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
+      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
+      S_00B84C_EXCP_EN_MSB(0) |
+      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+      S_00B84C_EXCP_EN(0);
 }
 
 static unsigned getRsrcReg(unsigned ShaderType) {
@@ -524,9 +533,44 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
   header.compute_pgm_resource_registers =
       KernelInfo.ComputePGMRSrc1 |
       (KernelInfo.ComputePGMRSrc2 << 32);
 
-  header.code_properties =
-      AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
-      AMD_CODE_PROPERTY_IS_PTR64;
+  header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+
+  if (MFI->hasPrivateSegmentBuffer()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+  }
+
+  if (MFI->hasDispatchPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+
+  if (MFI->hasQueuePtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+
+  if (MFI->hasKernargSegmentPtr())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+
+  if (MFI->hasDispatchID())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+
+  if (MFI->hasFlatScratchInit())
+    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+
+  // TODO: Private segment size
+
+  if (MFI->hasGridWorkgroupCountX()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+  }
+
+  if (MFI->hasGridWorkgroupCountY()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+  }
+
+  if (MFI->hasGridWorkgroupCountZ()) {
+    header.code_properties |=
+      AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+  }
 
   if (MFI->hasDispatchPtr())
     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
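Note on the TIDIG_COMP_CNT change above: the field is a 2-bit count rather than a mask, so only the X, XY, and XYZ work-item combinations are encodable, which is also why the SIMachineFunctionInfo constructor later in this patch forces WorkItemIDY on whenever WorkItemIDZ is set. A minimal standalone sketch of the encoding; the function name and the bool parameters standing in for the MFI queries are illustrative, not LLVM API:

    #include <cassert>

    // COMPUTE_PGM_RSRC2.TIDIG_COMP_CNT: 0 = X, 1 = XY, 2 = XYZ.
    static unsigned computeTIDIGCompCnt(bool HasWorkItemIDY, bool HasWorkItemIDZ) {
      assert((!HasWorkItemIDZ || HasWorkItemIDY) && "XZ alone is not encodable");
      if (HasWorkItemIDZ)
        return 2; // v0 = X, v1 = Y, v2 = Z
      if (HasWorkItemIDY)
        return 1; // v0 = X, v1 = Y
      return 0;   // v0 = X only
    }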
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index aa3909060e52..6aa4fddd3ec4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1062,14 +1062,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
   SDLoc DL(Addr);
 
   MachineFunction &MF = CurDAG->getMachineFunction();
-  const SIRegisterInfo *TRI =
-      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  unsigned ScratchOffsetReg = TRI->getPreloadedValue(
-      MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
-  SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32);
+  SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
 
   // (add n0, c1)
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 6aff4b5700d4..6b3c81c3af74 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -36,6 +36,16 @@ static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
   return true;
 }
 
+static ArrayRef<MCPhysReg> getAllSGPR128() {
+  return makeArrayRef(AMDGPU::SReg_128RegClass.begin(),
+                      AMDGPU::SReg_128RegClass.getNumRegs());
+}
+
+static ArrayRef<MCPhysReg> getAllSGPRs() {
+  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
+                      AMDGPU::SGPR_32RegClass.getNumRegs());
+}
+
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
   if (!MF.getFrameInfo()->hasStackObjects())
@@ -43,7 +53,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
 
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // If we only have SGPR spills, we won't actually be using scratch memory
   // since these spill to VGPRs.
@@ -56,31 +66,159 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   const SIInstrInfo *TII =
       static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
   assert(ScratchRsrcReg != AMDGPU::NoRegister);
 
-  uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+  assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
+    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
+  if (ST.isAmdHsaOS()) {
+    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
+      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+  }
+
+  // If we reserved the original input registers, we don't need to copy to the
+  // reserved registers.
+  if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
+    // We should always reserve these 5 registers at the same time.
+    assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
+           "scratch wave offset and private segment buffer inconsistent");
+    return;
+  }
+
+
+  // We added live-ins during argument lowering, but since they were not used
+  // they were deleted. We're adding the uses now, so add them back.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+
+  if (ST.isAmdHsaOS()) {
+    MRI.addLiveIn(PreloadedPrivateBufferReg);
+    MBB.addLiveIn(PreloadedPrivateBufferReg);
+  }
+
+  // We reserved the last registers for this. Shift it down to the end of those
+  // which were actually used.
+  //
+  // FIXME: It might be safer to use a pseudoregister before replacement.
+
+  // FIXME: We should be able to eliminate unused input registers. We only
+  // cannot do this for the resources required for scratch access. For now we
+  // skip over user SGPRs and may leave unused holes.
+
+  // We find the resource first because it has an alignment requirement.
+  if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+    // Skip the last 2 elements because the last one is reserved for VCC, and
+    // this is the 2nd to last element already.
+    for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+      // Pick the first unallocated one. Make sure we don't clobber the other
+      // reserved input we needed.
+      if (!MRI.isPhysRegUsed(Reg)) {
+        assert(MRI.isAllocatable(Reg));
+        MRI.replaceRegWith(ScratchRsrcReg, Reg);
+        ScratchRsrcReg = Reg;
+        MFI->setScratchRSrcReg(ScratchRsrcReg);
+        break;
+      }
+    }
+  }
+
+  if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    // Skip the last 2 elements because the last one is reserved for VCC, and
+    // this is the 2nd to last element already.
+    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+    for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+      // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+      // scratch descriptor, since we haven't added its uses yet.
+      if (!MRI.isPhysRegUsed(Reg)) {
+        assert(MRI.isAllocatable(Reg) &&
+               !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+
+        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+        ScratchWaveOffsetReg = Reg;
+        MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+        break;
+      }
+    }
+  }
+
+
+  assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
+
+  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
   MachineBasicBlock::iterator I = MBB.begin();
   DebugLoc DL;
 
-  unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
-  unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
-  unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
-  unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+  if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
+    // Make sure we emit the copy for the offset first. We may have chosen to
+    // copy the buffer resource into a register that aliases the input offset
+    // register.
+    BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg)
+      .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+  }
 
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
-    .addExternalSymbol("SCRATCH_RSRC_DWORD0");
+  if (ST.isAmdHsaOS()) {
+    // Insert copies from argument register.
+    assert(
+      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
+      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
 
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
-    .addExternalSymbol("SCRATCH_RSRC_DWORD1");
+    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+    unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3);
 
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
-    .addImm(Rsrc23 & 0xffffffff);
+    unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1);
+    unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3);
 
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
-    .addImm(Rsrc23 >> 32);
+    const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64);
+
+    BuildMI(MBB, I, DL, SMovB64, Rsrc01)
+      .addReg(Lo, RegState::Kill);
+    BuildMI(MBB, I, DL, SMovB64, Rsrc23)
+      .addReg(Hi, RegState::Kill);
+  } else {
+    unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+    unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
+    // Use relocations to get the pointer, and setup the other bits manually.
+    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+    BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+      .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+      .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
+      .addImm(Rsrc23 & 0xffffffff)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
+      .addImm(Rsrc23 >> 32)
+      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+  }
+
+  // Make the register selected live throughout the function.
+  for (MachineBasicBlock &OtherBB : MF) {
+    if (&OtherBB == &MBB)
+      continue;
+
+    OtherBB.addLiveIn(ScratchRsrcReg);
+    OtherBB.addLiveIn(ScratchWaveOffsetReg);
+  }
 }
 
 void SIFrameLowering::processFunctionBeforeFrameFinalized(
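In the else-branch above (the non-HSA path), only the first two words of the scratch resource descriptor come from the SCRATCH_RSRC_DWORD relocations; words 2 and 3 are immediates from getScratchRsrcWords23(). A standalone sketch of how that 64-bit value decomposes, using the word-3 constants the CI/VI tests later in this patch expect (0x80f000 and 0x800000); the helper and its IsVI flag are illustrative, not the LLVM implementation:

    #include <cstdint>

    // Word 2 holds NUM_RECORDS = -1 (the tests expect s_mov_b32 sN, -1) and
    // word 3 holds the stride/format configuration, which differs between
    // CI and VI.
    static uint64_t scratchRsrcWords23(bool IsVI) {
      uint64_t Word2 = 0xffffffffu;
      uint64_t Word3 = IsVI ? 0x800000u : 0x80f000u;
      return Word2 | (Word3 << 32);
    }

The prologue then splits it exactly as the else-branch does: Rsrc2 receives Rsrc23 & 0xffffffff and Rsrc3 receives Rsrc23 >> 32.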
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index de52b483bbbc..05b54d0bae51 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -542,6 +542,11 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                      Align); // Alignment
 }
 
+static ArrayRef<MCPhysReg> getAllSGPRs() {
+  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
+                      AMDGPU::SGPR_32RegClass.getNumRegs());
+}
+
 SDValue SITargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
@@ -619,39 +624,30 @@ SDValue SITargetLowering::LowerFormalArguments(
     CCInfo.AllocateReg(AMDGPU::VGPR1);
   }
 
-  // The pointer to the list of arguments is stored in SGPR0, SGPR1
-  // The pointer to the scratch buffer is stored in SGPR2, SGPR3
-  if (Info->getShaderType() == ShaderType::COMPUTE) {
-    if (Subtarget->isAmdHsaOS())
-      Info->NumUserSGPRs += 4; // FIXME: Need to support scratch buffers.
-    else
-      Info->NumUserSGPRs += 4;
-
-    unsigned InputPtrReg =
-      TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
-    unsigned InputPtrRegLo =
-      TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
-    unsigned InputPtrRegHi =
-      TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
-
-    CCInfo.AllocateReg(InputPtrRegLo);
-    CCInfo.AllocateReg(InputPtrRegHi);
-    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
-
-    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-    if (MFI->hasDispatchPtr()) {
-      unsigned DispatchPtrReg
-        = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
-      MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
-    }
-  }
-
   if (Info->getShaderType() == ShaderType::COMPUTE) {
     getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                             Splits);
   }
 
+  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+  if (Info->hasPrivateSegmentBuffer()) {
+    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+    CCInfo.AllocateReg(PrivateSegmentBufferReg);
+  }
+
+  if (Info->hasDispatchPtr()) {
+    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
+    CCInfo.AllocateReg(DispatchPtrReg);
+  }
+
+  if (Info->hasKernargSegmentPtr()) {
+    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+    MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
+    CCInfo.AllocateReg(InputPtrReg);
+  }
+
   AnalyzeFormalArguments(CCInfo, Splits);
 
   SmallVector<SDValue, 16> Chains;
 
@@ -739,14 +735,114 @@ SDValue SITargetLowering::LowerFormalArguments(
     InVals.push_back(Val);
   }
 
-  if (Info->getShaderType() != ShaderType::COMPUTE) {
-    unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef(
-      AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
-    Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
+  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+  // these from the dispatch pointer.
+
+  // Start adding system SGPRs.
+  if (Info->hasWorkGroupIDX()) {
+    unsigned Reg = Info->addWorkGroupIDX();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  } else
+    llvm_unreachable("work group id x is always enabled");
+
+  if (Info->hasWorkGroupIDY()) {
+    unsigned Reg = Info->addWorkGroupIDY();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(Reg);
   }
 
-  if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info))
-    Info->setScratchRSrcReg(TRI);
+  if (Info->hasWorkGroupIDZ()) {
+    unsigned Reg = Info->addWorkGroupIDZ();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasWorkGroupInfo()) {
+    unsigned Reg = Info->addWorkGroupInfo();
+    MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasPrivateSegmentWaveByteOffset()) {
+    // Scratch wave offset passed in system SGPR.
+    unsigned PrivateSegmentWaveByteOffsetReg
+      = Info->addPrivateSegmentWaveByteOffset();
+
+    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
+    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
+  }
+
+  // Now that we've figured out where the scratch register inputs are, see if
+  // we should reserve the arguments and use them directly.
+
+  bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+
+  if (ST.isAmdHsaOS()) {
+    // TODO: Assume we will spill without optimizations.
+    if (HasStackObjects) {
+      // If we have stack objects, we unquestionably need the private buffer
+      // resource. For the HSA ABI, this will be the first 4 user SGPR
+      // inputs. We can reserve those and use them directly.
+
+      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+        MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+      Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+
+      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+    } else {
+      unsigned ReservedBufferReg
+        = TRI->reservedPrivateSegmentBufferReg(MF);
+      unsigned ReservedOffsetReg
+        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+
+      // We tentatively reserve the last registers (skipping the last two
+      // which may contain VCC). After register allocation, we'll replace
+      // these with the ones immediately after those which were really
+      // allocated. In the prologue copies will be inserted from the argument
+      // to these reserved registers.
+      Info->setScratchRSrcReg(ReservedBufferReg);
+      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+    }
+  } else {
+    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+
+    // Without HSA, relocations are used for the scratch pointer and the
+    // buffer resource setup is always inserted in the prologue. Scratch wave
+    // offset is still in an input SGPR.
+    Info->setScratchRSrcReg(ReservedBufferReg);
+
+    if (HasStackObjects) {
+      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+        MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+    } else {
+      unsigned ReservedOffsetReg
+        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+    }
+  }
+
+  if (Info->hasWorkItemIDX()) {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  } else
+    llvm_unreachable("workitem id x should always be enabled");
+
+  if (Info->hasWorkItemIDY()) {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+
+  if (Info->hasWorkItemIDZ()) {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
 
   if (Chains.empty())
     return Chain;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 5350edbd74e2..e1668649139d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -551,16 +551,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
-  unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
-    *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
   unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
   MFI->setHasSpilledVGPRs();
 
   BuildMI(MBB, MI, DL, get(Opcode))
     .addReg(SrcReg)                         // src
     .addFrameIndex(FrameIndex)              // frame_idx
-    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
-    .addReg(ScratchOffsetPreloadReg)        // scratch_offset
+    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
+    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
     .addMemOperand(MMO);
 }
 
@@ -638,14 +635,11 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
-  unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
-    *MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
   unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
     .addFrameIndex(FrameIndex)              // frame_idx
-    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
-    .addReg(ScratchOffsetPreloadReg)        // scratch_offset
+    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
+    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
     .addMemOperand(MMO);
 }
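The user-SGPR bookkeeping introduced in the next file assigns registers in call order: four for the 128-bit private segment buffer and two per 64-bit pointer. A small standalone model of that numbering follows; it ignores the 128-bit alignment the real buffer register needs (trivially satisfied at index 0), and the struct name is illustrative, not LLVM API:

    #include <cstdio>

    // Mirrors getNextUserSGPR() from SIMachineFunctionInfo.h below: each add*()
    // helper hands out the next free user SGPR index and bumps the count.
    struct UserSGPRModel {
      unsigned NumUserSGPRs = 0;
      unsigned add(unsigned Size) { // Size in 32-bit registers.
        unsigned First = NumUserSGPRs;
        NumUserSGPRs += Size;
        return First;
      }
    };

    int main() {
      UserSGPRModel M;
      unsigned Buf = M.add(4);     // private segment buffer -> s[0:3]
      unsigned Kernarg = M.add(2); // kernarg segment ptr    -> s[4:5]
      std::printf("buffer at s[%u:%u], kernarg at s[%u:%u]\n",
                  Buf, Buf + 3, Kernarg, Kernarg + 1);
    }

This is why the hsa.ll and llvm.amdgcn.dispatch.ptr.ll tests in this patch now expect kernel argument loads from s[4:5] instead of s[0:1].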
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index d042844aa138..935aad427198 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -30,15 +30,33 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     TIDReg(AMDGPU::NoRegister),
     ScratchRSrcReg(AMDGPU::NoRegister),
+    ScratchWaveOffsetReg(AMDGPU::NoRegister),
+    PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
+    DispatchPtrUserSGPR(AMDGPU::NoRegister),
+    QueuePtrUserSGPR(AMDGPU::NoRegister),
+    KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
+    DispatchIDUserSGPR(AMDGPU::NoRegister),
+    FlatScratchInitUserSGPR(AMDGPU::NoRegister),
+    PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
+    GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
+    WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
+    WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
+    PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     LDSWaveSpillSize(0),
     PSInputAddr(0),
     NumUserSGPRs(0),
+    NumSystemSGPRs(0),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
+    PrivateSegmentBuffer(false),
     DispatchPtr(false),
     QueuePtr(false),
     DispatchID(false),
-    KernargSegmentPtr(true),
+    KernargSegmentPtr(false),
     FlatScratchInit(false),
     GridWorkgroupCountX(false),
     GridWorkgroupCountY(false),
@@ -47,13 +65,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     WorkGroupIDY(false),
     WorkGroupIDZ(false),
     WorkGroupInfo(false),
+    PrivateSegmentWaveByteOffset(false),
     WorkItemIDX(true),
     WorkItemIDY(false),
     WorkItemIDZ(false) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   const Function *F = MF.getFunction();
 
-  if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
-    DispatchPtr = true;
+  const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+
+  if (getShaderType() == ShaderType::COMPUTE)
+    KernargSegmentPtr = true;
 
   if (F->hasFnAttribute("amdgpu-work-group-id-y"))
     WorkGroupIDY = true;
@@ -66,14 +88,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   if (F->hasFnAttribute("amdgpu-work-item-id-z"))
     WorkItemIDZ = true;
 
+  bool MaySpill = ST.isVGPRSpillingEnabled(this);
+  bool HasStackObjects = FrameInfo->hasStackObjects();
+
+  if (HasStackObjects || MaySpill)
+    PrivateSegmentWaveByteOffset = true;
+
+  if (ST.isAmdHsaOS()) {
+    if (HasStackObjects || MaySpill)
+      PrivateSegmentBuffer = true;
+
+    if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
+      DispatchPtr = true;
+  }
+
+  // X, XY, and XYZ are the only supported combinations, so make sure Y is
+  // enabled if Z is.
+  if (WorkItemIDZ)
+    WorkItemIDY = true;
 }
 
-void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) {
-  // We need to round up to next multiple of 4.
-  unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4);
-  unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128);
-  ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0,
-                                            &AMDGPU::SReg_128RegClass);
+unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+  const SIRegisterInfo &TRI) {
+  PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+  NumUserSGPRs += 4;
+  return PrivateSegmentBufferUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+  DispatchPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return DispatchPtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+  QueuePtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return QueuePtrUserSGPR;
+}
+
+unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+  KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return KernargSegmentPtrUserSGPR;
 }
 
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 377c5ce94846..9c528d63bd0e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -26,10 +26,36 @@ class MachineRegisterInfo;
 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+  // FIXME: This should be removed and getPreloadedValue moved here.
+  friend struct SIRegisterInfo;
   void anchor() override;
 
   unsigned TIDReg;
+
+  // Registers that may be reserved for spilling purposes. These may be the same
+  // as the input registers.
   unsigned ScratchRSrcReg;
+  unsigned ScratchWaveOffsetReg;
+
+  // Input registers setup for the HSA ABI.
+  // User SGPRs in allocation order.
+  unsigned PrivateSegmentBufferUserSGPR;
+  unsigned DispatchPtrUserSGPR;
+  unsigned QueuePtrUserSGPR;
+  unsigned KernargSegmentPtrUserSGPR;
+  unsigned DispatchIDUserSGPR;
+  unsigned FlatScratchInitUserSGPR;
+  unsigned PrivateSegmentSizeUserSGPR;
+  unsigned GridWorkGroupCountXUserSGPR;
+  unsigned GridWorkGroupCountYUserSGPR;
+  unsigned GridWorkGroupCountZUserSGPR;
+
+  // System SGPRs in allocation order.
+  unsigned WorkGroupIDXSystemSGPR;
+  unsigned WorkGroupIDYSystemSGPR;
+  unsigned WorkGroupIDZSystemSGPR;
+  unsigned WorkGroupInfoSystemSGPR;
+  unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
 
 public:
   // FIXME: Make private
@@ -38,12 +64,14 @@ public:
   std::map<unsigned, unsigned> LaneVGPRs;
   unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs;
+  unsigned NumSystemSGPRs;
 
 private:
   bool HasSpilledSGPRs;
   bool HasSpilledVGPRs;
 
-  // Feature bits required for inputs passed in user / system SGPRs.
+  // Feature bits required for inputs passed in user SGPRs.
+  bool PrivateSegmentBuffer : 1;
   bool DispatchPtr : 1;
   bool QueuePtr : 1;
   bool DispatchID : 1;
@@ -53,15 +81,27 @@ private:
   bool GridWorkgroupCountY : 1;
   bool GridWorkgroupCountZ : 1;
 
+  // Feature bits required for inputs passed in system SGPRs.
   bool WorkGroupIDX : 1; // Always initialized.
   bool WorkGroupIDY : 1;
   bool WorkGroupIDZ : 1;
   bool WorkGroupInfo : 1;
+  bool PrivateSegmentWaveByteOffset : 1;
 
   bool WorkItemIDX : 1; // Always initialized.
   bool WorkItemIDY : 1;
   bool WorkItemIDZ : 1;
+
+  MCPhysReg getNextUserSGPR() const {
+    assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
+    return AMDGPU::SGPR0 + NumUserSGPRs;
+  }
+
+  MCPhysReg getNextSystemSGPR() const {
+    return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
+  }
+
 public:
   struct SpilledReg {
     unsigned VGPR;
@@ -80,6 +120,47 @@ public:
   unsigned getTIDReg() const { return TIDReg; };
   void setTIDReg(unsigned Reg) { TIDReg = Reg; }
 
+  // Add user SGPRs.
+  unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
+  unsigned addDispatchPtr(const SIRegisterInfo &TRI);
+  unsigned addQueuePtr(const SIRegisterInfo &TRI);
+  unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+
+  // Add system SGPRs.
+  unsigned addWorkGroupIDX() {
+    WorkGroupIDXSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDXSystemSGPR;
+  }
+
+  unsigned addWorkGroupIDY() {
+    WorkGroupIDYSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDYSystemSGPR;
+  }
+
+  unsigned addWorkGroupIDZ() {
+    WorkGroupIDZSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupIDZSystemSGPR;
+  }
+
+  unsigned addWorkGroupInfo() {
+    WorkGroupInfoSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return WorkGroupInfoSystemSGPR;
+  }
+
+  unsigned addPrivateSegmentWaveByteOffset() {
+    PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR();
+    NumSystemSGPRs += 1;
+    return PrivateSegmentWaveByteOffsetSystemSGPR;
+  }
+
+  bool hasPrivateSegmentBuffer() const {
+    return PrivateSegmentBuffer;
+  }
+
   bool hasDispatchPtr() const {
     return DispatchPtr;
   }
@@ -128,6 +209,10 @@ public:
     return WorkGroupInfo;
   }
 
+  bool hasPrivateSegmentWaveByteOffset() const {
+    return PrivateSegmentWaveByteOffset;
+  }
+
   bool hasWorkItemIDX() const {
     return WorkItemIDX;
   }
@@ -140,13 +225,37 @@ public:
     return WorkItemIDZ;
   }
 
+  unsigned getNumUserSGPRs() const {
+    return NumUserSGPRs;
+  }
+
+  unsigned getNumPreloadedSGPRs() const {
+    return NumUserSGPRs + NumSystemSGPRs;
+  }
+
+  unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
+    return PrivateSegmentWaveByteOffsetSystemSGPR;
+  }
+
   /// \brief Returns the physical register reserved for use as the resource
   /// descriptor for scratch accesses.
   unsigned getScratchRSrcReg() const {
     return ScratchRSrcReg;
   }
 
-  void setScratchRSrcReg(const SIRegisterInfo *TRI);
+  void setScratchRSrcReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchRSrcReg = Reg;
+  }
+
+  unsigned getScratchWaveOffsetReg() const {
+    return ScratchWaveOffsetReg;
+  }
+
+  void setScratchWaveOffsetReg(unsigned Reg) {
+    assert(Reg != AMDGPU::NoRegister && "Should never be unset");
+    ScratchWaveOffsetReg = Reg;
+  }
 
   bool hasSpilledSGPRs() const {
     return HasSpilledSGPRs;
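The reservedPrivateSegmentBufferReg() logic in the next file picks the tentative descriptor location with simple alignment arithmetic. A standalone sketch using the VI numbers from its comment (flat_scr at 98/99 and vcc at 100/101, so the first off-limits SGPR is 98); the helper name is illustrative:

    // Highest 4-register-aligned SGPR tuple that ends below the first reserved
    // register: 98 -> s[92:95], since s[96:99] would overlap flat_scr at 98/99.
    static unsigned lastAlignedTupleBase(unsigned FirstReserved) {
      return (FirstReserved / 4) * 4 - 4;
    }

The wave offset register is then the last scalar register before those reservations (SGPR97 on VI), which also stays clear of the s[92:95] descriptor tuple.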
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 68629cd50995..d9799812d453 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -32,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
     Reserved.set(*R);
 }
 
+unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
+  const MachineFunction &MF) const {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  if (ST.hasSGPRInitBug()) {
+    unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
+    unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+    return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+  }
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
+    // next sgpr128 down.
+    return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
+  }
+
+  return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+  const MachineFunction &MF) const {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  if (ST.hasSGPRInitBug()) {
+    unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+    return AMDGPU::SGPR_32RegClass.getRegister(Idx);
+  }
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    // Next register before reservations for flat_scr and vcc.
+    return AMDGPU::SGPR97;
+  }
+
+  return AMDGPU::SGPR95;
+}
+
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
@@ -69,19 +103,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   }
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
+    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
+    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
+  }
+
   unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
   if (ScratchRSrcReg != AMDGPU::NoRegister) {
-    unsigned ScratchOffsetPreloadReg
-      = getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-    // We will need to use this user SGPR argument for spilling, and thus never
-    // want it to be spilled.
-    reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg);
-
     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
     // to spill.
     // TODO: May need to reserve a VGPR if doing LDS spilling.
     reserveRegisterTuples(Reserved, ScratchRSrcReg);
-    assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg));
+    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
   }
 
   return Reserved;
@@ -204,11 +239,10 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
 
     unsigned SubReg = NumSubRegs > 1 ?
       getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
      Value;
-    bool IsKill = (i == e - 1);
 
     BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
       .addReg(SubReg, getDefRegState(IsLoad))
-      .addReg(ScratchRsrcReg, getKillRegState(IsKill))
+      .addReg(ScratchRsrcReg)
       .addReg(SOffset)
       .addImm(Offset)
       .addImm(0) // glc
@@ -526,6 +560,9 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
   return OpType == AMDGPU::OPERAND_REG_INLINE_C;
 }
 
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
 unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
                                            enum PreloadedValue Value) const {
 
@@ -533,29 +570,36 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Value) {
   case SIRegisterInfo::WORKGROUP_ID_X:
-    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
+    assert(MFI->hasWorkGroupIDX());
+    return MFI->WorkGroupIDXSystemSGPR;
   case SIRegisterInfo::WORKGROUP_ID_Y:
-    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
+    assert(MFI->hasWorkGroupIDY());
+    return MFI->WorkGroupIDYSystemSGPR;
   case SIRegisterInfo::WORKGROUP_ID_Z:
-    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
+    assert(MFI->hasWorkGroupIDZ());
+    return MFI->WorkGroupIDZSystemSGPR;
   case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
-    if (MFI->getShaderType() != ShaderType::COMPUTE)
-      return MFI->ScratchOffsetReg;
-    return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
+    return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
   case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
-    llvm_unreachable("currently unused");
+    assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
+    assert(MFI->hasPrivateSegmentBuffer());
+    return MFI->PrivateSegmentBufferUserSGPR;
   case SIRegisterInfo::KERNARG_SEGMENT_PTR:
-    return ST.isAmdHsaOS() ? AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1;
+    assert(MFI->hasKernargSegmentPtr());
+    return MFI->KernargSegmentPtrUserSGPR;
   case SIRegisterInfo::DISPATCH_PTR:
     assert(MFI->hasDispatchPtr());
-    return AMDGPU::SGPR0_SGPR1;
+    return MFI->DispatchPtrUserSGPR;
   case SIRegisterInfo::QUEUE_PTR:
     llvm_unreachable("not implemented");
   case SIRegisterInfo::WORKITEM_ID_X:
+    assert(MFI->hasWorkItemIDX());
     return AMDGPU::VGPR0;
   case SIRegisterInfo::WORKITEM_ID_Y:
+    assert(MFI->hasWorkItemIDY());
    return AMDGPU::VGPR1;
   case SIRegisterInfo::WORKITEM_ID_Z:
+    assert(MFI->hasWorkItemIDZ());
     return AMDGPU::VGPR2;
   }
   llvm_unreachable("unexpected preloaded value type");
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index bb93242b0f88..eafe4053e878 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -29,6 +29,15 @@ private:
 public:
   SIRegisterInfo();
 
+  /// Return the end register initially reserved for the scratch buffer in case
+  /// spilling is needed.
+  unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
+
+  /// Return the end register initially reserved for the scratch wave offset in
+  /// case spilling is needed.
+  unsigned reservedPrivateSegmentWaveByteOffsetReg(
+    const MachineFunction &MF) const;
+
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   unsigned getRegPressureSetLimit(const MachineFunction &MF,
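A quick cross-check of the .amd_kernel_code_t expectations in the tests that follow: with the HSA defaults this patch establishes, a kernel with stack objects enables the private segment buffer (4 user SGPRs) and the kernarg segment pointer (2), giving compute_pgm_rsrc2_user_sgpr = 6; the system SGPRs then start at s6 with workgroup ID X, so the scratch wave offset lands in s7, matching the buffer_store/buffer_load operands below. The helper is illustrative arithmetic, not LLVM API:

    // User SGPRs precede system SGPRs, so with buffer + kernarg enabled:
    //   user count = 4 + 2 = 6, tgid_x in s6, scratch wave offset in s7.
    static unsigned countUserSGPRs(bool PrivateSegmentBuffer, bool DispatchPtr,
                                   bool KernargSegmentPtr) {
      return (PrivateSegmentBuffer ? 4 : 0) + (DispatchPtr ? 2 : 0) +
             (KernargSegmentPtr ? 2 : 0);
    }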
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index ab87fdbc00da..d9bb586163dc 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -38,8 +38,10 @@
 ; HSA: .amdgpu_hsa_kernel simple
 ; HSA: {{^}}simple:
 ; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: .end_amd_kernel_code_t
-; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0
+; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 
 ; Make sure we are setting the ATC bit:
 ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 5e8cf5bb3d25..c348a2e7980f 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,31 +1,46 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
-; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
 
 ; FIXME: align on alloca seems to be ignored for private_segment_alignment
 
 ; ALL-LABEL: {{^}}large_alloca_compute_shader:
 
-; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s14, -1
-; CI: s_mov_b32 s15, 0x80f000
-; VI: s_mov_b32 s15, 0x800000
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s10, -1
+; CI: s_mov_b32 s11, 0x80f000
+; VI: s_mov_b32 s11, 0x800000
 
 ; GCNHSA: .amd_kernel_code_t
+
+; GCNHSA: compute_pgm_rsrc2_scratch_en = 1
+; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6
+; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1
+; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0
+; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0
+; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0
+; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+
+; GCNHSA: enable_sgpr_private_segment_buffer = 1
+; GCNHSA: enable_sgpr_dispatch_ptr = 0
+; GCNHSA: enable_sgpr_queue_ptr = 0
+; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
+; GCNHSA: enable_sgpr_dispatch_id = 0
+; GCNHSA: enable_sgpr_flat_scratch_init = 0
+; GCNHSA: enable_sgpr_private_segment_size = 0
+; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
+; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
+; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0
+; GCNHSA: workitem_private_segment_byte_size = 0
 ; GCNHSA: private_segment_alignment = 4
 ; GCNHSA: .end_amd_kernel_code_t
 
-; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCNHSA: s_mov_b32 s10, -1
-; CIHSA: s_mov_b32 s11, 0x180f000
-; VIHSA: s_mov_b32 s11, 0x11800000
-; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
-; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
 
 ; Scratch size = alloca size + emergency stack slot
 ; ALL: ; ScratchSize: 32772
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index 208b9a10050c..141ee2560152 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -8,8 +8,8 @@
 ; CI: s_mov_b32 s11, 0x80f000
 ; VI: s_mov_b32 s11, 0x800000
 
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
 
 ; ALL: ; ScratchSize: 32772
 define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
@@ -29,8 +29,8 @@ define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
 ; CI: s_mov_b32 s11, 0x80f000
 ; VI: s_mov_b32 s11, 0x800000
 
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
 
 ; ALL: ; ScratchSize: 32772
 define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
new file mode 100644
index 000000000000..6dc9d050eee6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.read.workdim.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}read_workdim:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].Z
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+define void @read_workdim(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.AMDGPU.read.workdim() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}read_workdim_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOT: 0xff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @read_workdim_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %dim = call i32 @llvm.AMDGPU.read.workdim() #0
+  %shl = shl i32 %dim, 24
+  %shr = lshr i32 %shl, 24
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.AMDGPU.read.workdim() #0
+
+attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index 719f7ffe0f1c..dc95cd1ee012 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}test:
 ; GCN: enable_sgpr_dispatch_ptr = 1
-; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 define void @test(i32 addrspace(1)* %out) {
   %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
index c5aba2b76b89..cc109327d929 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test_debug_value:
-; CHECK: s_load_dwordx2
-; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR0_SGPR1
+; CHECK: s_load_dwordx2 s[4:5]
+; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5
 ; CHECK: buffer_store_dword
 ; CHECK: s_endpgm
 define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
new file mode 100644
index 000000000000..f2a7256e812d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -0,0 +1,184 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+
+; FUNC-LABEL: {{^}}local_size_x:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].Z
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1
+; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4
+
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_x(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.local.size.x() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[1].W
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_y(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.local.size.y() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; EG: MOV [[VAL]], KC0[2].X
+
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @local_size_z(i32 addrspace(1)* %out) {
+entry:
+  %0 = call i32 @llvm.r600.read.local.size.z() #0
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xy:
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xy(i32 addrspace(1)* %out) {
+entry:
+  %x = call i32 @llvm.r600.read.local.size.x() #0
+  %y = call i32 @llvm.r600.read.local.size.y() #0
+  %val = mul i32 %x, %y
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xz:
+
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xz(i32 addrspace(1)* %out) {
+entry:
+  %x = call i32 @llvm.r600.read.local.size.x() #0
+  %z = call i32 @llvm.r600.read.local.size.z() #0
+  %val = mul i32 %x, %z
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_yz:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 1
+
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_yz(i32 addrspace(1)* %out) {
+entry:
+  %y = call i32 @llvm.r600.read.local.size.y() #0
+  %z = call i32 @llvm.r600.read.local.size.z() #0
+  %val = mul i32 %y, %z
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xyz:
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 1
+
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xyz(i32 addrspace(1)* %out) {
+entry:
+  %x = call i32 @llvm.r600.read.local.size.x() #0
+  %y = call i32 @llvm.r600.read.local.size.y() #0
+  %z = call i32 @llvm.r600.read.local.size.z() #0
+  %xy = mul i32 %x, %y
+  %xyz = add i32 %xy, %z
+  store i32 %xyz, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_x_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %size = call i32 @llvm.r600.read.local.size.x() #0
+  %shl = shl i32 %size, 16
+  %shr = lshr i32 %shl, 16
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_y_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %size = call i32 @llvm.r600.read.local.size.y() #0
+  %shl = shl i32 %size, 16
+  %shr = lshr i32 %shl, 16
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_z_known_bits:
+; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; GCN-NOT: 0xffff
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NEXT: buffer_store_dword [[VVAL]]
+define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
+entry:
+  %size = call i32 @llvm.r600.read.local.size.z() #0
+  %shl = shl i32 %size, 16
+  %shr = lshr i32 %shl, 16
+  store i32 %shr, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.local.size.x() #0
+declare i32 @llvm.r600.read.local.size.y() #0
+declare i32 @llvm.r600.read.local.size.z() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll
index 7f31ef45b628..6b52b80ba082 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory-two-objects.ll
@@ -10,7 +10,7 @@
 ; EG: .long 166120
 ; EG-NEXT: .long 8
 ; GCN: .long 47180
-; GCN-NEXT: .long 38792
+; GCN-NEXT: .long 32900
 
 ; EG: {{^}}local_memory_two_objects:
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll
index 9494ed75bd0c..9ffb59e70920 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll
@@ -9,9 +9,9 @@
 ; EG: .long 166120
 ; EG-NEXT: .long 128
 ; SI: .long 47180
-; SI-NEXT: .long 71560
+; SI-NEXT: .long 65668
 ; CI: .long 47180
-; CI-NEXT: .long 38792
+; CI-NEXT: .long 32900
 
 ; FUNC-LABEL: {{^}}local_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index 2cbb67d85fba..cd7c78f408dd 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -17,16 +17,18 @@ declare i32 @llvm.r600.read.tgid.z() #1
 
 ; GCN-LABEL: {{^}}spill_vgpr_compute:
 
+; GCN: s_mov_b32 s16, s3
 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT: s_mov_b32 s14, -1
 ; SI-NEXT: s_mov_b32 s15, 0x80f000
 ; VI-NEXT: s_mov_b32 s15, 0x800000
 
-; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill
-; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
 
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index ef492cef5767..16abb89bb0b8 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -11,14 +11,14 @@
 
 ; GCN-LABEL: {{^}}main:
 
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_mov_b32 s11, 0x80f000
-; VI-NEXT: s_mov_b32 s11, 0x800000
+; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_mov_b32 s15, 0x80f000
+; VI-NEXT: s_mov_b32 s15, 0x800000
 
 ; s12 is offset user SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
 
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
diff --git a/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll
index ebe73429da9c..a704a23b0f92 100644
--- a/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/work-item-intrinsics.ll
@@ -1,5 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
@@ -7,9 +9,26 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].X
 
-; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; HSA: .amd_kernel_code_t
+
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_dispatch_ptr = 0
+; HSA: enable_sgpr_queue_ptr = 0
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: enable_sgpr_dispatch_id = 0
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: enable_sgpr_private_segment_size = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+
+; HSA: .end_amd_kernel_code_t
+
+
+; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
+
 define void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
@@ -21,10 +40,10 @@ entry:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].Y
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
@llvm.r600.read.ngroups.y() #0 @@ -36,10 +55,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].Z -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 @@ -51,10 +70,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[0].W -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 @@ -66,10 +85,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].X -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 @@ -81,10 +100,10 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV [[VAL]], KC0[1].Y -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] +; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5 +; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14 +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GCN-NOHSA: buffer_store_dword [[VVAL]] define void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 @@ -92,74 +111,33 @@ entry: ret void } -; FUNC-LABEL: {{^}}local_size_x: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].Z - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.x() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[1].W - -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y (i32 addrspace(1)* %out) { -entry: - %0 = call i32 @llvm.r600.read.local.size.y() #0 - store i32 %0, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z: -; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] -; EG: MOV [[VAL]], KC0[2].X - -; SI: 
s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_z (i32 addrspace(1)* %out) {
-entry:
-  %0 = call i32 @llvm.r600.read.local.size.z() #0
-  store i32 %0, i32 addrspace(1)* %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}get_work_dim:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
-; EG: MOV [[VAL]], KC0[2].Z
-
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define void @get_work_dim (i32 addrspace(1)* %out) {
-entry:
-  %0 = call i32 @llvm.AMDGPU.read.workdim() #0
-  store i32 %0, i32 addrspace(1)* %out
-  ret void
-}
-
-; The tgid values are stored in sgprs offset by the number of user sgprs.
-; Currently we always use exactly 2 user sgprs for the pointer to the
-; kernel arguments, but this may change in the future.
+; The tgid values are stored in sgprs offset by the number of user
+; sgprs.
 
 ; FUNC-LABEL: {{^}}tgid_x:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
+; HSA: .amd_kernel_code_t
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; HSA: .end_amd_kernel_code_t
+
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
+; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}}
 ; GCN: buffer_store_dword [[VVAL]]
-define void @tgid_x (i32 addrspace(1)* %out) {
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @tgid_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -167,9 +145,25 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tgid_y:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 1
+; HSA: compute_pgm_rsrc2_tgid_z_en = 0
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: enable_sgpr_grid_workgroup_count_x = 0
+; HSA: enable_sgpr_grid_workgroup_count_y = 0
+; HSA: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
+; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7
 ; GCN: buffer_store_dword [[VVAL]]
-define void @tgid_y (i32 addrspace(1)* %out) {
+
+; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
+; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
+; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
+; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
+define void @tgid_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -177,102 +171,83 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}tgid_z:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
+; HSA: compute_pgm_rsrc2_user_sgpr = 6
+; HSA: compute_pgm_rsrc2_tgid_x_en = 1
+; HSA: compute_pgm_rsrc2_tgid_y_en = 0
+; HSA: compute_pgm_rsrc2_tgid_z_en = 1
+; HSA: compute_pgm_rsrc2_tg_size_en = 0
+; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
+; HSA: 
enable_sgpr_private_segment_buffer = 1 +; HSA: enable_sgpr_dispatch_ptr = 0 +; HSA: enable_sgpr_queue_ptr = 0 +; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: enable_sgpr_dispatch_id = 0 +; HSA: enable_sgpr_flat_scratch_init = 0 +; HSA: enable_sgpr_private_segment_size = 0 +; HSA: enable_sgpr_grid_workgroup_count_x = 0 +; HSA: enable_sgpr_grid_workgroup_count_y = 0 +; HSA: enable_sgpr_grid_workgroup_count_z = 0 + +; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}} +; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}} ; GCN: buffer_store_dword [[VVAL]] -define void @tgid_z (i32 addrspace(1)* %out) { + +; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6 +; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 +; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 +; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 +; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 +define void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 132{{$}} + ; FUNC-LABEL: {{^}}tidig_x: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0 ; GCN: buffer_store_dword v0 -define void @tidig_x (i32 addrspace(1)* %out) { +define void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 2180{{$}} + ; FUNC-LABEL: {{^}}tidig_y: + +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1 ; GCN: buffer_store_dword v1 -define void @tidig_y (i32 addrspace(1)* %out) { +define void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out ret void } +; GCN-NOHSA: .section .AMDGPU.config +; GCN-NOHSA: .long 47180 +; GCN-NOHSA-NEXT: .long 4228{{$}} + ; FUNC-LABEL: {{^}}tidig_z: +; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2 ; GCN: buffer_store_dword v2 -define void @tidig_z (i32 addrspace(1)* %out) { +define void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out ret void } -; FUNC-LABEL: {{^}}local_size_x_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18 -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_x_known_bits(i32 addrspace(1)* %out) { -entry: - %size = call i32 @llvm.r600.read.local.size.x() #0 - %shl = shl i32 %size, 16 - %shr = lshr i32 %shl, 16 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_y_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_y_known_bits(i32 addrspace(1)* %out) { -entry: - %size = call i32 @llvm.r600.read.local.size.y() #0 - %shl = shl i32 %size, 16 - %shr = lshr i32 %shl, 16 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}local_size_z_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8 -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 -; GCN-NOT: 0xffff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_z_known_bits(i32 addrspace(1)* %out) { -entry: - %size = call i32 @llvm.r600.read.local.size.z() #0 - %shl = 
shl i32 %size, 16 - %shr = lshr i32 %shl, 16 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - -; FUNC-LABEL: {{^}}get_work_dim_known_bits: -; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb -; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c -; GCN-NOT: 0xff -; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] -; GCN: buffer_store_dword [[VVAL]] -define void @get_work_dim_known_bits(i32 addrspace(1)* %out) { -entry: - %dim = call i32 @llvm.AMDGPU.read.workdim() #0 - %shl = shl i32 %dim, 24 - %shr = lshr i32 %shl, 24 - store i32 %shr, i32 addrspace(1)* %out - ret void -} - declare i32 @llvm.r600.read.ngroups.x() #0 declare i32 @llvm.r600.read.ngroups.y() #0 declare i32 @llvm.r600.read.ngroups.z() #0 @@ -281,10 +256,6 @@ declare i32 @llvm.r600.read.global.size.x() #0 declare i32 @llvm.r600.read.global.size.y() #0 declare i32 @llvm.r600.read.global.size.z() #0 -declare i32 @llvm.r600.read.local.size.x() #0 -declare i32 @llvm.r600.read.local.size.y() #0 -declare i32 @llvm.r600.read.local.size.z() #0 - declare i32 @llvm.r600.read.tgid.x() #0 declare i32 @llvm.r600.read.tgid.y() #0 declare i32 @llvm.r600.read.tgid.z() #0
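
A note on the .AMDGPU.config values checked in the tidig_* tests above: each ".long 47180" / ".long <value>" pair is the COMPUTE_PGM_RSRC2 register offset (47180 = 0xb84c) followed by the register value, so the expected words 132, 2180 and 4228 differ only in the TIDIG_COMP_CNT field. The standalone sketch below decodes those three constants; the bit positions are restated from the published SI COMPUTE_PGM_RSRC2 layout as an assumption here, not something this patch defines.

// rsrc2_decode.cpp: decode the COMPUTE_PGM_RSRC2 words expected by the
// tidig_* tests above. Assumed field layout (public SI register spec):
// bit 0 SCRATCH_EN, bits 1-5 USER_SGPR, bit 7 TGID_X_EN, bits 8-10
// TGID_Y_EN / TGID_Z_EN / TG_SIZE_EN, bits 11-12 TIDIG_COMP_CNT.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main() {
  for (uint32_t V : {132u, 2180u, 4228u}) {
    unsigned UserSgpr = (V >> 1) & 31; // number of user SGPRs
    unsigned TgidX = (V >> 7) & 1;     // work-group ID X enabled
    unsigned CompCnt = (V >> 11) & 3;  // 0 = X, 1 = XY, 2 = XYZ
    printf("%4u: USER_SGPR=%u TGID_X_EN=%u TIDIG_COMP_CNT=%u\n", V, UserSgpr,
           TgidX, CompCnt);
  }
  return 0;
}

All three words decode to USER_SGPR = 2 and TGID_X_EN = 1, and only TIDIG_COMP_CNT steps through 0, 1, 2, matching the GCN-NOHSA checks. The HSA expectations follow the same arithmetic: the private segment buffer resource occupies four SGPRs (s[0:3]) and the kernarg segment pointer two (s[4:5]), which is why the tests expect compute_pgm_rsrc2_user_sgpr = 6 and the first work-group ID in s6.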