AMDGPU: Use the implicit kernargs for code object version 5

Summary:
Specifically, for trap handling, for targets that do not support
getDoorbellID, we load the queue_ptr from the implicit kernarg and move
the queue_ptr to s[0:1]. To get the aperture bases when targets do not
have aperture registers, we load private_base or shared_base directly
from the implicit kernarg. In clang, we use implicitarg_ptr plus an
offset to implement __builtin_amdgcn_workgroup_size_{x,y,z}.

Reviewers: arsenm, sameerds, yaxunl

Differential Revision: https://reviews.llvm.org/D120265
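As a usage illustration (a minimal sketch, not part of the commit): the HIP kernel below exercises one of the builtins this change reroutes. Built with -mcode-object-version=5, __builtin_amdgcn_workgroup_size_x now lowers to a 16-bit load at implicitarg_ptr + 12 rather than a load from the HSA kernel_dispatch_packet. The kernel name and parameter are hypothetical.

    // Hypothetical HIP kernel; only the builtin itself comes from this commit.
    #include "hip/hip_runtime.h"

    __global__ void store_workgroup_size_x(int *out) {
      // COV5: reads bytes 12..13 of the implicit kernarg segment.
      // Pre-COV5: reads bytes 4..5 of the kernel dispatch packet.
      *out = __builtin_amdgcn_workgroup_size_x();
    }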
@@ -16258,12 +16258,31 @@ Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
   return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
 }
 
+Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
+  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
+  auto *Call = CGF.Builder.CreateCall(F);
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+  return Call;
+}
+
 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
-  const unsigned XOffset = 4;
-  auto *DP = EmitAMDGPUDispatchPtr(CGF);
-  // Indexing the HSA kernel_dispatch_packet struct.
-  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 2);
+  bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion ==
+                 clang::TargetOptions::COV_5;
+  Constant *Offset;
+  Value *DP;
+  if (IsCOV_5) {
+    // Indexing the implicit kernarg segment.
+    Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
+    DP = EmitAMDGPUImplicitArgPtr(CGF);
+  } else {
+    // Indexing the HSA kernel_dispatch_packet struct.
+    Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
+    DP = EmitAMDGPUDispatchPtr(CGF);
+  }
+
   auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
   auto *DstTy =
       CGF.Int16Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
@@ -1,17 +1,31 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
 // RUN:   -fcuda-is-device -emit-llvm -o - -x hip %s \
-// RUN:   | FileCheck %s
+// RUN:   | FileCheck -check-prefix=PRECOV5 %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
+// RUN:   -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefix=COV5 %s
 
 #include "Inputs/cuda.h"
 
-// CHECK-LABEL: test_get_workgroup_size
-// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// PRECOV5-LABEL: test_get_workgroup_size
+// PRECOV5: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4
+// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6
+// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8
+// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+
+// COV5-LABEL: test_get_workgroup_size
+// COV5: call align 8 dereferenceable(256) i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
+// COV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 12
+// COV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// COV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 14
+// COV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
+// COV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 16
+// COV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
 __device__ void test_get_workgroup_size(int d, int *out)
 {
   switch (d) {
@@ -542,16 +542,14 @@ private:
   bool funcRetrievesHeapPtr(Attributor &A) {
     if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
       return false;
-    auto Pos = llvm::AMDGPU::getHeapPtrImplicitArgPosition();
-    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
+    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
   }
 
   bool funcRetrievesQueuePtr(Attributor &A) {
     if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
       return false;
-    auto Pos = llvm::AMDGPU::getQueuePtrImplicitArgPosition();
-    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
+    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
     return funcRetrievesImplicitKernelArg(A, OAS);
   }
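The two predicates above boil down to an interval-overlap test: does any load from implicitarg_ptr touch the 8-byte slot where the hidden heap or queue pointer lives? A minimal standalone sketch of that check (plain C++, not the Attributor API; the function name is invented):

    #include <cstdint>

    // True if a load of LoadSize bytes at LoadOffset from implicitarg_ptr
    // overlaps the implicit-argument slot [SlotOffset, SlotOffset + SlotSize).
    constexpr bool touchesImplicitSlot(uint64_t LoadOffset, uint64_t LoadSize,
                                       uint64_t SlotOffset,
                                       uint64_t SlotSize = 8) {
      return LoadOffset < SlotOffset + SlotSize &&
             SlotOffset < LoadOffset + LoadSize;
    }

    static_assert(touchesImplicitSlot(200, 8, 200), "reads queue_ptr");
    static_assert(!touchesImplicitSlot(96, 8, 200), "heap_ptr load is distinct");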
@@ -4382,10 +4382,14 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
                        ExplicitArgOffset;
   switch (Param) {
-  case GRID_DIM:
+  case FIRST_IMPLICIT:
     return ArgOffset;
-  case GRID_OFFSET:
-    return ArgOffset + 4;
+  case PRIVATE_BASE:
+    return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
+  case SHARED_BASE:
+    return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
+  case QUEUE_PTR:
+    return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
   }
   llvm_unreachable("unexpected implicit parameter type");
 }
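To make the arithmetic above concrete (a worked sketch with assumed numbers, not LLVM code): the implicit block begins at the explicit kernarg size rounded up to the implicit-arg alignment, and each code-object-v5 field sits at a fixed delta inside it.

    #include <cstdint>

    constexpr uint64_t alignTo(uint64_t X, uint64_t A) {
      return (X + A - 1) / A * A;
    }

    // Assuming 8-byte implicit-arg alignment: a kernel with 12 bytes of
    // explicit arguments finds queue_ptr at kernarg offset 16 + 200 = 216.
    constexpr uint64_t implicitParamOffset(uint64_t ExplicitKernArgSize,
                                           uint64_t Delta) {
      return alignTo(ExplicitKernArgSize, 8) + Delta;
    }

    static_assert(implicitParamOffset(12, 200) == 216, "queue_ptr example");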
@@ -320,8 +320,9 @@ public:
 
   enum ImplicitParameter {
     FIRST_IMPLICIT,
-    GRID_DIM = FIRST_IMPLICIT,
-    GRID_OFFSET,
+    PRIVATE_BASE,
+    SHARED_BASE,
+    QUEUE_PTR,
   };
 
   /// Helper function that returns the byte offset of the given
@@ -1810,6 +1810,39 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
   }
 
+  // TODO: can we be smarter about machine pointer info?
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+  Register LoadAddr = MRI.createGenericVirtualRegister(
+      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+  // For code object version 5, private_base and shared_base are passed through
+  // implicit kernargs.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    AMDGPUTargetLowering::ImplicitParameter Param =
+        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
+                                      : AMDGPUTargetLowering::PRIVATE_BASE;
+    uint64_t Offset =
+        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
+
+    Register KernargPtrReg = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+    if (!loadInputValue(KernargPtrReg, B,
+                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+      return Register();
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        PtrInfo,
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        LLT::scalar(32), commonAlignment(Align(64), Offset));
+
+    // Pointer address
+    B.buildPtrAdd(LoadAddr, KernargPtrReg,
+                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+    // Load address
+    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
+  }
+
   Register QueuePtr = MRI.createGenericVirtualRegister(
       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
@@ -1820,17 +1853,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
   // private_segment_aperture_base_hi.
   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
 
-  // TODO: can we be smarter about machine pointer info?
-  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       PtrInfo,
       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
           MachineMemOperand::MOInvariant,
       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
 
-  Register LoadAddr;
-
-  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
+  B.buildPtrAdd(LoadAddr, QueuePtr,
+                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
 }
@@ -4817,6 +4847,47 @@ bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
 
 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+  MachineFunction &MF = B.getMF();
+  const LLT S64 = LLT::scalar(64);
+
+  Register SGPR01(AMDGPU::SGPR0_SGPR1);
+  // For code object version 5, queue_ptr is passed through implicit kernarg.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    AMDGPUTargetLowering::ImplicitParameter Param =
+        AMDGPUTargetLowering::QUEUE_PTR;
+    uint64_t Offset =
+        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
+
+    Register KernargPtrReg = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+    if (!loadInputValue(KernargPtrReg, B,
+                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+      return false;
+
+    // TODO: can we be smarter about machine pointer info?
+    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        PtrInfo,
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        LLT::scalar(64), commonAlignment(Align(64), Offset));
+
+    // Pointer address
+    Register LoadAddr = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+    B.buildPtrAdd(LoadAddr, KernargPtrReg,
+                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+    // Load address
+    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
+    B.buildCopy(SGPR01, Temp);
+    B.buildInstr(AMDGPU::S_TRAP)
+        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
+        .addReg(SGPR01, RegState::Implicit);
+    MI.eraseFromParent();
+    return true;
+  }
+
   // Pass queue pointer to trap handler as input, and insert trap instruction
   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
   Register LiveIn =
@@ -4824,7 +4895,6 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
     return false;
 
-  Register SGPR01(AMDGPU::SGPR0_SGPR1);
   B.buildCopy(SGPR01, LiveIn);
   B.buildInstr(AMDGPU::S_TRAP)
       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
@@ -780,6 +780,17 @@ enum OpSel : uint64_t {
 
 } // namespace VOP3PEncoding
 
+namespace ImplicitArg {
+// Implicit kernel argument offset for code object version 5.
+enum Offset_COV5 : unsigned {
+  HOSTCALL_PTR_OFFSET = 80,
+  HEAP_PTR_OFFSET = 96,
+  PRIVATE_BASE_OFFSET = 192,
+  SHARED_BASE_OFFSET = 196,
+  QUEUE_PTR_OFFSET = 200,
+};
+
+} // namespace ImplicitArg
 } // namespace AMDGPU
 
 #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
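Combining these offsets with the 12/14/16 workgroup-size offsets used in clang, the code-object-v5 implicit kernarg block can be pictured as the partial struct below. The field and padding names are illustrative only; the gaps hold other hidden arguments this commit does not touch.

    #include <cstddef>
    #include <cstdint>

    struct ImplicitKernargsV5 {    // partial, illustrative layout
      uint8_t pad0[12];
      uint16_t workgroup_size_x;   // offset 12
      uint16_t workgroup_size_y;   // offset 14
      uint16_t workgroup_size_z;   // offset 16
      uint8_t pad1[80 - 18];
      void *hostcall_ptr;          // HOSTCALL_PTR_OFFSET = 80
      uint8_t pad2[96 - 88];
      void *heap_ptr;              // HEAP_PTR_OFFSET = 96
      uint8_t pad3[192 - 104];
      uint32_t private_base;       // PRIVATE_BASE_OFFSET = 192
      uint32_t shared_base;        // SHARED_BASE_OFFSET = 196
      void *queue_ptr;             // QUEUE_PTR_OFFSET = 200
    };
    static_assert(offsetof(ImplicitKernargsV5, workgroup_size_x) == 12 &&
                  offsetof(ImplicitKernargsV5, queue_ptr) == 200, "layout");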
@@ -5442,24 +5442,41 @@ SDValue SITargetLowering::lowerTrapEndpgm(
   return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
 }
 
+SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
+    const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  uint64_t Offset = getImplicitParameterOffset(MF, Param);
+  SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+  return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
+                     MachineMemOperand::MODereferenceable |
+                         MachineMemOperand::MOInvariant);
+}
+
 SDValue SITargetLowering::lowerTrapHsaQueuePtr(
     SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Chain = Op.getOperand(0);
 
-  MachineFunction &MF = DAG.getMachineFunction();
-  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  Register UserSGPR = Info->getQueuePtrUserSGPR();
-
   SDValue QueuePtr;
-  if (UserSGPR == AMDGPU::NoRegister) {
-    // We probably are in a function incorrectly marked with
-    // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap,
-    // so just use a null pointer.
-    QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+  // For code object version 5, QueuePtr is passed through implicit kernarg.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    QueuePtr =
+        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
   } else {
-    QueuePtr = CreateLiveInRegister(
-        DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+    MachineFunction &MF = DAG.getMachineFunction();
+    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+    Register UserSGPR = Info->getQueuePtrUserSGPR();
+
+    if (UserSGPR == AMDGPU::NoRegister) {
+      // We probably are in a function incorrectly marked with
+      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
+      // trap, so just use a null pointer.
+      QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+    } else {
+      QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
+                                      MVT::i64);
+    }
   }
 
   SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
@@ -5535,6 +5552,14 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
     return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
   }
 
+  // For code object version 5, private_base and shared_base are passed through
+  // implicit kernargs.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    ImplicitParameter Param =
+        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
+    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
+  }
+
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   Register UserSGPR = Info->getQueuePtrUserSGPR();
@@ -53,6 +53,9 @@ private:
                                uint64_t Offset, Align Alignment,
                                bool Signed,
                                const ISD::InputArg *Arg = nullptr) const;
+  SDValue loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, const SDLoc &DL,
+                                     Align Alignment,
+                                     ImplicitParameter Param) const;
 
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                               const SDLoc &SL, SDValue Chain,
@@ -149,27 +149,13 @@ unsigned getHostcallImplicitArgPosition() {
   case 4:
     return 24;
   case 5:
-    return 80;
+    return AMDGPU::ImplicitArg::HOSTCALL_PTR_OFFSET;
   default:
     llvm_unreachable("Unexpected code object version");
     return 0;
   }
 }
 
-unsigned getHeapPtrImplicitArgPosition() {
-  if (AmdhsaCodeObjectVersion == 5)
-    return 96;
-  llvm_unreachable("hidden_heap is supported only by code object version 5");
-  return 0;
-}
-
-unsigned getQueuePtrImplicitArgPosition() {
-  if (AmdhsaCodeObjectVersion == 5)
-    return 200;
-  llvm_unreachable("queue_ptr is supported only by code object version 5");
-  return 0;
-}
-
 #define GET_MIMGBaseOpcodesTable_IMPL
 #define GET_MIMGDimInfoTable_IMPL
 #define GET_MIMGInfoTable_IMPL
@@ -57,12 +57,6 @@ bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI);
 /// \returns The offset of the hostcall pointer argument from implicitarg_ptr
 unsigned getHostcallImplicitArgPosition();
 
-/// \returns The offset of the heap ptr argument from implicitarg_ptr
-unsigned getHeapPtrImplicitArgPosition();
-
-/// \returns The offset of the queue ptr argument from implicitarg_ptr
-unsigned getQueuePtrImplicitArgPosition();
-
 /// \returns Code object version.
 unsigned getAmdhsaCodeObjectVersion();
 
@@ -0,0 +1,546 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=GFX8V3 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=GFX8V4 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=GFX8V5 %s

; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefixes=GFX9V3 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefixes=GFX9V4 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=GFX9V5 %s

define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 addrspace(3)* %ptr.local) {
; GFX8V3-LABEL: addrspacecast:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V3-NEXT: s_load_dword s3, s[4:5], 0x44
; GFX8V3-NEXT: s_load_dword s5, s[4:5], 0x40
; GFX8V3-NEXT: v_mov_b32_e32 v2, 1
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V3-NEXT: s_mov_b32 s2, s0
; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX8V3-NEXT: s_mov_b32 s4, s1
; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V3-NEXT: v_mov_b32_e32 v0, s2
; GFX8V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX8V3-NEXT: v_mov_b32_e32 v1, s3
; GFX8V3-NEXT: flat_store_dword v[0:1], v2
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
; GFX8V3-NEXT: v_mov_b32_e32 v2, 2
; GFX8V3-NEXT: v_mov_b32_e32 v1, s1
; GFX8V3-NEXT: flat_store_dword v[0:1], v2
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: s_endpgm
;
; GFX8V4-LABEL: addrspacecast:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V4-NEXT: s_load_dword s3, s[4:5], 0x44
; GFX8V4-NEXT: s_load_dword s5, s[4:5], 0x40
; GFX8V4-NEXT: v_mov_b32_e32 v2, 1
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_mov_b32 s2, s0
; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX8V4-NEXT: s_mov_b32 s4, s1
; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V4-NEXT: v_mov_b32_e32 v0, s2
; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s3
; GFX8V4-NEXT: flat_store_dword v[0:1], v2
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v2, 2
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
; GFX8V4-NEXT: flat_store_dword v[0:1], v2
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
;
; GFX8V5-LABEL: addrspacecast:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8V5-NEXT: s_load_dword s3, s[4:5], 0xc8
; GFX8V5-NEXT: s_load_dword s5, s[4:5], 0xcc
; GFX8V5-NEXT: v_mov_b32_e32 v2, 1
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_mov_b32 s2, s0
; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX8V5-NEXT: s_mov_b32 s4, s1
; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V5-NEXT: v_mov_b32_e32 v0, s2
; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s3
; GFX8V5-NEXT: flat_store_dword v[0:1], v2
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v2, 2
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_store_dword v[0:1], v2
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
; GFX9V3-LABEL: addrspacecast:
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9V3-NEXT: s_lshl_b32 s3, s2, 16
; GFX9V3-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9V3-NEXT: v_mov_b32_e32 v2, 1
; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V3-NEXT: s_mov_b32 s2, s0
; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1
; GFX9V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX9V3-NEXT: s_lshl_b32 s5, s4, 16
; GFX9V3-NEXT: s_mov_b32 s4, s1
; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1
; GFX9V3-NEXT: v_mov_b32_e32 v0, s2
; GFX9V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX9V3-NEXT: v_mov_b32_e32 v1, s3
; GFX9V3-NEXT: flat_store_dword v[0:1], v2
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: v_mov_b32_e32 v0, s0
; GFX9V3-NEXT: v_mov_b32_e32 v2, 2
; GFX9V3-NEXT: v_mov_b32_e32 v1, s1
; GFX9V3-NEXT: flat_store_dword v[0:1], v2
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: s_endpgm
;
; GFX9V4-LABEL: addrspacecast:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9V4-NEXT: s_lshl_b32 s3, s2, 16
; GFX9V4-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9V4-NEXT: v_mov_b32_e32 v2, 1
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: s_mov_b32 s2, s0
; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1
; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX9V4-NEXT: s_lshl_b32 s5, s4, 16
; GFX9V4-NEXT: s_mov_b32 s4, s1
; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1
; GFX9V4-NEXT: v_mov_b32_e32 v0, s2
; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX9V4-NEXT: v_mov_b32_e32 v1, s3
; GFX9V4-NEXT: flat_store_dword v[0:1], v2
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
; GFX9V4-NEXT: v_mov_b32_e32 v2, 2
; GFX9V4-NEXT: v_mov_b32_e32 v1, s1
; GFX9V4-NEXT: flat_store_dword v[0:1], v2
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: s_endpgm
;
; GFX9V5-LABEL: addrspacecast:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9V5-NEXT: s_lshl_b32 s3, s2, 16
; GFX9V5-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9V5-NEXT: v_mov_b32_e32 v2, 1
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: s_mov_b32 s2, s0
; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1
; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
; GFX9V5-NEXT: s_lshl_b32 s5, s4, 16
; GFX9V5-NEXT: s_mov_b32 s4, s1
; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1
; GFX9V5-NEXT: v_mov_b32_e32 v0, s2
; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
; GFX9V5-NEXT: v_mov_b32_e32 v1, s3
; GFX9V5-NEXT: flat_store_dword v[0:1], v2
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
; GFX9V5-NEXT: v_mov_b32_e32 v2, 2
; GFX9V5-NEXT: v_mov_b32_e32 v1, s1
; GFX9V5-NEXT: flat_store_dword v[0:1], v2
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: s_endpgm
  %flat.private = addrspacecast i32 addrspace(5)* %ptr.private to i32*
  %flat.local = addrspacecast i32 addrspace(3)* %ptr.local to i32*
  store volatile i32 1, i32* %flat.private
  store volatile i32 2, i32* %flat.local
  ret void
}

define amdgpu_kernel void @llvm_amdgcn_is_shared(i8* %ptr) {
; GFX8V3-LABEL: llvm_amdgcn_is_shared:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0
; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
; GFX8V3-NEXT: flat_store_dword v[0:1], v0
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: s_endpgm
;
; GFX8V4-LABEL: llvm_amdgcn_is_shared:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
;
; GFX8V5-LABEL: llvm_amdgcn_is_shared:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
; GFX9V3-LABEL: llvm_amdgcn_is_shared:
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16
; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0
; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V3-NEXT: v_mov_b32_e32 v0, s0
; GFX9V3-NEXT: global_store_dword v[0:1], v0, off
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: s_endpgm
;
; GFX9V4-LABEL: llvm_amdgcn_is_shared:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16
; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
; GFX9V4-NEXT: global_store_dword v[0:1], v0, off
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: s_endpgm
;
; GFX9V5-LABEL: llvm_amdgcn_is_shared:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16
; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
; GFX9V5-NEXT: global_store_dword v[0:1], v0, off
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: s_endpgm
  %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
  %zext = zext i1 %is.shared to i32
  store volatile i32 %zext, i32 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @llvm_amdgcn_is_private(i8* %ptr) {
; GFX8V3-LABEL: llvm_amdgcn_is_private:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0
; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
; GFX8V3-NEXT: flat_store_dword v[0:1], v0
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: s_endpgm
;
; GFX8V4-LABEL: llvm_amdgcn_is_private:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
;
; GFX8V5-LABEL: llvm_amdgcn_is_private:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
; GFX9V3-LABEL: llvm_amdgcn_is_private:
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16
; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0
; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V3-NEXT: v_mov_b32_e32 v0, s0
; GFX9V3-NEXT: global_store_dword v[0:1], v0, off
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: s_endpgm
;
; GFX9V4-LABEL: llvm_amdgcn_is_private:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16
; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
; GFX9V4-NEXT: global_store_dword v[0:1], v0, off
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: s_endpgm
;
; GFX9V5-LABEL: llvm_amdgcn_is_private:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16
; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0
; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
; GFX9V5-NEXT: global_store_dword v[0:1], v0, off
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: s_endpgm
  %is.private = call i1 @llvm.amdgcn.is.private(i8* %ptr)
  %zext = zext i1 %is.private to i32
  store volatile i32 %zext, i32 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @llvm_trap() {
; GFX8V3-LABEL: llvm_trap:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8V3-NEXT: s_trap 2
;
; GFX8V4-LABEL: llvm_trap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8V4-NEXT: s_trap 2
;
; GFX8V5-LABEL: llvm_trap:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xc8
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_trap 2
;
; GFX9V3-LABEL: llvm_trap:
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX9V3-NEXT: s_trap 2
;
; GFX9V4-LABEL: llvm_trap:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_trap 2
;
; GFX9V5-LABEL: llvm_trap:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_trap 2
  call void @llvm.trap()
  unreachable
}

define amdgpu_kernel void @llvm_debugtrap() {
; GFX8V3-LABEL: llvm_debugtrap:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: s_trap 3
;
; GFX8V4-LABEL: llvm_debugtrap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_trap 3
;
; GFX8V5-LABEL: llvm_debugtrap:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_trap 3
;
; GFX9V3-LABEL: llvm_debugtrap:
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: s_trap 3
;
; GFX9V4-LABEL: llvm_debugtrap:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_trap 3
;
; GFX9V5-LABEL: llvm_debugtrap:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_trap 3
  call void @llvm.debugtrap()
  unreachable
}

define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
; GFX8V3-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: v_mov_b32_e32 v0, s6
; GFX8V3-NEXT: v_mov_b32_e32 v1, s7
; GFX8V3-NEXT: s_add_u32 s0, s8, 8
; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V3-NEXT: s_addc_u32 s1, s9, 0
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
; GFX8V3-NEXT: v_mov_b32_e32 v1, s1
; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: v_mov_b32_e32 v0, s4
; GFX8V3-NEXT: v_mov_b32_e32 v1, s5
; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: v_mov_b32_e32 v0, s10
; GFX8V3-NEXT: v_mov_b32_e32 v1, s11
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V3-NEXT: v_mov_b32_e32 v3, s1
; GFX8V3-NEXT: v_mov_b32_e32 v2, s0
; GFX8V3-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: s_endpgm
;
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_addc_u32 s1, s9, 0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s10
; GFX8V4-NEXT: v_mov_b32_e32 v1, s11
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v3, s1
; GFX8V4-NEXT: v_mov_b32_e32 v2, s0
; GFX8V4-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_add_u32 s0, s6, 8
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_addc_u32 s1, s7, 0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s8
; GFX8V5-NEXT: v_mov_b32_e32 v1, s9
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v3, s1
; GFX8V5-NEXT: v_mov_b32_e32 v2, s0
; GFX8V5-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
; GFX9V3-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: v_mov_b32_e32 v0, s10
; GFX9V3-NEXT: v_mov_b32_e32 v1, s11
; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7
; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: s_endpgm
;
; GFX9V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: v_mov_b32_e32 v0, s10
; GFX9V4-NEXT: v_mov_b32_e32 v1, s11
; GFX9V4-NEXT: ; kill: killed $sgpr6_sgpr7
; GFX9V4-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: s_endpgm
;
; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
; GFX9V5-NEXT: global_load_ubyte v0, v[0:1], off glc
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
; GFX9V5-NEXT: v_mov_b32_e32 v1, s9
; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: s_endpgm
  %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
  %queue.load = load volatile i8, i8 addrspace(4)* %queue.ptr
  %implicitarg.load = load volatile i8, i8 addrspace(4)* %implicitarg.ptr
  %dispatch.load = load volatile i8, i8 addrspace(4)* %dispatch.ptr
  store volatile i64 %dispatch.id, i64 addrspace(1)* %ptr
  ret void
}

declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
declare i64 @llvm.amdgcn.dispatch.id()
declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
declare i1 @llvm.amdgcn.is.shared(i8*)
declare i1 @llvm.amdgcn.is.private(i8*)
declare void @llvm.trap()
declare void @llvm.debugtrap()
@ -0,0 +1,550 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=GFX8V3 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=GFX8V4 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=GFX8V5 %s
|
||||
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefixes=GFX9V3 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefixes=GFX9V4 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=GFX9V5 %s
|
||||
|
||||
define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 addrspace(3)* %ptr.local) {
|
||||
; GFX8V3-LABEL: addrspacecast:
|
||||
; GFX8V3: ; %bb.0:
|
||||
; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
|
||||
; GFX8V3-NEXT: s_load_dword s2, s[4:5], 0x44
|
||||
; GFX8V3-NEXT: s_load_dword s3, s[4:5], 0x40
|
||||
; GFX8V3-NEXT: v_mov_b32_e32 v4, 1
|
||||
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1
|
||||
; GFX8V3-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX8V3-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
|
||||
; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1
|
||||
; GFX8V3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; GFX8V3-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX8V3-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
|
||||
; GFX8V3-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8V3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; GFX8V3-NEXT: flat_store_dword v[0:1], v4
|
||||
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V3-NEXT: v_mov_b32_e32 v0, 2
|
||||
; GFX8V3-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V3-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8V4-LABEL: addrspacecast:
|
||||
; GFX8V4: ; %bb.0:
|
||||
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
|
||||
; GFX8V4-NEXT: s_load_dword s2, s[4:5], 0x44
|
||||
; GFX8V4-NEXT: s_load_dword s3, s[4:5], 0x40
|
||||
; GFX8V4-NEXT: v_mov_b32_e32 v4, 1
|
||||
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1
|
||||
; GFX8V4-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX8V4-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
|
||||
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1
|
||||
; GFX8V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; GFX8V4-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX8V4-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
|
||||
; GFX8V4-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8V4-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; GFX8V4-NEXT: flat_store_dword v[0:1], v4
|
||||
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V4-NEXT: v_mov_b32_e32 v0, 2
|
||||
; GFX8V4-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V4-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8V5-LABEL: addrspacecast:
|
||||
; GFX8V5: ; %bb.0:
|
||||
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX8V5-NEXT: s_load_dword s2, s[4:5], 0xc8
|
||||
; GFX8V5-NEXT: s_load_dword s3, s[4:5], 0xcc
|
||||
; GFX8V5-NEXT: v_mov_b32_e32 v4, 1
|
||||
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1
|
||||
; GFX8V5-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX8V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
|
||||
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1
|
||||
; GFX8V5-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; GFX8V5-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX8V5-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
|
||||
; GFX8V5-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8V5-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; GFX8V5-NEXT: flat_store_dword v[0:1], v4
|
||||
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V5-NEXT: v_mov_b32_e32 v0, 2
|
||||
; GFX8V5-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V5-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9V3-LABEL: addrspacecast:
|
||||
; GFX9V3: ; %bb.0:
|
||||
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
|
||||
; GFX9V3-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX9V3-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9V3-NEXT: v_mov_b32_e32 v4, 1
|
||||
; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1
|
||||
; GFX9V3-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX9V3-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
|
||||
; GFX9V3-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
|
||||
; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1
|
||||
; GFX9V3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; GFX9V3-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9V3-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX9V3-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
|
||||
; GFX9V3-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9V3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; GFX9V3-NEXT: flat_store_dword v[0:1], v4
|
||||
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V3-NEXT: v_mov_b32_e32 v0, 2
|
||||
; GFX9V3-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V3-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9V4-LABEL: addrspacecast:
|
||||
; GFX9V4: ; %bb.0:
|
||||
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
|
||||
; GFX9V4-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX9V4-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9V4-NEXT: v_mov_b32_e32 v4, 1
|
||||
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1
|
||||
; GFX9V4-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX9V4-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
|
||||
; GFX9V4-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
|
||||
; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1
|
||||
; GFX9V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; GFX9V4-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9V4-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX9V4-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
|
||||
; GFX9V4-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9V4-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; GFX9V4-NEXT: flat_store_dword v[0:1], v4
|
||||
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V4-NEXT: v_mov_b32_e32 v0, 2
|
||||
; GFX9V4-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V4-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9V5-LABEL: addrspacecast:
|
||||
; GFX9V5: ; %bb.0:
|
||||
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
|
||||
; GFX9V5-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX9V5-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9V5-NEXT: v_mov_b32_e32 v4, 1
|
||||
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1
|
||||
; GFX9V5-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX9V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
|
||||
; GFX9V5-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
|
||||
; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1
|
||||
; GFX9V5-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
|
||||
; GFX9V5-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9V5-NEXT: s_cselect_b64 vcc, -1, 0
|
||||
; GFX9V5-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
|
||||
; GFX9V5-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9V5-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
|
||||
; GFX9V5-NEXT: flat_store_dword v[0:1], v4
|
||||
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V5-NEXT: v_mov_b32_e32 v0, 2
|
||||
; GFX9V5-NEXT: flat_store_dword v[2:3], v0
|
||||
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V5-NEXT: s_endpgm
|
||||
%flat.private = addrspacecast i32 addrspace(5)* %ptr.private to i32*
|
||||
%flat.local = addrspacecast i32 addrspace(3)* %ptr.local to i32*
|
||||
store volatile i32 1, i32* %flat.private
|
||||
store volatile i32 2, i32* %flat.local
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @llvm_amdgcn_is_shared(i8* %ptr) {
|
||||
; GFX8V3-LABEL: llvm_amdgcn_is_shared:
|
||||
; GFX8V3: ; %bb.0:
|
||||
; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40
|
||||
; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4
|
||||
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0
|
||||
; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX8V3-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V3-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8V4-LABEL: llvm_amdgcn_is_shared:
|
||||
; GFX8V4: ; %bb.0:
|
||||
; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40
|
||||
; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4
|
||||
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
|
||||
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V4-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8V5-LABEL: llvm_amdgcn_is_shared:
|
||||
; GFX8V5: ; %bb.0:
|
||||
; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xcc
|
||||
; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4
|
||||
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
|
||||
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V5-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9V3-LABEL: llvm_amdgcn_is_shared:
|
||||
; GFX9V3: ; %bb.0:
|
||||
; GFX9V3-NEXT: s_load_dword s0, s[4:5], 0x4
|
||||
; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
|
||||
; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX9V3-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V3-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9V4-LABEL: llvm_amdgcn_is_shared:
|
||||
; GFX9V4: ; %bb.0:
|
||||
; GFX9V4-NEXT: s_load_dword s0, s[4:5], 0x4
|
||||
; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
|
||||
; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX9V4-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V4-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9V5-LABEL: llvm_amdgcn_is_shared:
|
||||
; GFX9V5: ; %bb.0:
|
||||
; GFX9V5-NEXT: s_load_dword s0, s[4:5], 0x4
|
||||
; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
|
||||
; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX9V5-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V5-NEXT: s_endpgm
|
||||
%is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
|
||||
%zext = zext i1 %is.shared to i32
|
||||
store volatile i32 %zext, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @llvm_amdgcn_is_private(i8* %ptr) {
|
||||
; GFX8V3-LABEL: llvm_amdgcn_is_private:
|
||||
; GFX8V3: ; %bb.0:
|
||||
; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44
|
||||
; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4
|
||||
; GFX8V3-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0
|
||||
; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX8V3-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V3-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8V4-LABEL: llvm_amdgcn_is_private:
|
||||
; GFX8V4: ; %bb.0:
|
||||
; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44
|
||||
; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4
|
||||
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
|
||||
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX8V4-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V4-NEXT: s_endpgm
|
||||
;
|
||||
; GFX8V5-LABEL: llvm_amdgcn_is_private:
|
||||
; GFX8V5: ; %bb.0:
|
||||
; GFX8V5-NEXT: s_load_dword s0, s[4:5], 0xc8
|
||||
; GFX8V5-NEXT: s_load_dword s1, s[4:5], 0x4
|
||||
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
|
||||
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX8V5-NEXT: flat_store_dword v[0:1], v0
|
||||
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8V5-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9V3-LABEL: llvm_amdgcn_is_private:
|
||||
; GFX9V3: ; %bb.0:
|
||||
; GFX9V3-NEXT: s_load_dword s0, s[4:5], 0x4
|
||||
; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
|
||||
; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX9V3-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V3-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9V4-LABEL: llvm_amdgcn_is_private:
|
||||
; GFX9V4: ; %bb.0:
|
||||
; GFX9V4-NEXT: s_load_dword s0, s[4:5], 0x4
|
||||
; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
|
||||
; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX9V4-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V4-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9V5-LABEL: llvm_amdgcn_is_private:
|
||||
; GFX9V5: ; %bb.0:
|
||||
; GFX9V5-NEXT: s_load_dword s0, s[4:5], 0x4
|
||||
; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
|
||||
; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX9V5-NEXT: global_store_dword v[0:1], v0, off
|
||||
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9V5-NEXT: s_endpgm
|
||||
%is.private = call i1 @llvm.amdgcn.is.private(i8* %ptr)
|
||||
%zext = zext i1 %is.private to i32
|
||||
store volatile i32 %zext, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
define amdgpu_kernel void @llvm_trap() {
; GFX8V3-LABEL: llvm_trap:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8V3-NEXT: s_trap 2
;
; GFX8V4-LABEL: llvm_trap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX8V4-NEXT: s_trap 2
;
; GFX8V5-LABEL: llvm_trap:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xc8
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_trap 2
;
; GFX9V3-LABEL: llvm_trap:
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX9V3-NEXT: s_trap 2
;
; GFX9V4-LABEL: llvm_trap:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_trap 2
;
; GFX9V5-LABEL: llvm_trap:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_trap 2
  call void @llvm.trap()
  unreachable
}
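
; Editorial note on llvm_trap above (a reading of the checks): the trap
; sequence places the queue pointer in s[0:1]. COV3/COV4 copy it from the
; preloaded SGPRs (s_mov_b64 s[0:1], s[4:5]); GFX8V5 has no preloaded queue
; pointer and reloads it with s_load_dwordx2 from the hidden kernargs at
; 0xc8 before s_trap 2, while GFX9V4/V5 emit s_trap 2 with no queue-pointer
; setup at all.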
define amdgpu_kernel void @llvm_debugtrap() {
; GFX8V3-LABEL: llvm_debugtrap:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: s_trap 3
;
; GFX8V4-LABEL: llvm_debugtrap:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_trap 3
;
; GFX8V5-LABEL: llvm_debugtrap:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_trap 3
;
; GFX9V3-LABEL: llvm_debugtrap:
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: s_trap 3
;
; GFX9V4-LABEL: llvm_debugtrap:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_trap 3
;
; GFX9V5-LABEL: llvm_debugtrap:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_trap 3
  call void @llvm.debugtrap()
  unreachable
}
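
; Editorial note on llvm_debugtrap above: it lowers to a bare s_trap 3 on
; every target and code object version; no queue pointer is passed to the
; handler, so the checks are identical across COV3/COV4/COV5.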
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) {
; GFX8V3-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V3: ; %bb.0:
; GFX8V3-NEXT: v_mov_b32_e32 v0, s6
; GFX8V3-NEXT: v_mov_b32_e32 v1, s7
; GFX8V3-NEXT: s_add_u32 s0, s8, 8
; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V3-NEXT: s_addc_u32 s1, s9, 0
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
; GFX8V3-NEXT: v_mov_b32_e32 v1, s1
; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: v_mov_b32_e32 v0, s4
; GFX8V3-NEXT: v_mov_b32_e32 v1, s5
; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V3-NEXT: v_mov_b32_e32 v2, s10
; GFX8V3-NEXT: v_mov_b32_e32 v3, s11
; GFX8V3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8V3-NEXT: v_mov_b32_e32 v0, s0
; GFX8V3-NEXT: v_mov_b32_e32 v1, s1
; GFX8V3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8V3-NEXT: s_waitcnt vmcnt(0)
; GFX8V3-NEXT: s_endpgm
;
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_addc_u32 s1, s9, 0
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: v_mov_b32_e32 v2, s10
; GFX8V4-NEXT: v_mov_b32_e32 v3, s11
; GFX8V4-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
; GFX8V4-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
; GFX8V4-NEXT: s_endpgm
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_add_u32 s0, s6, 8
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_addc_u32 s1, s7, 0
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8V5-NEXT: v_mov_b32_e32 v2, s8
; GFX8V5-NEXT: v_mov_b32_e32 v3, s9
; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
; GFX8V5-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
; GFX8V5-NEXT: s_endpgm
;
; GFX9V3-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V3: ; %bb.0:
; GFX9V3-NEXT: v_mov_b32_e32 v2, 0
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: v_mov_b32_e32 v0, s10
; GFX9V3-NEXT: v_mov_b32_e32 v1, s11
; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7
; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9V3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9V3-NEXT: s_waitcnt vmcnt(0)
; GFX9V3-NEXT: s_endpgm
;
; GFX9V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: v_mov_b32_e32 v2, 0
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: v_mov_b32_e32 v0, s10
; GFX9V4-NEXT: v_mov_b32_e32 v1, s11
; GFX9V4-NEXT: ; kill: killed $sgpr6_sgpr7
; GFX9V4-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
; GFX9V4-NEXT: s_endpgm
;
; GFX9V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: v_mov_b32_e32 v2, 0
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[0:1] glc
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] offset:8 glc
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc
; GFX9V5-NEXT: ; kill: killed $sgpr0_sgpr1
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: v_mov_b32_e32 v0, s8
; GFX9V5-NEXT: v_mov_b32_e32 v1, s9
; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
; GFX9V5-NEXT: s_endpgm
  %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %dispatch.id = call i64 @llvm.amdgcn.dispatch.id()
  %queue.load = load volatile i8, i8 addrspace(4)* %queue.ptr
  %implicitarg.load = load volatile i8, i8 addrspace(4)* %implicitarg.ptr
  %dispatch.load = load volatile i8, i8 addrspace(4)* %dispatch.ptr
  store volatile i64 %dispatch.id, i64 addrspace(1)* %ptr
  ret void
}
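
; Editorial note on llvm_amdgcn_queue_ptr above (a reading of the checks):
; with COV3/COV4 the preloaded inputs sit in s[4:5] (dispatch_ptr), s[6:7]
; (queue_ptr), s[8:9] (kernarg) and s[10:11] (dispatch_id). With COV5 the
; queue_ptr pair is dropped, the remaining inputs shift down one SGPR pair,
; and the llvm.amdgcn.queue.ptr load itself is left reading through an
; undefined register pair in these checks.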
declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
declare i64 @llvm.amdgcn.dispatch.id()
declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
declare i1 @llvm.amdgcn.is.shared(i8*)
declare i1 @llvm.amdgcn.is.private(i8*)
declare void @llvm.trap()
declare void @llvm.debugtrap()