commit 99c14524ec (parent 48ab526f12), forked from OSchip/llvm-project
@@ -24,6 +24,8 @@ namespace {

class AMDGPUAnnotateKernelFeatures : public ModulePass {
private:
  static bool hasAddrSpaceCast(const Function &F);

  void addAttrToCallers(Function *Intrin, StringRef AttrName);
  bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
@@ -48,12 +50,29 @@ char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                      "Add AMDGPU function attributes", false, false)
INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                    "Add AMDGPU function attributes", false, false)

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  unsigned SrcAS = ASC->getSrcAddressSpace();

  // The queue ptr is only needed when casting to flat, not from it.
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}
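
// For example (cf. the addrspacecast.ll test below), a group-to-flat cast
//   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
// requires the queue ptr so the aperture can be read, while the reverse
// flat-to-group cast does not.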

// Return true if an addrspacecast is used that requires the queue ptr.
bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
  for (const BasicBlock &BB : F) {
    for (const Instruction &I : BB) {
      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC))
          return true;
      }
    }
  }

  return false;
}

void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
                                                    StringRef AttrName) {
@@ -117,9 +136,18 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
  // always initialized.

  bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
  if (TT.getOS() == Triple::AMDHSA)
  if (TT.getOS() == Triple::AMDHSA) {
    Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);

    for (Function &F : M) {
      if (F.hasFnAttribute("amdgpu-queue-ptr"))
        continue;

      if (hasAddrSpaceCast(F))
        F.addFnAttr("amdgpu-queue-ptr");
    }
  }

  return Changed;
}
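
// Functions tagged "amdgpu-queue-ptr" are expected to receive the queue ptr
// in a user SGPR (see SIMachineFunctionInfo::getQueuePtrUserSGPR below); the
// tests check this as enable_sgpr_queue_ptr = 1.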
@@ -147,7 +147,6 @@ private:
  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
  bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
  SDNode *SelectAddrSpaceCast(SDNode *N);
  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -526,8 +525,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
    Lowering.legalizeTargetIndependentNode(N, *CurDAG);
    break;
  }
  case ISD::ADDRSPACECAST:
    return SelectAddrSpaceCast(N);
  case ISD::AND:
  case ISD::SRL:
  case ISD::SRA:
@@ -1332,69 +1329,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
         !isa<ConstantSDNode>(Offset);
}

// FIXME: This is incorrect and only enough to be able to compile.
SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
  SDLoc DL(N);

  const MachineFunction &MF = CurDAG->getMachineFunction();
  DiagnosticInfoUnsupported NotImplemented(
      *MF.getFunction(), "addrspacecast not implemented", DL.getDebugLoc());
  CurDAG->getContext()->diagnose(NotImplemented);

  assert(Subtarget->hasFlatAddressSpace() &&
         "addrspacecast only supported with flat address space!");

  assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
          ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
         "Can only cast to / from flat address space!");

  // The flat instructions read the address as the index of the VGPR holding the
  // address, so casting should just be reinterpreting the base VGPR, so just
  // insert trunc / bitcast / zext.

  SDValue Src = ASC->getOperand(0);
  EVT DestVT = ASC->getValueType(0);
  EVT SrcVT = Src.getValueType();

  unsigned SrcSize = SrcVT.getSizeInBits();
  unsigned DestSize = DestVT.getSizeInBits();

  if (SrcSize > DestSize) {
    assert(SrcSize == 64 && DestSize == 32);
    return CurDAG->getMachineNode(
      TargetOpcode::EXTRACT_SUBREG,
      DL,
      DestVT,
      Src,
      CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
  }

  if (DestSize > SrcSize) {
    assert(SrcSize == 32 && DestSize == 64);

    // FIXME: This is probably wrong, we should never be defining
    // a register class with both VGPRs and SGPRs
    SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL,
                                           MVT::i32);

    const SDValue Ops[] = {
      RC,
      Src,
      CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                                     CurDAG->getConstant(0, DL, MVT::i32)), 0),
      CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
    };

    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
                                  DL, N->getValueType(0), Ops);
  }

  assert(SrcSize == 64 && DestSize == 64);
  return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
}
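
// Per the hunk header above (-1332,69 +1329,6), this Select-time
// implementation is being removed: the "not implemented" diagnostic and the
// VS_64 REG_SEQUENCE workaround are superseded by the custom lowering in
// SITargetLowering::lowerADDRSPACECAST added below.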

SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
                                     uint32_t Offset, uint32_t Width) {
  // Transformation function, pack the offset and width of a BFE into
@@ -278,6 +278,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }
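
  // Marking ADDRSPACECAST as Custom routes both pointer widths to
  // lowerADDRSPACECAST (see LowerOperation below): 32-bit for the segment
  // side of a cast and 64-bit for the flat side.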

  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
@@ -1232,6 +1237,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
  }
  return SDValue();
}
@@ -1390,6 +1396,84 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
  return Chain;
}

SDValue SITargetLowering::getSegmentAperture(unsigned AS,
                                             SelectionDAG &DAG) const {
  SDLoc SL;
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, Info->getQueuePtrUserSGPR(), MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
                            DAG.getConstant(StructOffset, SL, MVT::i64));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
                                              AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr,
                     PtrInfo, false,
                     false, true,
                     MinAlign(64, StructOffset));
}
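
// Worked example: for a group-to-flat cast the aperture comes from
// *(const i32 *)(queue_ptr + 0x40). The HSA tests below check this as
//   s_load_dword s[[APERTURE]], s[4:5], 0x10
// where the SMRD offset is presumably dword-scaled, so 0x10 dwords is byte
// offset 0x40 (and 0x11 is 0x44 for private).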

SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);
  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);

  SDValue Src = ASC->getOperand(0);

  // FIXME: Really support non-0 null pointers.
  SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

  // flat -> local/private
  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
    if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
        ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
                         NonNull, Ptr, SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
    if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
        ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue NonNull
        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
      SDValue CvtPtr
        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);

      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
                         FlatNullPtr);
    }
  }
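
  // The v2i32 -> i64 bitcast on this little-endian target puts BUILD_VECTOR
  // element 0 in the low half, so the flat pointer is effectively
  // (aperture_hi << 32) | segment_offset.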

  // global <-> flat are no-ops and never emitted.

  const MachineFunction &MF = DAG.getMachineFunction();
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
  DAG.getContext()->diagnose(InvalidAddrSpaceCast);

  return DAG.getUNDEF(ASC->getValueType(0));
}

SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
@@ -45,6 +45,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
  SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;

  SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
  SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;

  void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;

  SDValue performUCharToFloatCombine(SDNode *N,
@@ -270,6 +270,10 @@ public:
    ScratchWaveOffsetReg = Reg;
  }

  unsigned getQueuePtrUserSGPR() const {
    return QueuePtrUserSGPR;
  }

  bool hasSpilledSGPRs() const {
    return HasSpilledSGPRs;
  }
@@ -1,18 +1,208 @@
; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s

; ERROR: addrspacecast not implemented
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1

; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}

; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]

; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %stof
  ret void
}
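
; The checks above spell out the lowering: compare the 32-bit group pointer
; against the segment null (-1), assemble the 64-bit flat pointer from
; {pointer, aperture}, and select the flat null (0) when the input is null.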

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1

; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}

; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]

; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7

; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
  %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
  store volatile i32 7, i32 addrspace(4)* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
  %ld = load volatile i32, i32 addrspace(4)* %stof
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
  store volatile i32 0, i32* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
  load volatile i32, i32 addrspace(2)* %ftos
  ret void
}

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
  store i32 7, i32 addrspace(4)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_0_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
  store i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
  store i32 7, i32 addrspace(4)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_neg1_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
  store i32 7, i32 addrspace(3)* %cast
  ret void
}
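
; The constant cases fold at compile time and make the null convention
; visible: segment null is -1 and flat null is 0, so group null (0) maps to
; {0, aperture}, group -1 maps to the all-zero flat pointer, flat null maps
; to group -1, and flat -1 simply truncates to -1.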

; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(4)*
  store i32 7, i32 addrspace(4)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
define void @cast_0_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
  store i32 7, i32* %cast
  ret void
}

; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.

; CHECK-LABEL: {{^}}branch_use_flat_i32:
; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
; CHECK: s_endpgm
; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  %cmp = icmp ne i32 %c, 0
@@ -34,20 +224,17 @@ end:
  ret void
}

; TODO: This should not be zero when registers are used for small
; scratch allocations again.

; Check for prologue initializing special SGPRs pointing to scratch.
; CHECK-LABEL: {{^}}store_flat_scratch:
; CHECK: s_movk_i32 flat_scratch_lo, 0
; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
; CHECK: flat_store_dword
; CHECK: s_barrier
; CHECK: flat_load_dword
; HSA-LABEL: {{^}}store_flat_scratch:
; HSA: s_mov_b32 flat_scratch_lo, s9
; HSA: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
; HSA: flat_store_dword
; HSA: s_barrier
; HSA: flat_load_dword
define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4
  %x = call i32 @llvm.amdgcn.workitem.id.x() #3
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
  %pptr = getelementptr i32, i32* %alloca, i32 %x
  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
  store i32 %x, i32 addrspace(4)* %fptr
@@ -59,8 +246,8 @@ define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
}

declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #3
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #3 = { nounwind readnone }
attributes #2 = { nounwind readnone }
@@ -164,6 +164,63 @@ define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
  ret void
}

; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
  store volatile i32 0, i32 addrspace(4)* %stof
  ret void
}

; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
  %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
  store volatile i32 0, i32 addrspace(4)* %stof
  ret void
}

; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}

; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
  store volatile i32 0, i32* %ftos
  ret void
}
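
; Casts into the flat address space pick up a new attribute set (#11, which
; presumably carries "amdgpu-queue-ptr"), while casts out of flat keep the
; original #1 set, matching castRequiresQueuePtr above.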

; No-op addrspacecast should not use queue ptr
; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
  store volatile i32 0, i32 addrspace(4)* %stof
  ret void
}

; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
  %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
  %ld = load volatile i32, i32 addrspace(4)* %stof
  ret void
}

; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
  %ld = load volatile i32, i32 addrspace(2)* %ftos
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
|
@ -0,0 +1,8 @@
|
|||
; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
|
||||
|
||||
; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
|
||||
define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
|
||||
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
|
||||
store volatile i32 0, i32 addrspace(1)* %stof
|
||||
ret void
|
||||
}
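
; Only casts to or from the flat address space are lowered; a direct
; group-to-global cast has no aperture-based lowering, so it is diagnosed
; as an invalid addrspacecast.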