AMDGPU: Implement addrspacecast

llvm-svn: 267452
Matt Arsenault 2016-04-25 19:27:24 +00:00
parent 48ab526f12
commit 99c14524ec
8 changed files with 398 additions and 93 deletions


@@ -24,6 +24,8 @@ namespace {
class AMDGPUAnnotateKernelFeatures : public ModulePass {
private:
static bool hasAddrSpaceCast(const Function &F);
void addAttrToCallers(Function *Intrin, StringRef AttrName);
bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
@@ -48,12 +50,29 @@ char AMDGPUAnnotateKernelFeatures::ID = 0;
char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
"Add AMDGPU function attributes", false, false)
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
"Add AMDGPU function attributes", false, false)
INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
"Add AMDGPU function attributes", false, false)
static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
unsigned SrcAS = ASC->getSrcAddressSpace();
// The queue ptr is only needed when casting to flat, not from it.
return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}
// Return true if an addrspacecast is used that requires the queue ptr.
bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
for (const BasicBlock &BB : F) {
for (const Instruction &I : BB) {
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
if (castRequiresQueuePtr(ASC))
return true;
}
}
}
return false;
}
void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
StringRef AttrName) {
@@ -117,9 +136,18 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
// always initialized.
bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
if (TT.getOS() == Triple::AMDHSA)
if (TT.getOS() == Triple::AMDHSA) {
Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
for (Function &F : M) {
if (F.hasFnAttribute("amdgpu-queue-ptr"))
continue;
if (hasAddrSpaceCast(F))
F.addFnAttr("amdgpu-queue-ptr");
}
}
return Changed;
}


@@ -147,7 +147,6 @@ private:
bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
SDNode *SelectAddrSpaceCast(SDNode *N);
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -526,8 +525,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
Lowering.legalizeTargetIndependentNode(N, *CurDAG);
break;
}
case ISD::ADDRSPACECAST:
return SelectAddrSpaceCast(N);
case ISD::AND:
case ISD::SRL:
case ISD::SRA:
@@ -1332,69 +1329,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
!isa<ConstantSDNode>(Offset);
}
// FIXME: This is incorrect and only enough to be able to compile.
SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N);
SDLoc DL(N);
const MachineFunction &MF = CurDAG->getMachineFunction();
DiagnosticInfoUnsupported NotImplemented(
*MF.getFunction(), "addrspacecast not implemented", DL.getDebugLoc());
CurDAG->getContext()->diagnose(NotImplemented);
assert(Subtarget->hasFlatAddressSpace() &&
"addrspacecast only supported with flat address space!");
assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) &&
"Can only cast to / from flat address space!");
// The flat instructions read the address as the index of the VGPR holding the
// address, so casting should just be reinterpreting the base VGPR, so just
// insert trunc / bitcast / zext.
SDValue Src = ASC->getOperand(0);
EVT DestVT = ASC->getValueType(0);
EVT SrcVT = Src.getValueType();
unsigned SrcSize = SrcVT.getSizeInBits();
unsigned DestSize = DestVT.getSizeInBits();
if (SrcSize > DestSize) {
assert(SrcSize == 64 && DestSize == 32);
return CurDAG->getMachineNode(
TargetOpcode::EXTRACT_SUBREG,
DL,
DestVT,
Src,
CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32));
}
if (DestSize > SrcSize) {
assert(SrcSize == 32 && DestSize == 64);
// FIXME: This is probably wrong, we should never be defining
// a register class with both VGPRs and SGPRs
SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL,
MVT::i32);
const SDValue Ops[] = {
RC,
Src,
CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getConstant(0, DL, MVT::i32)), 0),
CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
};
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
DL, N->getValueType(0), Ops);
}
assert(SrcSize == 64 && DestSize == 64);
return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode();
}
SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val,
uint32_t Offset, uint32_t Width) {
// Transformation function, pack the offset and width of a BFE into


@@ -278,6 +278,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
if (Subtarget->hasFlatAddressSpace()) {
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
}
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
@@ -1232,6 +1237,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
}
return SDValue();
}
@@ -1390,6 +1396,84 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
SDValue SITargetLowering::getSegmentAperture(unsigned AS,
SelectionDAG &DAG) const {
SDLoc SL;
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
SDValue QueuePtr = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, Info->getQueuePtrUserSGPR(), MVT::i64);
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
DAG.getConstant(StructOffset, SL, MVT::i64));
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
// be available here, and it is not clear how to retrieve it.
Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
AMDGPUAS::CONSTANT_ADDRESS));
MachinePointerInfo PtrInfo(V, StructOffset);
return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr,
PtrInfo, false,
false, true,
MinAlign(64, StructOffset));
}
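
Stripped of the SelectionDAG plumbing, the aperture query above amounts to a 32-bit load at a fixed offset from the queue pointer. A minimal C++ sketch of that computation, assuming only the two amd_queue_t offsets named in the comment (0x40 for the group aperture, 0x44 for the private aperture); the function and variable names are illustrative, not part of the patch:

#include <cstdint>
#include <cstring>

// Illustrative helper: read group_segment_aperture_base_hi (+0x40) or
// private_segment_aperture_base_hi (+0x44) out of the amd_queue_t blob.
static uint32_t readSegmentApertureHi(const unsigned char *QueuePtr,
                                      bool IsLocalSegment) {
  const uint32_t StructOffset = IsLocalSegment ? 0x40 : 0x44;
  uint32_t ApertureHi;
  std::memcpy(&ApertureHi, QueuePtr + StructOffset, sizeof(ApertureHi));
  return ApertureHi; // becomes the high half of the 64-bit flat pointer
}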
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
SDValue Src = ASC->getOperand(0);
// FIXME: Really support non-0 null pointers.
SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
// flat -> local/private
if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
return DAG.getNode(ISD::SELECT, SL, MVT::i32,
NonNull, Ptr, SegmentNullPtr);
}
}
// local/private -> flat
if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
SDValue NonNull
= DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
SDValue CvtPtr
= DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
FlatNullPtr);
}
}
// global <-> flat are no-ops and never emitted.
const MachineFunction &MF = DAG.getMachineFunction();
DiagnosticInfoUnsupported InvalidAddrSpaceCast(
*MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
DAG.getContext()->diagnose(InvalidAddrSpaceCast);
return DAG.getUNDEF(ASC->getValueType(0));
}
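
Ignoring the DAG node construction, the two directions above reduce to simple integer conversions: segment-to-flat splices the 32-bit segment offset below the aperture in the high half, flat-to-segment truncates, and the segment null (-1) and flat null (0) map to each other. A minimal C++ sketch under those assumptions (names are illustrative, not part of the patch):

#include <cstdint>

static const uint32_t SegmentNull = UINT32_MAX; // -1 in the 32-bit segments
static const uint64_t FlatNull = 0;

// local/private -> flat: place the aperture in the high 32 bits.
static uint64_t segmentToFlat(uint32_t SegPtr, uint32_t ApertureHi) {
  if (SegPtr == SegmentNull)
    return FlatNull;
  return (uint64_t(ApertureHi) << 32) | SegPtr;
}

// flat -> local/private: keep only the low 32 bits.
static uint32_t flatToSegment(uint64_t FlatPtr) {
  if (FlatPtr == FlatNull)
    return SegmentNull;
  return uint32_t(FlatPtr);
}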
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {


@@ -45,6 +45,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
SDValue performUCharToFloatCombine(SDNode *N,


@@ -270,6 +270,10 @@ public:
ScratchWaveOffsetReg = Reg;
}
unsigned getQueuePtrUserSGPR() const {
return QueuePtrUserSGPR;
}
bool hasSpilledSGPRs() const {
return HasSpilledSGPRs;
}


@@ -1,18 +1,208 @@
; RUN: not llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
; ERROR: addrspacecast not implemented
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1
; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
; XUN: llc -O0 -march=amdgcn -mcpu=bonaire -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-NO-PROMOTE %s
; XUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=+promote-alloca < %s | FileCheck -check-prefix=CHECK -check-prefix=CHECK-PROMOTE %s
; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
ret void
}
; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 1
; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; HSA-DAG: v_cmp_ne_i32_e64 vcc, -1, [[PTR]]
; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
%stof = addrspacecast i32* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
ret void
}
; no-op
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
ret void
}
; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
%ld = load volatile i32, i32 addrspace(4)* %stof
ret void
}
; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0
; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
store volatile i32 0, i32 addrspace(3)* %ftos
ret void
}
; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0
; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
; HSA-DAG: v_cmp_ne_i64_e64 vcc, 0, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
store volatile i32 0, i32* %ftos
ret void
}
; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
store volatile i32 0, i32 addrspace(1)* %ftos
ret void
}
; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
load volatile i32, i32 addrspace(2)* %ftos
ret void
}
; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_group_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
store i32 7, i32 addrspace(4)* %cast
ret void
}
; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_0_flat_to_group_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
store i32 7, i32 addrspace(3)* %cast
ret void
}
; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_neg1_group_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
store i32 7, i32 addrspace(4)* %cast
ret void
}
; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_neg1_flat_to_group_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
store i32 7, i32 addrspace(3)* %cast
ret void
}
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_private_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32* null to i32 addrspace(4)*
store i32 7, i32 addrspace(4)* %cast
ret void
}
; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
define void @cast_0_flat_to_private_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
store i32 7, i32* %cast
ret void
}
; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.
; CHECK-LABEL: {{^}}branch_use_flat_i32:
; CHECK: flat_store_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
; CHECK: s_endpgm
; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
%cmp = icmp ne i32 %c, 0
@@ -34,20 +224,17 @@ end:
ret void
}
; TODO: This should not be zero when registers are used for small
; scratch allocations again.
; Check for prologue initializing special SGPRs pointing to scratch.
; CHECK-LABEL: {{^}}store_flat_scratch:
; CHECK: s_movk_i32 flat_scratch_lo, 0
; CHECK-NO-PROMOTE: s_movk_i32 flat_scratch_hi, 0x28{{$}}
; CHECK-PROMOTE: s_movk_i32 flat_scratch_hi, 0x0{{$}}
; CHECK: flat_store_dword
; CHECK: s_barrier
; CHECK: flat_load_dword
; HSA-LABEL: {{^}}store_flat_scratch:
; HSA: s_mov_b32 flat_scratch_lo, s9
; HSA: s_add_u32 [[ADD:s[0-9]+]], s8, s11
; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
; HSA: flat_store_dword
; HSA: s_barrier
; HSA: flat_load_dword
define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
%alloca = alloca i32, i32 9, align 4
%x = call i32 @llvm.amdgcn.workitem.id.x() #3
%x = call i32 @llvm.amdgcn.workitem.id.x() #2
%pptr = getelementptr i32, i32* %alloca, i32 %x
%fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
store i32 %x, i32 addrspace(4)* %fptr
@@ -59,8 +246,8 @@ define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
}
declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #3
declare i32 @llvm.amdgcn.workitem.id.x() #2
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #3 = { nounwind readnone }
attributes #2 = { nounwind readnone }


@@ -164,6 +164,63 @@ define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %stof
ret void
}
; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
%stof = addrspacecast i32* %ptr to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %stof
ret void
}
; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
store volatile i32 0, i32 addrspace(3)* %ftos
ret void
}
; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
store volatile i32 0, i32* %ftos
ret void
}
; No-op addrspacecast should not use queue ptr
; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
%stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %stof
ret void
}
; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
%stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
%ld = load volatile i32, i32 addrspace(4)* %stof
ret void
}
; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
store volatile i32 0, i32 addrspace(1)* %ftos
ret void
}
; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
%ld = load volatile i32, i32 addrspace(2)* %ftos
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }


@@ -0,0 +1,8 @@
; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
store volatile i32 0, i32 addrspace(1)* %stof
ret void
}