2014-06-18 00:53:14 +08:00
|
|
|
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This pass eliminates allocas by either converting them into vectors or
|
|
|
|
// by migrating them to local address space.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "AMDGPU.h"
|
|
|
|
#include "AMDGPUSubtarget.h"
|
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
|
|
#include "llvm/IR/IRBuilder.h"
|
|
|
|
#include "llvm/IR/InstVisitor.h"
|
2016-01-30 13:19:45 +08:00
|
|
|
#include "llvm/IR/MDBuilder.h"
|
2014-06-18 00:53:14 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2015-03-24 02:07:13 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2014-06-18 00:53:14 +08:00
|
|
|
|
|
|
|
#define DEBUG_TYPE "amdgpu-promote-alloca"
|
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
2016-01-30 13:19:45 +08:00
|
|
|
// FIXME: This can create globals so should be a module pass.
|
2014-06-18 00:53:14 +08:00
|
|
|
class AMDGPUPromoteAlloca : public FunctionPass,
|
2016-01-30 13:19:45 +08:00
|
|
|
public InstVisitor<AMDGPUPromoteAlloca> {
|
|
|
|
private:
|
|
|
|
const TargetMachine *TM;
|
2014-06-18 00:53:14 +08:00
|
|
|
Module *Mod;
|
2016-01-30 13:19:45 +08:00
|
|
|
MDNode *MaxWorkGroupSizeRange;
|
|
|
|
|
|
|
|
// FIXME: This should be per-kernel.
|
2014-06-18 00:53:14 +08:00
|
|
|
int LocalMemAvailable;
|
|
|
|
|
2016-01-30 13:19:45 +08:00
|
|
|
bool IsAMDGCN;
|
|
|
|
bool IsAMDHSA;
|
|
|
|
|
|
|
|
std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
|
|
|
|
Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
public:
|
2016-01-30 13:19:45 +08:00
|
|
|
static char ID;
|
|
|
|
|
|
|
|
AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
|
|
|
|
FunctionPass(ID),
|
|
|
|
TM(TM_),
|
|
|
|
Mod(nullptr),
|
|
|
|
MaxWorkGroupSizeRange(nullptr),
|
|
|
|
LocalMemAvailable(0),
|
|
|
|
IsAMDGCN(false),
|
|
|
|
IsAMDHSA(false) { }
|
|
|
|
|
2014-09-03 19:41:21 +08:00
|
|
|
bool doInitialization(Module &M) override;
|
|
|
|
bool runOnFunction(Function &F) override;
|
2016-01-30 13:19:45 +08:00
|
|
|
|
|
|
|
const char *getPassName() const override {
|
|
|
|
return "AMDGPU Promote Alloca";
|
|
|
|
}
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
void visitAlloca(AllocaInst &I);
|
|
|
|
};
|
|
|
|
|
|
|
|
} // End anonymous namespace
|
|
|
|
|
|
|
|
char AMDGPUPromoteAlloca::ID = 0;
|
|
|
|
|
2016-01-30 13:19:45 +08:00
|
|
|
INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
|
|
|
|
"AMDGPU promote alloca to vector or LDS", false, false)
|
|
|
|
|
|
|
|
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
|
|
|
|
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
/// Caches per-module state: the module pointer, the range metadata used for
/// workitem ids / local sizes, and whether the target triple is amdgcn and/or
/// uses the AMDHSA OS.
///
/// \returns false always; no IR in the module is changed here.
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  // Without a target machine there is nothing to configure.
  if (TM == nullptr)
    return false;

  Mod = &M;

  // The maximum workitem id.
  //
  // FIXME: Should get as subtarget property. Usually runtime enforced max is
  // 256.
  const APInt RangeLo(32, 0);
  const APInt RangeHi(32, 2048);
  MDBuilder MDB(Mod->getContext());
  MaxWorkGroupSizeRange = MDB.createRange(RangeLo, RangeHi);

  const Triple &TargetTriple = TM->getTargetTriple();
  IsAMDGCN = (TargetTriple.getArch() == Triple::amdgcn);
  IsAMDHSA = (TargetTriple.getOS() == Triple::AMDHSA);

  return false;
}
|
|
|
|
|
|
|
|
/// Computes how much local memory is left for promotion in \p F and then
/// visits every instruction of the function so that visitAlloca() can try to
/// promote each alloca.
///
/// The budget starts at the subtarget's local memory size, drops to zero if
/// the function takes any pointer argument in the local address space, and is
/// reduced by the size of every local-address-space global that is used
/// directly by an instruction of \p F.
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (!TM)
    return false;

  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
  LocalMemAvailable = ST.getLocalMemorySize();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  FunctionType *FTy = F.getFunctionType();
  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
    Type *ParamTy = FTy->getParamType(i);
    if (ParamTy->isPointerTy() &&
        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
                      "local memory disabled.\n");
      break;
    }
  }

  if (LocalMemAvailable > 0) {
    const DataLayout &DL = Mod->getDataLayout();

    // Check how much local memory is being used by global objects.
    for (GlobalVariable &GV : Mod->globals()) {
      if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
        continue;

      // Walk the *users* of the global. Dereferencing a use_iterator yields
      // the Use, and casting that inspects the used value (the global itself)
      // rather than the instruction that uses it.
      //
      // NOTE: uses reached only through constant expressions are not counted.
      for (User *U : GV.users()) {
        Instruction *UseInst = dyn_cast<Instruction>(U);
        if (!UseInst || UseInst->getParent()->getParent() != &F)
          continue;

        // Charge the global once, no matter how many times F uses it.
        LocalMemAvailable -= DL.getTypeAllocSize(GV.getValueType());
        break;
      }
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");

  visit(F);

  // FIXME: visitAlloca() rewrites IR but has no way to report it, so this
  // still claims the function is unchanged.
  return false;
}
|
|
|
|
|
2016-01-30 13:19:45 +08:00
|
|
|
std::pair<Value *, Value *>
|
|
|
|
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
|
|
|
|
if (!IsAMDHSA) {
|
|
|
|
Function *LocalSizeYFn
|
|
|
|
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
|
|
|
|
Function *LocalSizeZFn
|
|
|
|
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
|
|
|
|
|
|
|
|
CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
|
|
|
|
CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
|
|
|
|
|
|
|
|
LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
|
|
|
|
LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
|
|
|
|
|
|
|
|
return std::make_pair(LocalSizeY, LocalSizeZ);
|
|
|
|
}
|
|
|
|
|
|
|
|
// We must read the size out of the dispatch pointer.
|
|
|
|
assert(IsAMDGCN);
|
|
|
|
|
|
|
|
// We are indexing into this struct, and want to extract the workgroup_size_*
|
|
|
|
// fields.
|
|
|
|
//
|
|
|
|
// typedef struct hsa_kernel_dispatch_packet_s {
|
|
|
|
// uint16_t header;
|
|
|
|
// uint16_t setup;
|
|
|
|
// uint16_t workgroup_size_x ;
|
|
|
|
// uint16_t workgroup_size_y;
|
|
|
|
// uint16_t workgroup_size_z;
|
|
|
|
// uint16_t reserved0;
|
|
|
|
// uint32_t grid_size_x ;
|
|
|
|
// uint32_t grid_size_y ;
|
|
|
|
// uint32_t grid_size_z;
|
|
|
|
//
|
|
|
|
// uint32_t private_segment_size;
|
|
|
|
// uint32_t group_segment_size;
|
|
|
|
// uint64_t kernel_object;
|
|
|
|
//
|
|
|
|
// #ifdef HSA_LARGE_MODEL
|
|
|
|
// void *kernarg_address;
|
|
|
|
// #elif defined HSA_LITTLE_ENDIAN
|
|
|
|
// void *kernarg_address;
|
|
|
|
// uint32_t reserved1;
|
|
|
|
// #else
|
|
|
|
// uint32_t reserved1;
|
|
|
|
// void *kernarg_address;
|
|
|
|
// #endif
|
|
|
|
// uint64_t reserved2;
|
|
|
|
// hsa_signal_t completion_signal; // uint64_t wrapper
|
|
|
|
// } hsa_kernel_dispatch_packet_t
|
|
|
|
//
|
|
|
|
Function *DispatchPtrFn
|
|
|
|
= Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
|
|
|
|
|
|
|
|
CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
|
|
|
|
DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
|
|
|
|
DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
|
|
|
|
|
|
|
|
// Size of the dispatch packet struct.
|
|
|
|
DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
|
|
|
|
|
|
|
|
Type *I32Ty = Type::getInt32Ty(Mod->getContext());
|
|
|
|
Value *CastDispatchPtr = Builder.CreateBitCast(
|
|
|
|
DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
|
|
|
|
|
|
|
|
// We could do a single 64-bit load here, but it's likely that the basic
|
|
|
|
// 32-bit and extract sequence is already present, and it is probably easier
|
|
|
|
// to CSE this. The loads should be mergable later anyway.
|
|
|
|
Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
|
|
|
|
LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
|
|
|
|
|
|
|
|
Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
|
|
|
|
LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
|
|
|
|
|
|
|
|
MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
|
|
|
|
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
|
|
|
|
LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
|
|
|
|
LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
|
|
|
|
|
|
|
|
// Extract y component. Upper half of LoadZU should be zero already.
|
|
|
|
Value *Y = Builder.CreateLShr(LoadXY, 16);
|
|
|
|
|
|
|
|
return std::make_pair(Y, LoadZU);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Emits a call to the workitem-id intrinsic for dimension \p N (0 = x,
/// 1 = y, 2 = z) at \p Builder's insertion point and returns the call.
///
/// The amdgcn intrinsics are used when the target arch is amdgcn, the r600
/// ones otherwise. The call is tagged with the workgroup size range metadata.
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
  const Intrinsic::ID GCNWorkitemIDs[] = {Intrinsic::amdgcn_workitem_id_x,
                                          Intrinsic::amdgcn_workitem_id_y,
                                          Intrinsic::amdgcn_workitem_id_z};
  const Intrinsic::ID R600WorkitemIDs[] = {Intrinsic::r600_read_tidig_x,
                                           Intrinsic::r600_read_tidig_y,
                                           Intrinsic::r600_read_tidig_z};

  if (N > 2)
    llvm_unreachable("invalid dimension");

  const Intrinsic::ID IntrID =
      IsAMDGCN ? GCNWorkitemIDs[N] : R600WorkitemIDs[N];

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *IdCall = Builder.CreateCall(WorkitemIdFn);
  IdCall->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return IdCall;
}
|
|
|
|
|
2015-08-02 06:20:21 +08:00
|
|
|
/// Returns the vector type with the same element type and element count as
/// the array type \p ArrayTy.
static VectorType *arrayTypeToVecType(Type *ArrayTy) {
  Type *ElemTy = ArrayTy->getArrayElementType();
  const auto NumElems = ArrayTy->getArrayNumElements();
  return VectorType::get(ElemTy, NumElems);
}
|
|
|
|
|
2014-10-05 00:55:56 +08:00
|
|
|
/// Returns the vector element index addressed by \p Ptr.
///
/// An alloca maps to a zero i32 constant. Anything else must be a GEP and is
/// looked up in \p GEPIdx; nullptr is returned when it has no entry.
static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  if (isa<AllocaInst>(Ptr)) {
    Type *I32Ty = Type::getInt32Ty(Ptr->getContext());
    return Constant::getNullValue(I32Ty);
  }

  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
  const auto Found = GEPIdx.find(GEP);
  if (Found == GEPIdx.end())
    return nullptr;
  return Found->second;
}
|
|
|
|
|
|
|
|
/// Returns the value that indexes the array element addressed by \p GEP, or
/// nullptr if the GEP is not of the supported form.
///
/// Only GEPs with exactly two indices whose first index is the constant zero
/// are handled; the second index is returned.
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME we only support simple cases
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}
|
|
|
|
|
2014-06-28 00:52:49 +08:00
|
|
|
// Not an instruction handled below to turn into a vector.
|
|
|
|
//
|
|
|
|
// TODO: Check isTriviallyVectorizable for calls and handle other
|
|
|
|
// instructions.
|
2015-07-29 02:47:00 +08:00
|
|
|
static bool canVectorizeInst(Instruction *Inst, User *User) {
|
2014-06-28 00:52:49 +08:00
|
|
|
switch (Inst->getOpcode()) {
|
|
|
|
case Instruction::Load:
|
|
|
|
case Instruction::BitCast:
|
|
|
|
case Instruction::AddrSpaceCast:
|
|
|
|
return true;
|
2015-07-29 02:47:00 +08:00
|
|
|
case Instruction::Store: {
|
|
|
|
// Must be the stored pointer operand, not a stored value.
|
|
|
|
StoreInst *SI = cast<StoreInst>(Inst);
|
|
|
|
return SI->getPointerOperand() == User;
|
|
|
|
}
|
2014-06-28 00:52:49 +08:00
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
/// Tries to turn the array alloca \p Alloca into a vector.
///
/// The alloca must be an array of at most four non-vector elements, and every
/// user of the alloca (and of its GEPs) must be accepted by
/// canVectorizeInst(). On success each collected load becomes a vector load
/// followed by an extractelement, and each collected store becomes a vector
/// load, an insertelement and a vector store; casts are left in place.
///
/// \returns true if the alloca's users were rewritten, false if nothing was
/// changed.
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  Type *AllocaTy = Alloca->getAllocatedType();

  DEBUG(dbgs() << "Alloca Candidate for vectorization \n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  if (!AllocaTy->isArrayTy() ||
      AllocaTy->getArrayElementType()->isVectorTy() ||
      AllocaTy->getArrayNumElements() > 4) {
    DEBUG(dbgs() << " Cannot convert type to vector\n");
    return false;
  }

  // First pass: collect everything that needs rewriting, bailing out before
  // any IR is modified if something cannot be handled.
  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << " Converting alloca to vector "
        << *AllocaTy << " -> " << *VectorTy << '\n');

  // Second pass: rewrite the collected instructions.
  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      break;

    default:
      Inst->dump();
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}
|
|
|
|
|
2014-11-01 04:52:04 +08:00
|
|
|
/// Recursively gathers into \p WorkList the users of \p Val that are calls or
/// that produce a pointer, skipping users already present in the list.
///
/// \returns false if a user is found that cannot be handled: a call without
/// a directly known callee, a ptrtoint, or a store that uses \p Val as
/// anything other than its pointer operand. \p WorkList may have been
/// partially filled in that case.
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  bool AllHandled = true;

  for (User *U : Val->users()) {
    const bool AlreadyQueued =
        std::find(WorkList.begin(), WorkList.end(), U) != WorkList.end();
    if (AlreadyQueued)
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(U)) {
      // TODO: We might be able to handle some cases where the callee is a
      // constantexpr bitcast of a function.
      if (!CI->getCalledFunction())
        return false;

      // Calls are queued but their own users are not followed.
      WorkList.push_back(U);
      continue;
    }

    Instruction *UseInst = dyn_cast<Instruction>(U);

    // FIXME: Correctly handle ptrtoint instructions.
    if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    // Reject if the stored value is not the pointer operand.
    StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst);
    if (SI && SI->getPointerOperand() != Val)
      return false;

    // Only pointer-producing users need to be rewritten and followed.
    if (!U->getType()->isPointerTy())
      continue;

    WorkList.push_back(U);
    if (!collectUsesWithPtrTypes(U, WorkList))
      AllHandled = false;
  }

  return AllHandled;
}
|
|
|
|
|
|
|
|
/// Tries to promote the static alloca \p I.
///
/// Vector promotion is attempted first. If that fails and enough local memory
/// is left, the alloca is replaced by a slot in a new local-address-space
/// global array with one element per workitem, indexed by a linearized
/// workitem id. All pointer-typed users are then retyped to the local address
/// space, non-intrinsic callees are redirected to a ".local" variant, and the
/// handled intrinsics are either re-created or dropped.
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
  if (!I.isStaticAlloca())
    return;

  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I))
    return;

  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

  // FIXME: This is the maximum work group size.  We should try to get
  // value from the reqd_work_group_size function attribute if it is
  // available.
  const unsigned WorkGroupSize = 256;

  // Keep the product in 64 bits so that a large alloca cannot be truncated
  // into a small (or negative) int and slip past the budget check.
  const uint64_t AllocaSize =
      static_cast<uint64_t>(WorkGroupSize) *
      Mod->getDataLayout().getTypeAllocSize(AllocaTy);

  if (LocalMemAvailable < 0 ||
      AllocaSize > static_cast<uint64_t>(LocalMemAvailable)) {
    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
    return;
  }

  std::vector<Value*> WorkList;

  if (!collectUsesWithPtrTypes(&I, WorkList)) {
    DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");
  // Safe narrowing: AllocaSize <= LocalMemAvailable was checked above.
  LocalMemAvailable -= static_cast<int>(AllocaSize);

  // One element of the allocated type per workitem.
  Type *GVTy = ArrayType::get(AllocaTy, WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::ExternalLinkage, nullptr, I.getName(),
      nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  // Linearized workitem id: TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ.
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  std::vector<Value*> Indices;
  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
  Indices.push_back(TID);

  // Retype the alloca before replacing it so that the replacement's type
  // matches.
  Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      // The operand's value should be corrected on its own.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);
      continue;
    }

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      // Ordinary call: redirect it to a "<callee>.local" function whose
      // signature is rebuilt from the call's current argument types.
      std::vector<Type*> ArgTypes;
      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
                                ArgIdx != ArgEnd; ++ArgIdx) {
        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
      }
      Function *F = Call->getCalledFunction();
      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
                                                F->isVarArg());
      Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
                                             NewType, F->getAttributes());
      Function *NewF = cast<Function>(C);
      Call->setCalledFunction(NewF);
      continue;
    }

    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      // Re-create the call so that it is built from the operands' new types.
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::invariant_group_barrier:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    default:
      Intr->dump();
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}
|
|
|
|
|
2016-01-30 13:19:45 +08:00
|
|
|
/// Factory for the pass: returns a newly allocated AMDGPUPromoteAlloca bound
/// to \p TM.
FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
  FunctionPass *Pass = new AMDGPUPromoteAlloca(TM);
  return Pass;
}
|