2014-06-18 00:53:14 +08:00
|
|
|
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This pass eliminates allocas by either converting them into vectors or
|
|
|
|
// by migrating them to local address space.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "AMDGPU.h"
|
|
|
|
#include "AMDGPUSubtarget.h"
|
2017-01-21 01:52:16 +08:00
|
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
|
|
#include "llvm/ADT/APInt.h"
|
|
|
|
#include "llvm/ADT/None.h"
|
|
|
|
#include "llvm/ADT/STLExtras.h"
|
|
|
|
#include "llvm/ADT/StringRef.h"
|
|
|
|
#include "llvm/ADT/Triple.h"
|
|
|
|
#include "llvm/ADT/Twine.h"
|
2017-01-25 03:06:28 +08:00
|
|
|
#include "llvm/Analysis/CaptureTracking.h"
|
2014-06-18 00:53:14 +08:00
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
2017-05-19 01:21:13 +08:00
|
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
2017-01-21 01:52:16 +08:00
|
|
|
#include "llvm/IR/Attributes.h"
|
|
|
|
#include "llvm/IR/BasicBlock.h"
|
|
|
|
#include "llvm/IR/Constant.h"
|
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DataLayout.h"
|
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
|
|
|
#include "llvm/IR/Function.h"
|
|
|
|
#include "llvm/IR/GlobalValue.h"
|
|
|
|
#include "llvm/IR/GlobalVariable.h"
|
2017-06-06 19:49:48 +08:00
|
|
|
#include "llvm/IR/IRBuilder.h"
|
2017-01-21 01:52:16 +08:00
|
|
|
#include "llvm/IR/Instruction.h"
|
|
|
|
#include "llvm/IR/Instructions.h"
|
2016-03-11 16:20:50 +08:00
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
2017-01-21 01:52:16 +08:00
|
|
|
#include "llvm/IR/Intrinsics.h"
|
|
|
|
#include "llvm/IR/LLVMContext.h"
|
|
|
|
#include "llvm/IR/Metadata.h"
|
|
|
|
#include "llvm/IR/Module.h"
|
|
|
|
#include "llvm/IR/Type.h"
|
|
|
|
#include "llvm/IR/User.h"
|
|
|
|
#include "llvm/IR/Value.h"
|
|
|
|
#include "llvm/Pass.h"
|
|
|
|
#include "llvm/Support/Casting.h"
|
2014-06-18 00:53:14 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
2017-01-21 01:52:16 +08:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
|
|
#include "llvm/Support/MathExtras.h"
|
2015-03-24 02:07:13 +08:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2017-01-21 01:52:16 +08:00
|
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cstdint>
|
|
|
|
#include <map>
|
|
|
|
#include <tuple>
|
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
2014-06-18 00:53:14 +08:00
|
|
|
|
|
|
|
#define DEBUG_TYPE "amdgpu-promote-alloca"
|
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
2018-02-17 03:14:17 +08:00
|
|
|
/// Boolean command-line option "disable-promote-alloca-to-vector" (false by
/// default). tryPromoteAllocaToVector() bails out immediately when it is set.
static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));
|
|
|
|
|
2016-01-30 13:19:45 +08:00
|
|
|
// FIXME: This can create globals so should be a module pass.
|
2016-03-11 16:20:50 +08:00
|
|
|
class AMDGPUPromoteAlloca : public FunctionPass {
|
2016-01-30 13:19:45 +08:00
|
|
|
private:
|
|
|
|
const TargetMachine *TM;
|
2017-01-21 01:52:16 +08:00
|
|
|
Module *Mod = nullptr;
|
|
|
|
const DataLayout *DL = nullptr;
|
2017-03-27 22:04:01 +08:00
|
|
|
AMDGPUAS AS;
|
2016-01-30 13:19:45 +08:00
|
|
|
|
|
|
|
// FIXME: This should be per-kernel.
|
2017-01-21 01:52:16 +08:00
|
|
|
uint32_t LocalMemLimit = 0;
|
|
|
|
uint32_t CurrentLocalMemUsage = 0;
|
2014-06-18 00:53:14 +08:00
|
|
|
|
2017-01-21 01:52:16 +08:00
|
|
|
bool IsAMDGCN = false;
|
|
|
|
bool IsAMDHSA = false;
|
2016-01-30 13:19:45 +08:00
|
|
|
|
|
|
|
std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
|
|
|
|
Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
|
|
|
|
|
2016-05-12 09:58:58 +08:00
|
|
|
/// BaseAlloca is the alloca root the search started from.
|
|
|
|
/// Val may be that alloca or a recursive user of it.
|
|
|
|
bool collectUsesWithPtrTypes(Value *BaseAlloca,
|
|
|
|
Value *Val,
|
|
|
|
std::vector<Value*> &WorkList) const;
|
|
|
|
|
|
|
|
/// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
|
|
|
|
/// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
|
|
|
|
/// Returns true if both operands are derived from the same alloca. Val should
|
|
|
|
/// be the same value as one of the input operands of UseInst.
|
|
|
|
bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
|
|
|
|
Instruction *UseInst,
|
|
|
|
int OpIdx0, int OpIdx1) const;
|
|
|
|
|
2017-05-24 04:25:41 +08:00
|
|
|
/// Check whether we have enough local memory for promotion.
|
|
|
|
bool hasSufficientLocalMem(const Function &F);
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
public:
|
2016-01-30 13:19:45 +08:00
|
|
|
static char ID;
|
|
|
|
|
2017-05-19 01:21:13 +08:00
|
|
|
AMDGPUPromoteAlloca() : FunctionPass(ID) {}
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2014-09-03 19:41:21 +08:00
|
|
|
bool doInitialization(Module &M) override;
|
|
|
|
bool runOnFunction(Function &F) override;
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2016-10-01 10:56:57 +08:00
|
|
|
StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
|
2016-01-30 13:19:45 +08:00
|
|
|
|
2017-05-24 04:25:41 +08:00
|
|
|
bool handleAlloca(AllocaInst &I, bool SufficientLDS);
|
2016-05-12 09:58:58 +08:00
|
|
|
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
|
|
AU.setPreservesCFG();
|
|
|
|
FunctionPass::getAnalysisUsage(AU);
|
|
|
|
}
|
2014-06-18 00:53:14 +08:00
|
|
|
};
|
|
|
|
|
2017-01-21 01:52:16 +08:00
|
|
|
} // end anonymous namespace
|
2014-06-18 00:53:14 +08:00
|
|
|
|
|
|
|
char AMDGPUPromoteAlloca::ID = 0;
|
|
|
|
|
2017-05-19 01:21:13 +08:00
|
|
|
INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
|
|
|
|
"AMDGPU promote alloca to vector or LDS", false, false)
|
2016-01-30 13:19:45 +08:00
|
|
|
|
|
|
|
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
/// Remember the module and its data layout for the later per-function runs.
/// Always returns false.
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  Mod = &M;
  DL = &M.getDataLayout();
  return false;
}
|
|
|
|
|
|
|
|
/// Run the promotion on every alloca found in the entry block of \p F.
///
/// Returns false without touching the function when the function is skipped,
/// when no TargetPassConfig is available, or when the subtarget has alloca
/// promotion disabled. Otherwise returns whether any handleAlloca() call
/// reported a change.
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  // Without a TargetPassConfig there is no target machine to query.
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;
  TM = &TPC->getTM<TargetMachine>();

  const Triple &TargetTriple = TM->getTargetTriple();
  IsAMDGCN = TargetTriple.getArch() == Triple::amdgcn;
  IsAMDHSA = TargetTriple.getOS() == Triple::AMDHSA;

  const AMDGPUSubtarget &Subtarget = TM->getSubtarget<AMDGPUSubtarget>(F);
  if (!Subtarget.isPromoteAllocaEnabled())
    return false;

  AS = AMDGPU::getAMDGPUAS(*F.getParent());

  const bool SufficientLDS = hasSufficientLocalMem(F);

  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();
  BasicBlock::iterator It = EntryBB.begin();
  BasicBlock::iterator End = EntryBB.end();
  while (It != End) {
    // Step past the instruction before handling it, so the iterator no
    // longer refers to the alloca while handleAlloca() works on it.
    Instruction &Current = *It;
    ++It;

    if (AllocaInst *AI = dyn_cast<AllocaInst>(&Current))
      Changed |= handleAlloca(*AI, SufficientLDS);
  }

  return Changed;
}
|
|
|
|
|
2016-01-30 13:19:45 +08:00
|
|
|
std::pair<Value *, Value *>
|
|
|
|
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
|
2017-04-13 04:48:56 +08:00
|
|
|
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
|
|
|
|
*Builder.GetInsertBlock()->getParent());
|
|
|
|
|
2016-01-30 13:19:45 +08:00
|
|
|
if (!IsAMDHSA) {
|
|
|
|
Function *LocalSizeYFn
|
|
|
|
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
|
|
|
|
Function *LocalSizeZFn
|
|
|
|
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
|
|
|
|
|
|
|
|
CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
|
|
|
|
CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
|
|
|
|
|
2017-04-13 04:48:56 +08:00
|
|
|
ST.makeLIDRangeMetadata(LocalSizeY);
|
|
|
|
ST.makeLIDRangeMetadata(LocalSizeZ);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
|
|
|
return std::make_pair(LocalSizeY, LocalSizeZ);
|
|
|
|
}
|
|
|
|
|
|
|
|
// We must read the size out of the dispatch pointer.
|
|
|
|
assert(IsAMDGCN);
|
|
|
|
|
|
|
|
// We are indexing into this struct, and want to extract the workgroup_size_*
|
|
|
|
// fields.
|
|
|
|
//
|
|
|
|
// typedef struct hsa_kernel_dispatch_packet_s {
|
|
|
|
// uint16_t header;
|
|
|
|
// uint16_t setup;
|
|
|
|
// uint16_t workgroup_size_x ;
|
|
|
|
// uint16_t workgroup_size_y;
|
|
|
|
// uint16_t workgroup_size_z;
|
|
|
|
// uint16_t reserved0;
|
|
|
|
// uint32_t grid_size_x ;
|
|
|
|
// uint32_t grid_size_y ;
|
|
|
|
// uint32_t grid_size_z;
|
|
|
|
//
|
|
|
|
// uint32_t private_segment_size;
|
|
|
|
// uint32_t group_segment_size;
|
|
|
|
// uint64_t kernel_object;
|
|
|
|
//
|
|
|
|
// #ifdef HSA_LARGE_MODEL
|
|
|
|
// void *kernarg_address;
|
|
|
|
// #elif defined HSA_LITTLE_ENDIAN
|
|
|
|
// void *kernarg_address;
|
|
|
|
// uint32_t reserved1;
|
|
|
|
// #else
|
|
|
|
// uint32_t reserved1;
|
|
|
|
// void *kernarg_address;
|
|
|
|
// #endif
|
|
|
|
// uint64_t reserved2;
|
|
|
|
// hsa_signal_t completion_signal; // uint64_t wrapper
|
|
|
|
// } hsa_kernel_dispatch_packet_t
|
|
|
|
//
|
|
|
|
Function *DispatchPtrFn
|
|
|
|
= Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
|
|
|
|
|
|
|
|
CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
|
Rename AttributeSet to AttributeList
Summary:
This class is a list of AttributeSetNodes corresponding the function
prototype of a call or function declaration. This class used to be
called ParamAttrListPtr, then AttrListPtr, then AttributeSet. It is
typically accessed by parameter and return value index, so
"AttributeList" seems like a more intuitive name.
Rename AttributeSetImpl to AttributeListImpl to follow suit.
It's useful to rename this class so that we can rename AttributeSetNode
to AttributeSet later. AttributeSet is the set of attributes that apply
to a single function, argument, or return value.
Reviewers: sanjoy, javed.absar, chandlerc, pete
Reviewed By: pete
Subscribers: pete, jholewinski, arsenm, dschuff, mehdi_amini, jfb, nhaehnle, sbc100, void, llvm-commits
Differential Revision: https://reviews.llvm.org/D31102
llvm-svn: 298393
2017-03-22 00:57:19 +08:00
|
|
|
DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
|
|
|
|
DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
|
|
|
// Size of the dispatch packet struct.
|
Rename AttributeSet to AttributeList
Summary:
This class is a list of AttributeSetNodes corresponding the function
prototype of a call or function declaration. This class used to be
called ParamAttrListPtr, then AttrListPtr, then AttributeSet. It is
typically accessed by parameter and return value index, so
"AttributeList" seems like a more intuitive name.
Rename AttributeSetImpl to AttributeListImpl to follow suit.
It's useful to rename this class so that we can rename AttributeSetNode
to AttributeSet later. AttributeSet is the set of attributes that apply
to a single function, argument, or return value.
Reviewers: sanjoy, javed.absar, chandlerc, pete
Reviewed By: pete
Subscribers: pete, jholewinski, arsenm, dschuff, mehdi_amini, jfb, nhaehnle, sbc100, void, llvm-commits
Differential Revision: https://reviews.llvm.org/D31102
llvm-svn: 298393
2017-03-22 00:57:19 +08:00
|
|
|
DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
|
|
|
Type *I32Ty = Type::getInt32Ty(Mod->getContext());
|
|
|
|
Value *CastDispatchPtr = Builder.CreateBitCast(
|
2017-03-27 22:04:01 +08:00
|
|
|
DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS));
|
2016-01-30 13:19:45 +08:00
|
|
|
|
|
|
|
// We could do a single 64-bit load here, but it's likely that the basic
|
|
|
|
// 32-bit and extract sequence is already present, and it is probably easier
|
|
|
|
// to CSE this. The loads should be mergable later anyway.
|
|
|
|
Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
|
|
|
|
LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
|
|
|
|
|
|
|
|
Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
|
|
|
|
LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
|
|
|
|
|
2017-01-21 01:52:16 +08:00
|
|
|
MDNode *MD = MDNode::get(Mod->getContext(), None);
|
2016-01-30 13:19:45 +08:00
|
|
|
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
|
|
|
|
LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
|
2017-04-13 04:48:56 +08:00
|
|
|
ST.makeLIDRangeMetadata(LoadZU);
|
2016-01-30 13:19:45 +08:00
|
|
|
|
|
|
|
// Extract y component. Upper half of LoadZU should be zero already.
|
|
|
|
Value *Y = Builder.CreateLShr(LoadXY, 16);
|
|
|
|
|
|
|
|
return std::make_pair(Y, LoadZU);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Emit a call to the workitem-id intrinsic for dimension \p N (0 = x, 1 = y,
/// 2 = z) at \p Builder's insertion point and return the call.
///
/// The amdgcn intrinsics are used when IsAMDGCN is set, the r600 ones
/// otherwise. Any other value of \p N is a programming error.
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
      *Builder.GetInsertBlock()->getParent());

  // One row per dimension; column 0 is the r600 intrinsic, column 1 the
  // amdgcn one.
  static const Intrinsic::ID WorkitemIDIntrinsics[3][2] = {
      {Intrinsic::r600_read_tidig_x, Intrinsic::amdgcn_workitem_id_x},
      {Intrinsic::r600_read_tidig_y, Intrinsic::amdgcn_workitem_id_y},
      {Intrinsic::r600_read_tidig_z, Intrinsic::amdgcn_workitem_id_z},
  };

  if (N > 2)
    llvm_unreachable("invalid dimension");

  const Intrinsic::ID IntrID = WorkitemIDIntrinsics[N][IsAMDGCN ? 1 : 0];

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *IdCall = Builder.CreateCall(WorkitemIdFn);
  ST.makeLIDRangeMetadata(IdCall);

  return IdCall;
}
|
|
|
|
|
2017-09-15 02:02:29 +08:00
|
|
|
/// Return the vector type that has the same element type and the same number
/// of elements as \p ArrayTy.
static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
  Type *EltTy = ArrayTy->getElementType();
  return VectorType::get(EltTy, ArrayTy->getNumElements());
}
|
|
|
|
|
2014-10-05 00:55:56 +08:00
|
|
|
/// Look up the vector index previously recorded for \p Ptr in \p GEPIdx.
///
/// \p Ptr must be a GetElementPtrInst (enforced by cast<>). Returns the
/// recorded index, or nullptr when the GEP has no entry in the map.
static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  auto Entry = GEPIdx.find(GEP);
  if (Entry == GEPIdx.end())
    return nullptr;
  return Entry->second;
}
|
|
|
|
|
|
|
|
/// Return the operand of \p GEP that can serve as a vector element index, or
/// nullptr if the GEP is not of the supported shape.
///
/// Only GEPs with exactly three operands (pointer plus two indices) whose
/// first index is the constant zero are accepted; the second index is
/// returned.
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME we only support simple cases
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
  const bool FirstIdxIsZero = FirstIdx && FirstIdx->isZero();
  if (!FirstIdxIsZero)
    return nullptr;

  return GEP->getOperand(2);
}
|
|
|
|
|
2014-06-28 00:52:49 +08:00
|
|
|
// Not an instruction handled below to turn into a vector.
|
|
|
|
//
|
|
|
|
// TODO: Check isTriviallyVectorizable for calls and handle other
|
|
|
|
// instructions.
|
2015-07-29 02:47:00 +08:00
|
|
|
static bool canVectorizeInst(Instruction *Inst, User *User) {
|
2014-06-28 00:52:49 +08:00
|
|
|
switch (Inst->getOpcode()) {
|
2017-05-13 04:31:12 +08:00
|
|
|
case Instruction::Load: {
|
|
|
|
LoadInst *LI = cast<LoadInst>(Inst);
|
2017-06-09 22:16:22 +08:00
|
|
|
// Currently only handle the case where the Pointer Operand is a GEP so check for that case.
|
|
|
|
return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile();
|
2017-05-13 04:31:12 +08:00
|
|
|
}
|
2014-06-28 00:52:49 +08:00
|
|
|
case Instruction::BitCast:
|
|
|
|
return true;
|
2015-07-29 02:47:00 +08:00
|
|
|
case Instruction::Store: {
|
2017-06-09 22:16:22 +08:00
|
|
|
// Must be the stored pointer operand, not a stored value, plus
|
|
|
|
// since it should be canonical form, the User should be a GEP.
|
2015-07-29 02:47:00 +08:00
|
|
|
StoreInst *SI = cast<StoreInst>(Inst);
|
2017-06-09 22:16:22 +08:00
|
|
|
return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile();
|
2015-07-29 02:47:00 +08:00
|
|
|
}
|
2014-06-28 00:52:49 +08:00
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-27 22:04:01 +08:00
|
|
|
/// Try to rewrite all accesses to the array alloca \p Alloca as whole-vector
/// loads and stores combined with extractelement / insertelement.
///
/// The alloca must have an array type with 2 to 16 elements of a valid vector
/// element type, and every user (directly, or through a simple GEP) must be
/// accepted by canVectorizeInst(). \p AS supplies the private address space
/// used for the vector pointer type. Returns true if the rewrite was done and
/// false, leaving the IR untouched, otherwise.
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
    return false;
  }

  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());

  LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
  // equivalent. Potentially these could also be promoted but we don't
  // currently handle this case
  const bool IsConvertible =
      AllocaTy &&
      AllocaTy->getNumElements() <= 16 &&
      AllocaTy->getNumElements() >= 2 &&
      VectorType::isValidElementType(AllocaTy->getElementType());
  if (!IsConvertible) {
    LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
    return false;
  }

  // Phase 1: verify every user and remember what has to be rewritten. Nothing
  // is modified until all users have been accepted.
  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);

    // Direct (non-GEP) users of the alloca.
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    Value *Index = GEPToVectorIndex(GEP);
    if (!Index) {
      LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
                        << '\n');
      return false;
    }
    GEPVectorIdx[GEP] = Index;

    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
                    << *VectorTy << '\n');

  // Phase 2: rewrite the collected loads and stores in place.
  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);

    if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
      // load elt  ==>  load whole vector + extractelement
      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
      Value *Index =
          calculateVectorIndex(Load->getPointerOperand(), GEPVectorIdx);

      Value *VecPtr = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *WholeVec = Builder.CreateLoad(VecPtr);
      Value *Element = Builder.CreateExtractElement(WholeVec, Index);
      Load->replaceAllUsesWith(Element);
      Load->eraseFromParent();
    } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
      // store elt  ==>  load whole vector + insertelement + store whole vector
      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
      Value *Index =
          calculateVectorIndex(Store->getPointerOperand(), GEPVectorIdx);

      Value *VecPtr = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *WholeVec = Builder.CreateLoad(VecPtr);
      Value *UpdatedVec = Builder.CreateInsertElement(
          WholeVec, Store->getValueOperand(), Index);
      Builder.CreateStore(UpdatedVec, VecPtr);
      Store->eraseFromParent();
    } else if (!isa<BitCastInst>(Inst) && !isa<AddrSpaceCastInst>(Inst)) {
      // Casts are left as they are; anything else should have been rejected
      // in phase 1.
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}
|
|
|
|
|
2016-02-03 03:18:53 +08:00
|
|
|
/// Return true if \p CI is a call to one of the intrinsics listed below.
/// Calls that are not intrinsic calls, and all other intrinsics, yield false.
static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  bool Promotable = false;
  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::objectsize:
    Promotable = true;
    break;
  default:
    break;
  }
  return Promotable;
}
|
|
|
|
|
2016-05-12 09:58:58 +08:00
|
|
|
/// Check the operand of \p Inst that is not \p Val.
///
/// \p OpIdx0 and \p OpIdx1 are the indices of the two pointer operands of
/// \p Inst, one of which is expected to be \p Val. Returns true when the
/// other operand is a null pointer constant or its underlying object is
/// \p BaseAlloca. Returns false when the underlying object is not an alloca
/// or is a different alloca.
bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
                                                          Value *Val,
                                                          Instruction *Inst,
                                                          int OpIdx0,
                                                          int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *FirstOp = Inst->getOperand(OpIdx0);
  Value *OtherOp = (FirstOp == Val) ? Inst->getOperand(OpIdx1) : FirstOp;

  if (isa<ConstantPointerNull>(OtherOp))
    return true;

  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (OtherObj == BaseAlloca)
    return true;

  LLVM_DEBUG(
      dbgs() << "Found a binary instruction with another alloca object\n");
  return false;
}
|
|
|
|
|
|
|
|
bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
|
|
|
|
Value *BaseAlloca,
|
|
|
|
Value *Val,
|
|
|
|
std::vector<Value*> &WorkList) const {
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
for (User *User : Val->users()) {
|
2016-08-12 06:21:41 +08:00
|
|
|
if (is_contained(WorkList, User))
|
2014-06-18 00:53:14 +08:00
|
|
|
continue;
|
2016-02-03 03:18:53 +08:00
|
|
|
|
2015-07-29 02:29:14 +08:00
|
|
|
if (CallInst *CI = dyn_cast<CallInst>(User)) {
|
2016-02-03 03:18:53 +08:00
|
|
|
if (!isCallPromotable(CI))
|
2015-07-29 02:29:14 +08:00
|
|
|
return false;
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
WorkList.push_back(User);
|
|
|
|
continue;
|
|
|
|
}
|
2014-11-01 04:52:04 +08:00
|
|
|
|
2016-05-12 09:58:58 +08:00
|
|
|
Instruction *UseInst = cast<Instruction>(User);
|
|
|
|
if (UseInst->getOpcode() == Instruction::PtrToInt)
|
2014-11-01 04:52:04 +08:00
|
|
|
return false;
|
|
|
|
|
2016-07-19 03:00:07 +08:00
|
|
|
if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
|
2016-05-19 07:20:24 +08:00
|
|
|
if (LI->isVolatile())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-05-12 09:58:58 +08:00
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
|
2016-03-24 07:17:29 +08:00
|
|
|
if (SI->isVolatile())
|
|
|
|
return false;
|
|
|
|
|
2015-07-29 02:47:00 +08:00
|
|
|
// Reject if the stored value is not the pointer operand.
|
|
|
|
if (SI->getPointerOperand() != Val)
|
|
|
|
return false;
|
2016-07-19 03:00:07 +08:00
|
|
|
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
|
2016-03-24 07:17:29 +08:00
|
|
|
if (RMW->isVolatile())
|
|
|
|
return false;
|
2016-07-19 03:00:07 +08:00
|
|
|
} else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
|
2016-03-24 07:17:29 +08:00
|
|
|
if (CAS->isVolatile())
|
|
|
|
return false;
|
2015-07-29 02:47:00 +08:00
|
|
|
}
|
|
|
|
|
2016-05-12 09:58:58 +08:00
|
|
|
// Only promote a select if we know that the other select operand
|
|
|
|
// is from another pointer that will also be promoted.
|
|
|
|
if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
|
|
|
|
if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
|
|
|
|
return false;
|
2016-05-18 23:57:21 +08:00
|
|
|
|
|
|
|
// May need to rewrite constant operands.
|
|
|
|
WorkList.push_back(ICmp);
|
2016-05-12 09:58:58 +08:00
|
|
|
}
|
|
|
|
|
2016-12-10 08:52:50 +08:00
|
|
|
if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
|
2017-01-25 03:06:28 +08:00
|
|
|
// Give up if the pointer may be captured.
|
|
|
|
if (PointerMayBeCaptured(UseInst, true, true))
|
|
|
|
return false;
|
2016-12-10 08:52:50 +08:00
|
|
|
// Don't collect the users of this.
|
|
|
|
WorkList.push_back(User);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
if (!User->getType()->isPointerTy())
|
|
|
|
continue;
|
2014-11-01 04:52:04 +08:00
|
|
|
|
2016-02-03 05:16:12 +08:00
|
|
|
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
|
|
|
|
// Be conservative if an address could be computed outside the bounds of
|
|
|
|
// the alloca.
|
|
|
|
if (!GEP->isInBounds())
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-05-12 09:58:58 +08:00
|
|
|
// Only promote a select if we know that the other select operand is from
|
|
|
|
// another pointer that will also be promoted.
|
|
|
|
if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
|
|
|
|
if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Repeat for phis.
|
|
|
|
if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
|
|
|
|
// TODO: Handle more complex cases. We should be able to replace loops
|
|
|
|
// over arrays.
|
|
|
|
switch (Phi->getNumIncomingValues()) {
|
|
|
|
case 1:
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
|
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
WorkList.push_back(User);
|
2016-05-12 09:58:58 +08:00
|
|
|
if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
|
2016-02-03 03:18:53 +08:00
|
|
|
return false;
|
2014-06-18 00:53:14 +08:00
|
|
|
}
|
2016-02-03 03:18:53 +08:00
|
|
|
|
|
|
|
return true;
|
2014-06-18 00:53:14 +08:00
|
|
|
}
|
|
|
|
|
2017-05-24 04:25:41 +08:00
|
|
|
bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
|
|
|
|
|
|
|
|
FunctionType *FTy = F.getFunctionType();
|
|
|
|
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
|
|
|
|
|
|
|
|
// If the function has any arguments in the local address space, then it's
|
|
|
|
// possible these arguments require the entire local memory space, so
|
|
|
|
// we cannot use local memory in the pass.
|
|
|
|
for (Type *ParamTy : FTy->params()) {
|
|
|
|
PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
|
|
|
|
if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
|
|
|
|
LocalMemLimit = 0;
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
|
|
|
|
"local memory disabled.\n");
|
2017-05-24 04:25:41 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
LocalMemLimit = ST.getLocalMemorySize();
|
|
|
|
if (LocalMemLimit == 0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
const DataLayout &DL = Mod->getDataLayout();
|
|
|
|
|
|
|
|
// Check how much local memory is being used by global objects
|
|
|
|
CurrentLocalMemUsage = 0;
|
|
|
|
for (GlobalVariable &GV : Mod->globals()) {
|
|
|
|
if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
for (const User *U : GV.users()) {
|
|
|
|
const Instruction *Use = dyn_cast<Instruction>(U);
|
|
|
|
if (!Use)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (Use->getParent()->getParent() == &F) {
|
|
|
|
unsigned Align = GV.getAlignment();
|
|
|
|
if (Align == 0)
|
|
|
|
Align = DL.getABITypeAlignment(GV.getValueType());
|
|
|
|
|
|
|
|
// FIXME: Try to account for padding here. The padding is currently
|
|
|
|
// determined from the inverse order of uses in the function. I'm not
|
|
|
|
// sure if the use list order is in any way connected to this, so the
|
|
|
|
// total reported size is likely incorrect.
|
|
|
|
uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
|
|
|
|
CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
|
|
|
|
CurrentLocalMemUsage += AllocSize;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
|
|
|
|
F);
|
|
|
|
|
|
|
|
// Restrict local memory usage so that we don't drastically reduce occupancy,
|
|
|
|
// unless it is already significantly reduced.
|
|
|
|
|
|
|
|
// TODO: Have some sort of hint or other heuristics to guess occupancy based
|
|
|
|
// on other factors..
|
|
|
|
unsigned OccupancyHint = ST.getWavesPerEU(F).second;
|
|
|
|
if (OccupancyHint == 0)
|
|
|
|
OccupancyHint = 7;
|
|
|
|
|
|
|
|
// Clamp to max value.
|
|
|
|
OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
|
|
|
|
|
|
|
|
// Check the hint but ignore it if it's obviously wrong from the existing LDS
|
|
|
|
// usage.
|
|
|
|
MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
|
|
|
|
|
|
|
|
|
|
|
|
// Round up to the next tier of usage.
|
|
|
|
unsigned MaxSizeWithWaveCount
|
|
|
|
= ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
|
|
|
|
|
|
|
|
// Program is possibly broken by using more local mem than available.
|
|
|
|
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
LocalMemLimit = MaxSizeWithWaveCount;
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
|
|
|
|
<< " bytes of LDS\n"
|
|
|
|
<< " Rounding size to " << MaxSizeWithWaveCount
|
|
|
|
<< " with a maximum occupancy of " << MaxOccupancy << '\n'
|
|
|
|
<< " and " << (LocalMemLimit - CurrentLocalMemUsage)
|
|
|
|
<< " available for promotion\n");
|
2017-05-24 04:25:41 +08:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-05-17 05:19:59 +08:00
|
|
|
// FIXME: Should try to pick the most likely to be profitable allocas first.
|
2017-05-24 04:25:41 +08:00
|
|
|
bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
|
2016-04-29 02:38:48 +08:00
|
|
|
// Array allocations are probably not worth handling, since an allocation of
|
|
|
|
// the array type is the canonical form.
|
|
|
|
if (!I.isStaticAlloca() || I.isArrayAllocation())
|
2017-05-24 04:25:41 +08:00
|
|
|
return false;
|
2015-08-27 02:37:13 +08:00
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
IRBuilder<> Builder(&I);
|
|
|
|
|
|
|
|
// First try to replace the alloca with a vector
|
|
|
|
Type *AllocaTy = I.getAllocatedType();
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
|
2014-06-18 00:53:14 +08:00
|
|
|
|
2017-05-24 04:25:41 +08:00
|
|
|
if (tryPromoteAllocaToVector(&I, AS))
|
|
|
|
return true; // Promoted to vector.
|
2014-06-18 00:53:14 +08:00
|
|
|
|
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use less registers, as we have to run more waves per SIMD.
This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults, to 256, as that has no wave restrictions.
Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug.
Reviewers: mareko, arsenm, tstellarAMD, nhaehnle
Subscribers: FireBurn, kerberizer, llvm-commits, arsenm
Differential Revision: http://reviews.llvm.org/D18340
Patch By: Bas Nieuwenhuizen
llvm-svn: 266337
2016-04-15 00:27:07 +08:00
|
|
|
const Function &ContainingFunction = *I.getParent()->getParent();
|
2017-05-03 02:33:18 +08:00
|
|
|
CallingConv::ID CC = ContainingFunction.getCallingConv();
|
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use less registers, as we have to run more waves per SIMD.
This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults, to 256, as that has no wave restrictions.
Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug.
Reviewers: mareko, arsenm, tstellarAMD, nhaehnle
Subscribers: FireBurn, kerberizer, llvm-commits, arsenm
Differential Revision: http://reviews.llvm.org/D18340
Patch By: Bas Nieuwenhuizen
llvm-svn: 266337
2016-04-15 00:27:07 +08:00
|
|
|
|
2016-07-18 17:02:47 +08:00
|
|
|
// Don't promote the alloca to LDS for shader calling conventions as the work
|
|
|
|
// item ID intrinsics are not supported for these calling conventions.
|
|
|
|
// Furthermore not all LDS is available for some of the stages.
|
2017-05-03 02:33:18 +08:00
|
|
|
switch (CC) {
|
|
|
|
case CallingConv::AMDGPU_KERNEL:
|
|
|
|
case CallingConv::SPIR_KERNEL:
|
|
|
|
break;
|
|
|
|
default:
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs()
|
|
|
|
<< " promote alloca to LDS not supported with calling convention.\n");
|
2017-05-24 04:25:41 +08:00
|
|
|
return false;
|
2017-05-03 02:33:18 +08:00
|
|
|
}
|
2016-07-18 17:02:47 +08:00
|
|
|
|
2017-05-24 04:25:41 +08:00
|
|
|
// Not likely to have sufficient local memory for promotion.
|
|
|
|
if (!SufficientLDS)
|
|
|
|
return false;
|
|
|
|
|
2016-09-07 04:22:28 +08:00
|
|
|
const AMDGPUSubtarget &ST =
|
|
|
|
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
|
|
|
|
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
|
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use less registers, as we have to run more waves per SIMD.
This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults, to 256, as that has no wave restrictions.
Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug.
Reviewers: mareko, arsenm, tstellarAMD, nhaehnle
Subscribers: FireBurn, kerberizer, llvm-commits, arsenm
Differential Revision: http://reviews.llvm.org/D18340
Patch By: Bas Nieuwenhuizen
llvm-svn: 266337
2016-04-15 00:27:07 +08:00
|
|
|
|
2016-05-17 05:19:59 +08:00
|
|
|
const DataLayout &DL = Mod->getDataLayout();
|
|
|
|
|
|
|
|
unsigned Align = I.getAlignment();
|
|
|
|
if (Align == 0)
|
|
|
|
Align = DL.getABITypeAlignment(I.getAllocatedType());
|
2014-06-18 00:53:14 +08:00
|
|
|
|
2016-05-17 05:19:59 +08:00
|
|
|
// FIXME: This computed padding is likely wrong since it depends on inverse
|
|
|
|
// usage order.
|
|
|
|
//
|
|
|
|
// FIXME: It is also possible that if we're allowed to use all of the memory
|
|
|
|
// could could end up using more than the maximum due to alignment padding.
|
|
|
|
|
|
|
|
uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
|
|
|
|
uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
|
|
|
|
NewSize += AllocSize;
|
|
|
|
|
|
|
|
if (NewSize > LocalMemLimit) {
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << " " << AllocSize
|
|
|
|
<< " bytes of local memory not available to promote\n");
|
2017-05-24 04:25:41 +08:00
|
|
|
return false;
|
2014-06-18 00:53:14 +08:00
|
|
|
}
|
|
|
|
|
2016-05-17 05:19:59 +08:00
|
|
|
CurrentLocalMemUsage = NewSize;
|
|
|
|
|
2014-11-01 04:52:04 +08:00
|
|
|
std::vector<Value*> WorkList;
|
|
|
|
|
2016-05-12 09:58:58 +08:00
|
|
|
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
|
2017-05-24 04:25:41 +08:00
|
|
|
return false;
|
2014-11-01 04:52:04 +08:00
|
|
|
}
|
|
|
|
|
2018-05-14 20:53:11 +08:00
|
|
|
LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
|
2014-06-18 00:53:14 +08:00
|
|
|
|
2016-02-06 03:47:23 +08:00
|
|
|
Function *F = I.getParent()->getParent();
|
|
|
|
|
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use less registers, as we have to run more waves per SIMD.
This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults, to 256, as that has no wave restrictions.
Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug.
Reviewers: mareko, arsenm, tstellarAMD, nhaehnle
Subscribers: FireBurn, kerberizer, llvm-commits, arsenm
Differential Revision: http://reviews.llvm.org/D18340
Patch By: Bas Nieuwenhuizen
llvm-svn: 266337
2016-04-15 00:27:07 +08:00
|
|
|
Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
|
2014-06-18 00:53:14 +08:00
|
|
|
GlobalVariable *GV = new GlobalVariable(
|
2016-02-06 03:47:23 +08:00
|
|
|
*Mod, GVTy, false, GlobalValue::InternalLinkage,
|
|
|
|
UndefValue::get(GVTy),
|
|
|
|
Twine(F->getName()) + Twine('.') + I.getName(),
|
|
|
|
nullptr,
|
|
|
|
GlobalVariable::NotThreadLocal,
|
2017-03-27 22:04:01 +08:00
|
|
|
AS.LOCAL_ADDRESS);
|
2016-06-15 05:01:22 +08:00
|
|
|
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
|
2016-02-06 03:47:23 +08:00
|
|
|
GV->setAlignment(I.getAlignment());
|
2014-06-18 00:53:14 +08:00
|
|
|
|
2016-01-30 13:19:45 +08:00
|
|
|
Value *TCntY, *TCntZ;
|
|
|
|
|
|
|
|
std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
|
|
|
|
Value *TIdX = getWorkitemID(Builder, 0);
|
|
|
|
Value *TIdY = getWorkitemID(Builder, 1);
|
|
|
|
Value *TIdZ = getWorkitemID(Builder, 2);
|
2014-06-18 00:53:14 +08:00
|
|
|
|
2016-02-03 03:18:48 +08:00
|
|
|
Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
|
2014-06-18 00:53:14 +08:00
|
|
|
Tmp0 = Builder.CreateMul(Tmp0, TIdX);
|
2016-02-03 03:18:48 +08:00
|
|
|
Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
|
2014-06-18 00:53:14 +08:00
|
|
|
Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
|
|
|
|
TID = Builder.CreateAdd(TID, TIdZ);
|
|
|
|
|
2016-02-03 03:18:48 +08:00
|
|
|
Value *Indices[] = {
|
|
|
|
Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
|
|
|
|
TID
|
|
|
|
};
|
2014-06-18 00:53:14 +08:00
|
|
|
|
2016-02-03 03:18:48 +08:00
|
|
|
Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
|
2014-06-18 00:53:14 +08:00
|
|
|
I.mutateType(Offset->getType());
|
|
|
|
I.replaceAllUsesWith(Offset);
|
|
|
|
I.eraseFromParent();
|
|
|
|
|
2016-02-03 03:32:35 +08:00
|
|
|
for (Value *V : WorkList) {
|
2014-06-18 00:53:14 +08:00
|
|
|
CallInst *Call = dyn_cast<CallInst>(V);
|
|
|
|
if (!Call) {
|
2016-05-18 23:57:21 +08:00
|
|
|
if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
|
|
|
|
Value *Src0 = CI->getOperand(0);
|
|
|
|
Type *EltTy = Src0->getType()->getPointerElementType();
|
2017-03-27 22:04:01 +08:00
|
|
|
PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
|
2016-05-18 23:57:21 +08:00
|
|
|
|
|
|
|
if (isa<ConstantPointerNull>(CI->getOperand(0)))
|
|
|
|
CI->setOperand(0, ConstantPointerNull::get(NewTy));
|
|
|
|
|
|
|
|
if (isa<ConstantPointerNull>(CI->getOperand(1)))
|
|
|
|
CI->setOperand(1, ConstantPointerNull::get(NewTy));
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
2014-09-15 23:41:44 +08:00
|
|
|
|
2016-12-10 08:52:50 +08:00
|
|
|
// The operand's value should be corrected on its own and we don't want to
|
|
|
|
// touch the users.
|
2014-09-15 23:41:44 +08:00
|
|
|
if (isa<AddrSpaceCastInst>(V))
|
|
|
|
continue;
|
|
|
|
|
2016-05-18 23:57:21 +08:00
|
|
|
Type *EltTy = V->getType()->getPointerElementType();
|
2017-03-27 22:04:01 +08:00
|
|
|
PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
|
2016-05-18 23:57:21 +08:00
|
|
|
|
2014-09-15 23:41:44 +08:00
|
|
|
// FIXME: It doesn't really make sense to try to do this for all
|
|
|
|
// instructions.
|
2014-06-18 00:53:14 +08:00
|
|
|
V->mutateType(NewTy);
|
2016-05-18 23:57:21 +08:00
|
|
|
|
|
|
|
// Adjust the types of any constant operands.
|
|
|
|
if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
|
|
|
|
if (isa<ConstantPointerNull>(SI->getOperand(1)))
|
|
|
|
SI->setOperand(1, ConstantPointerNull::get(NewTy));
|
|
|
|
|
|
|
|
if (isa<ConstantPointerNull>(SI->getOperand(2)))
|
|
|
|
SI->setOperand(2, ConstantPointerNull::get(NewTy));
|
|
|
|
} else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
|
|
|
|
for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
|
|
|
|
if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
|
|
|
|
Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-06-18 00:53:14 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-07-19 02:34:48 +08:00
|
|
|
IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
|
2014-06-18 00:53:14 +08:00
|
|
|
Builder.SetInsertPoint(Intr);
|
|
|
|
switch (Intr->getIntrinsicID()) {
|
|
|
|
case Intrinsic::lifetime_start:
|
|
|
|
case Intrinsic::lifetime_end:
|
|
|
|
// These intrinsics are for address space 0 only
|
|
|
|
Intr->eraseFromParent();
|
|
|
|
continue;
|
|
|
|
case Intrinsic::memcpy: {
|
|
|
|
MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
|
[AMDGPUPromoteAlloca] Replace deprecated memory intrinsic APIs (NFCI)
Summary:
This change is part of step five in the series of changes to remove alignment argument from
memcpy/memmove/memset in favour of alignment attributes. In particular, this changes the
AMDGPUPromoteAlloca pass to cease using:
1) The old getAlignment() API of MemoryIntrinsic in favour of getting source & dest specific
alignments through the new API.
2) The old IRBuilder createMemCpy/createMemMove single-alignment APIs in favour of the new
API that allows setting source and destination alignments independently.
Steps:
Step 1) Remove alignment parameter and create alignment parameter attributes for
memcpy/memmove/memset. ( rL322965, rC322964, rL322963 )
Step 2) Expand the IRBuilder API to allow creation of memcpy/memmove with differing
source and dest alignments. ( rL323597 )
Step 3) Update Clang to use the new IRBuilder API. ( rC323617 )
Step 4) Update Polly to use the new IRBuilder API. ( rL323618 )
Step 5) Update LLVM passes that create memcpy/memmove calls to use the new IRBuilder API,
and those that use use MemIntrinsicInst::[get|set]Alignment() to use [get|set]DestAlignment()
and [get|set]SourceAlignment() instead. ( rL323886, r323891, rL324148, rL324273, rL324278,
rL324384, rL324395, rL324402, rL324626, rL324642, rL324653, rL324654, rL324773 )
Step 6) Remove the single-alignment IRBuilder API for memcpy/memmove, and the
MemIntrinsicInst::[get|set]Alignment() methods.
Reference
http://lists.llvm.org/pipermail/llvm-dev/2015-August/089384.html
http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20151109/312083.html
llvm-svn: 324774
2018-02-10 05:56:15 +08:00
|
|
|
Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(),
|
|
|
|
MemCpy->getRawSource(), MemCpy->getSourceAlignment(),
|
|
|
|
MemCpy->getLength(), MemCpy->isVolatile());
|
2014-06-18 00:53:14 +08:00
|
|
|
Intr->eraseFromParent();
|
|
|
|
continue;
|
|
|
|
}
|
2016-02-03 04:28:10 +08:00
|
|
|
case Intrinsic::memmove: {
|
|
|
|
MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
|
[AMDGPUPromoteAlloca] Replace deprecated memory intrinsic APIs (NFCI)
Summary:
This change is part of step five in the series of changes to remove alignment argument from
memcpy/memmove/memset in favour of alignment attributes. In particular, this changes the
AMDGPUPromoteAlloca pass to cease using:
1) The old getAlignment() API of MemoryIntrinsic in favour of getting source & dest specific
alignments through the new API.
2) The old IRBuilder createMemCpy/createMemMove single-alignment APIs in favour of the new
API that allows setting source and destination alignments independently.
Steps:
Step 1) Remove alignment parameter and create alignment parameter attributes for
memcpy/memmove/memset. ( rL322965, rC322964, rL322963 )
Step 2) Expand the IRBuilder API to allow creation of memcpy/memmove with differing
source and dest alignments. ( rL323597 )
Step 3) Update Clang to use the new IRBuilder API. ( rC323617 )
Step 4) Update Polly to use the new IRBuilder API. ( rL323618 )
Step 5) Update LLVM passes that create memcpy/memmove calls to use the new IRBuilder API,
and those that use use MemIntrinsicInst::[get|set]Alignment() to use [get|set]DestAlignment()
and [get|set]SourceAlignment() instead. ( rL323886, r323891, rL324148, rL324273, rL324278,
rL324384, rL324395, rL324402, rL324626, rL324642, rL324653, rL324654, rL324773 )
Step 6) Remove the single-alignment IRBuilder API for memcpy/memmove, and the
MemIntrinsicInst::[get|set]Alignment() methods.
Reference
http://lists.llvm.org/pipermail/llvm-dev/2015-August/089384.html
http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20151109/312083.html
llvm-svn: 324774
2018-02-10 05:56:15 +08:00
|
|
|
Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(),
|
|
|
|
MemMove->getRawSource(), MemMove->getSourceAlignment(),
|
|
|
|
MemMove->getLength(), MemMove->isVolatile());
|
2016-02-03 04:28:10 +08:00
|
|
|
Intr->eraseFromParent();
|
|
|
|
continue;
|
|
|
|
}
|
2014-06-18 00:53:14 +08:00
|
|
|
case Intrinsic::memset: {
|
|
|
|
MemSetInst *MemSet = cast<MemSetInst>(Intr);
|
|
|
|
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
|
[AMDGPUPromoteAlloca] Replace deprecated memory intrinsic APIs (NFCI)
Summary:
This change is part of step five in the series of changes to remove alignment argument from
memcpy/memmove/memset in favour of alignment attributes. In particular, this changes the
AMDGPUPromoteAlloca pass to cease using:
1) The old getAlignment() API of MemoryIntrinsic in favour of getting source & dest specific
alignments through the new API.
2) The old IRBuilder createMemCpy/createMemMove single-alignment APIs in favour of the new
API that allows setting source and destination alignments independently.
Steps:
Step 1) Remove alignment parameter and create alignment parameter attributes for
memcpy/memmove/memset. ( rL322965, rC322964, rL322963 )
Step 2) Expand the IRBuilder API to allow creation of memcpy/memmove with differing
source and dest alignments. ( rL323597 )
Step 3) Update Clang to use the new IRBuilder API. ( rC323617 )
Step 4) Update Polly to use the new IRBuilder API. ( rL323618 )
Step 5) Update LLVM passes that create memcpy/memmove calls to use the new IRBuilder API,
and those that use use MemIntrinsicInst::[get|set]Alignment() to use [get|set]DestAlignment()
and [get|set]SourceAlignment() instead. ( rL323886, r323891, rL324148, rL324273, rL324278,
rL324384, rL324395, rL324402, rL324626, rL324642, rL324653, rL324654, rL324773 )
Step 6) Remove the single-alignment IRBuilder API for memcpy/memmove, and the
MemIntrinsicInst::[get|set]Alignment() methods.
Reference
http://lists.llvm.org/pipermail/llvm-dev/2015-August/089384.html
http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20151109/312083.html
llvm-svn: 324774
2018-02-10 05:56:15 +08:00
|
|
|
MemSet->getLength(), MemSet->getDestAlignment(),
|
2014-06-18 00:53:14 +08:00
|
|
|
MemSet->isVolatile());
|
|
|
|
Intr->eraseFromParent();
|
|
|
|
continue;
|
|
|
|
}
|
2016-01-23 03:47:54 +08:00
|
|
|
case Intrinsic::invariant_start:
|
|
|
|
case Intrinsic::invariant_end:
|
2018-05-03 19:03:01 +08:00
|
|
|
case Intrinsic::launder_invariant_group:
|
2016-01-23 03:47:54 +08:00
|
|
|
Intr->eraseFromParent();
|
|
|
|
// FIXME: I think the invariant marker should still theoretically apply,
|
|
|
|
// but the intrinsics need to be changed to accept pointers with any
|
|
|
|
// address space.
|
|
|
|
continue;
|
2016-02-03 04:28:10 +08:00
|
|
|
case Intrinsic::objectsize: {
|
|
|
|
Value *Src = Intr->getOperand(0);
|
|
|
|
Type *SrcTy = Src->getType()->getPointerElementType();
|
|
|
|
Function *ObjectSize = Intrinsic::getDeclaration(Mod,
|
|
|
|
Intrinsic::objectsize,
|
2017-03-27 22:04:01 +08:00
|
|
|
{ Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) }
|
2016-02-03 04:28:10 +08:00
|
|
|
);
|
|
|
|
|
2017-03-22 04:08:59 +08:00
|
|
|
CallInst *NewCall = Builder.CreateCall(
|
|
|
|
ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
|
2016-02-03 04:28:10 +08:00
|
|
|
Intr->replaceAllUsesWith(NewCall);
|
|
|
|
Intr->eraseFromParent();
|
|
|
|
continue;
|
|
|
|
}
|
2014-06-18 00:53:14 +08:00
|
|
|
default:
|
2017-01-28 10:02:38 +08:00
|
|
|
Intr->print(errs());
|
2014-06-18 00:53:14 +08:00
|
|
|
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
|
|
|
|
}
|
|
|
|
}
|
2017-05-24 04:25:41 +08:00
|
|
|
return true;
|
2014-06-18 00:53:14 +08:00
|
|
|
}
|
|
|
|
|
2017-05-19 01:21:13 +08:00
|
|
|
/// Factory entry point for this pass: allocates a new AMDGPUPromoteAlloca
/// with operator new and hands it back through the FunctionPass interface.
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  FunctionPass *PromoteAllocaPass = new AMDGPUPromoteAlloca();
  return PromoteAllocaPass;
}
|