forked from OSchip/llvm-project
[PPCGCodeGeneration] Allow intrinsics within kernels.
- In D33414, if any function call was found within a kernel, we would bail out. - This is an over-approximation. This patch changes this by allowing the `llvm.sqrt.*` family of intrinsics. - This introduces an additional step when creating a separate llvm::Module for a kernel (GPUModule). We now copy function declarations from the original module to the new module. - We also populate IslNodeBuilder::ValueMap so that references to functions in the old module are replaced with references to the corresponding functions in the new module (GPUModule). Differential Revision: https://reviews.llvm.org/D34145 llvm-svn: 306284
This commit is contained in:
parent
256070d85c
commit
f291c8d510
|
@ -255,8 +255,12 @@ private:
|
|||
///
|
||||
/// @param Kernel The kernel to scan for llvm::Values
|
||||
///
|
||||
/// @returns A set of values referenced by the kernel.
|
||||
SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
|
||||
/// @returns A pair, whose first element contains the set of values
|
||||
/// referenced by the kernel, and whose second element contains the
|
||||
/// set of functions referenced by the kernel. All functions in the
|
||||
/// second set satisfy isValidFunctionInKernel.
|
||||
std::pair<SetVector<Value *>, SetVector<Function *>>
|
||||
getReferencesInKernel(ppcg_kernel *Kernel);
|
||||
|
||||
/// Compute the sizes of the execution grid for a given kernel.
|
||||
///
|
||||
|
@ -365,8 +369,11 @@ private:
|
|||
///
|
||||
/// @param Kernel The kernel to generate code for.
|
||||
/// @param SubtreeValues The set of llvm::Values referenced by this kernel.
|
||||
/// @param SubtreeFunctions The set of llvm::Functions referenced by this
|
||||
/// kernel.
|
||||
void createKernelFunction(ppcg_kernel *Kernel,
|
||||
SetVector<Value *> &SubtreeValues);
|
||||
SetVector<Value *> &SubtreeValues,
|
||||
SetVector<Function *> &SubtreeFunctions);
|
||||
|
||||
/// Create the declaration of a kernel function.
|
||||
///
|
||||
|
@ -389,6 +396,25 @@ private:
|
|||
/// @param Kernel The kernel to generate the intrinsic functions for.
|
||||
void insertKernelIntrinsics(ppcg_kernel *Kernel);
|
||||
|
||||
/// Setup the creation of functions referenced by the GPU kernel.
|
||||
///
|
||||
/// 1. Create new function declarations in GPUModule which are the same as
|
||||
/// SubtreeFunctions.
|
||||
///
|
||||
/// 2. Populate IslNodeBuilder::ValueMap with mappings from
|
||||
/// old functions (that come from the original module) to new functions
|
||||
/// (that are created within GPUModule). That way, we generate references
|
||||
/// to the correct function (in GPUModule) in BlockGenerator.
|
||||
///
|
||||
/// @see IslNodeBuilder::ValueMap
|
||||
/// @see BlockGenerator::GlobalMap
|
||||
/// @see BlockGenerator::getNewValue
|
||||
/// @see GPUNodeBuilder::getReferencesInKernel.
|
||||
///
|
||||
/// @param SubtreeFunctions The set of llvm::Functions referenced by
|
||||
/// this kernel.
|
||||
void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
|
||||
|
||||
/// Create a global-to-shared or shared-to-global copy statement.
|
||||
///
|
||||
/// @param CopyStmt The copy statement to generate code for
|
||||
|
@ -1109,7 +1135,40 @@ isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
|
|||
return isl_bool_true;
|
||||
}
|
||||
|
||||
SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
|
||||
/// Check if F is a function that we can code-generate in a GPU kernel.
|
||||
static bool isValidFunctionInKernel(llvm::Function *F) {
|
||||
assert(F && "F is an invalid pointer");
|
||||
// We string compare against the name of the function to allow
|
||||
// all variants of the intrinsic "llvm.sqrt.*"
|
||||
return F->isIntrinsic() && F->getName().startswith("llvm.sqrt");
|
||||
}
|
||||
|
||||
/// Do not take `Function` as a subtree value.
|
||||
///
|
||||
/// We try to take the reference of all subtree values and pass them along
|
||||
/// to the kernel from the host. Taking an address of any function and
|
||||
/// trying to pass it along is nonsensical. Only allow `Value`s that are not
|
||||
/// `Function`s.
|
||||
static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }
|
||||
|
||||
/// Return `Function`s from `RawSubtreeValues`.
|
||||
static SetVector<Function *>
|
||||
getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
|
||||
SetVector<Function *> SubtreeFunctions;
|
||||
for (Value *It : RawSubtreeValues) {
|
||||
Function *F = dyn_cast<Function>(It);
|
||||
if (F) {
|
||||
assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
|
||||
"this point if an invalid function "
|
||||
"were present in a kernel.");
|
||||
SubtreeFunctions.insert(F);
|
||||
}
|
||||
}
|
||||
return SubtreeFunctions;
|
||||
}
|
||||
|
||||
std::pair<SetVector<Value *>, SetVector<Function *>>
|
||||
GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
|
||||
SetVector<Value *> SubtreeValues;
|
||||
SetVector<const SCEV *> SCEVs;
|
||||
SetVector<const Loop *> Loops;
|
||||
|
@ -1146,7 +1205,19 @@ SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
|
|||
isl_id_free(Id);
|
||||
}
|
||||
|
||||
return SubtreeValues;
|
||||
// Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
|
||||
// SubtreeValues. This is important, because we should not lose any
|
||||
// SubtreeValues in the process of constructing the
|
||||
// "ValidSubtree{Values, Functions}" sets. Nor should the sets
|
||||
// ValidSubtree{Values, Functions} have any common element.
|
||||
auto ValidSubtreeValuesIt =
|
||||
make_filter_range(SubtreeValues, isValidSubtreeValue);
|
||||
SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
|
||||
ValidSubtreeValuesIt.end());
|
||||
SetVector<Function *> ValidSubtreeFunctions(
|
||||
getFunctionsFromRawSubtreeValues(SubtreeValues));
|
||||
|
||||
return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions);
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::clearDominators(Function *F) {
|
||||
|
@ -1353,6 +1424,21 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
|
|||
Launch + "_params_i8ptr", Location);
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::setupKernelSubtreeFunctions(
|
||||
SetVector<Function *> SubtreeFunctions) {
|
||||
for (auto Fn : SubtreeFunctions) {
|
||||
const std::string ClonedFnName = Fn->getName();
|
||||
Function *Clone = GPUModule->getFunction(ClonedFnName);
|
||||
if (!Clone)
|
||||
Clone =
|
||||
Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage,
|
||||
ClonedFnName, GPUModule.get());
|
||||
assert(Clone && "Expected cloned function to be initialized.");
|
||||
assert(ValueMap.find(Fn) == ValueMap.end() &&
|
||||
"Fn already present in ValueMap");
|
||||
ValueMap[Fn] = Clone;
|
||||
}
|
||||
}
|
||||
void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
|
||||
isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
|
||||
ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
|
||||
|
@ -1369,7 +1455,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
|
|||
Value *BlockDimX, *BlockDimY, *BlockDimZ;
|
||||
std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
|
||||
|
||||
SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
|
||||
SetVector<Value *> SubtreeValues;
|
||||
SetVector<Function *> SubtreeFunctions;
|
||||
std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel);
|
||||
|
||||
assert(Kernel->tree && "Device AST of kernel node is empty");
|
||||
|
||||
|
@ -1393,7 +1481,8 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
|
|||
SubtreeValues.insert(V);
|
||||
}
|
||||
|
||||
createKernelFunction(Kernel, SubtreeValues);
|
||||
createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
|
||||
setupKernelSubtreeFunctions(SubtreeFunctions);
|
||||
|
||||
create(isl_ast_node_copy(Kernel->tree));
|
||||
|
||||
|
@ -1721,8 +1810,9 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
|
|||
}
|
||||
}
|
||||
|
||||
void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
|
||||
SetVector<Value *> &SubtreeValues) {
|
||||
void GPUNodeBuilder::createKernelFunction(
|
||||
ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
|
||||
SetVector<Function *> &SubtreeFunctions) {
|
||||
std::string Identifier = "kernel_" + std::to_string(Kernel->id);
|
||||
GPUModule.reset(new Module(Identifier, Builder.getContext()));
|
||||
|
||||
|
@ -2611,9 +2701,18 @@ public:
|
|||
return isl_ast_expr_ge(Iterations, MinComputeExpr);
|
||||
}
|
||||
|
||||
/// Check whether the Block contains any Function value.
|
||||
bool ContainsFnPtrValInBlock(const BasicBlock *BB) {
|
||||
for (const Instruction &Inst : *BB)
|
||||
/// Check if the basic block contains a function we cannot codegen for GPU
|
||||
/// kernels.
|
||||
///
|
||||
/// If this basic block does something with a `Function` other than calling
|
||||
/// a function that we support in a kernel, return true.
|
||||
bool containsInvalidKernelFunctionInBllock(const BasicBlock *BB) {
|
||||
for (const Instruction &Inst : *BB) {
|
||||
const CallInst *Call = dyn_cast<CallInst>(&Inst);
|
||||
if (Call && isValidFunctionInKernel(Call->getCalledFunction())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (Value *SrcVal : Inst.operands()) {
|
||||
PointerType *p = dyn_cast<PointerType>(SrcVal->getType());
|
||||
if (!p)
|
||||
|
@ -2621,20 +2720,21 @@ public:
|
|||
if (isa<FunctionType>(p->getElementType()))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Return whether the Scop S has functions.
|
||||
bool ContainsFnPtr(const Scop &S) {
|
||||
/// Return whether the Scop S uses functions in a way that we do not support.
|
||||
bool containsInvalidKernelFunction(const Scop &S) {
|
||||
for (auto &Stmt : S) {
|
||||
if (Stmt.isBlockStmt()) {
|
||||
if (ContainsFnPtrValInBlock(Stmt.getBasicBlock()))
|
||||
if (containsInvalidKernelFunctionInBllock(Stmt.getBasicBlock()))
|
||||
return true;
|
||||
} else {
|
||||
assert(Stmt.isRegionStmt() &&
|
||||
"Stmt was neither block nor region statement");
|
||||
for (const BasicBlock *BB : Stmt.getRegion()->blocks())
|
||||
if (ContainsFnPtrValInBlock(BB))
|
||||
if (containsInvalidKernelFunctionInBllock(BB))
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -2708,13 +2808,18 @@ public:
|
|||
DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
|
||||
RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
|
||||
|
||||
// We currently do not support functions inside kernels, as code
|
||||
// generation will need to offload function calls to the kernel.
|
||||
// This may lead to a kernel trying to call a function on the host.
|
||||
// We currently do not support functions other than intrinsics inside
|
||||
// kernels, as code generation will need to offload function calls to the
|
||||
// kernel. This may lead to a kernel trying to call a function on the host.
|
||||
// This also allows us to prevent codegen from trying to take the
|
||||
// address of an intrinsic function to send to the kernel.
|
||||
if (ContainsFnPtr(CurrentScop))
|
||||
if (containsInvalidKernelFunction(CurrentScop)) {
|
||||
DEBUG(
|
||||
dbgs()
|
||||
<< "Scop contains function which cannot be materialised in a GPU "
|
||||
"kernel. Bailing out.\n";);
|
||||
return false;
|
||||
}
|
||||
|
||||
auto PPCGScop = createPPCGScop();
|
||||
auto PPCGProg = createPPCGProg(PPCGScop);
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s --check-prefix=SCOP
|
||||
; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s --check-prefix=KERNEL-IR
|
||||
; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR
|
||||
|
||||
; Test that we do recognise and codegen a kernel that has intrinsics.
|
||||
|
||||
; REQUIRES: pollyacc
|
||||
|
||||
; Check that we model the kernel as a scop.
|
||||
; SCOP: Function: f
|
||||
; SCOP-NEXT: Region: %entry.split---%for.end
|
||||
|
||||
; Check that the intrinsic call is present in the kernel IR.
|
||||
; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
|
||||
; KERNEL-IR: declare float @llvm.sqrt.f32(float) #2
|
||||
|
||||
; Check that kernel launch is generated in host IR.
|
||||
; The declaration would not be generated unless a call to a kernel exists.
|
||||
; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
|
||||
|
||||
|
||||
; void f(float *A, float *B, int N) {
|
||||
; for(int i = 0; i < N; i++) {
|
||||
; B[i] = sqrt(A[i]);
|
||||
; }
|
||||
; }
|
||||
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
define void @f(float* %A, float* %B, i32 %N) {
|
||||
entry:
|
||||
br label %entry.split
|
||||
|
||||
entry.split: ; preds = %entry
|
||||
%cmp1 = icmp sgt i32 %N, 0
|
||||
br i1 %cmp1, label %for.body.lr.ph, label %for.end
|
||||
|
||||
for.body.lr.ph: ; preds = %entry.split
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body.lr.ph, %for.body
|
||||
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
|
||||
%A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
|
||||
%A.arr.i.val = load float, float* %A.arr.i, align 4
|
||||
; Call to intrinsic that should be part of the kernel.
|
||||
%sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
|
||||
%B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
|
||||
store float %sqrt, float* %B.arr.i, align 4
|
||||
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%wide.trip.count = zext i32 %N to i64
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
|
||||
br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
|
||||
|
||||
for.cond.for.end_crit_edge: ; preds = %for.body
|
||||
br label %for.end
|
||||
|
||||
for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare float @llvm.sqrt.f32(float) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
Loading…
Reference in New Issue