[PPCGCodeGeneration] Allow intrinsics within kernels.

- In D33414, if any function call was found within a kernel, we would bail out.

- This is an over-approximation: this patch relaxes the check to allow calls
  to the `llvm.sqrt.*` family of intrinsics.
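
  As a sketch, the whitelist boils down to a name-prefix check on intrinsics
  (hypothetical helper name; the patch's actual check is
  isValidFunctionInKernel below):

      // Only intrinsics of the llvm.sqrt.* family (llvm.sqrt.f32,
      // llvm.sqrt.f64, ...) are permitted inside a kernel.
      static bool isAllowedInKernel(llvm::Function *F) {
        return F && F->isIntrinsic() && F->getName().startswith("llvm.sqrt");
      }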

- This introduces an additional step when creating a separate llvm::Module
  for a kernel (GPUModule): we now copy the declarations of the referenced
  functions from the original module into the new module.
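
  A minimal sketch of this step (hypothetical helper name; the patch does
  this in setupKernelSubtreeFunctions below):

      // Re-create the declaration (same type and name) inside GPUModule so
      // that kernel code can reference it without touching the host module.
      static llvm::Function *copyDeclToGPUModule(llvm::Function *Fn,
                                                 llvm::Module &GPUModule) {
        if (llvm::Function *F = GPUModule.getFunction(Fn->getName()))
          return F; // Declaration was already copied.
        return llvm::Function::Create(Fn->getFunctionType(),
                                      llvm::GlobalValue::ExternalLinkage,
                                      Fn->getName(), &GPUModule);
      }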

- We also populate IslNodeBuilder::ValueMap so that references to functions
  in the old module are replaced with references to the corresponding
  functions in the new module (GPUModule).
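
  Conceptually (variable names hypothetical), the mapping step is just:

      // Map the host-module declaration to its clone in GPUModule.
      // BlockGenerator::getNewValue consults this map, so the generated
      // kernel body references the clone instead of the host declaration.
      ValueMap[HostFn] = GPUModuleFn;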

Differential Revision: https://reviews.llvm.org/D34145

llvm-svn: 306284
Siddharth Bhat 2017-06-26 13:12:06 +00:00
parent 256070d85c
commit f291c8d510
2 changed files with 191 additions and 20 deletions

lib/CodeGen/PPCGCodeGeneration.cpp

@@ -255,8 +255,12 @@ private:
   ///
   /// @param Kernel The kernel to scan for llvm::Values
   ///
-  /// @returns A set of values referenced by the kernel.
-  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
+  /// @returns A pair, whose first element contains the set of values
+  /// referenced by the kernel, and whose second element contains the
+  /// set of functions referenced by the kernel. All functions in the
+  /// second set satisfy isValidFunctionInKernel.
+  std::pair<SetVector<Value *>, SetVector<Function *>>
+  getReferencesInKernel(ppcg_kernel *Kernel);
 
   /// Compute the sizes of the execution grid for a given kernel.
   ///
@@ -365,8 +369,11 @@ private:
   ///
   /// @param Kernel The kernel to generate code for.
   /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
+  /// @param SubtreeFunctions The set of llvm::Functions referenced by this
+  ///                         kernel.
   void createKernelFunction(ppcg_kernel *Kernel,
-                            SetVector<Value *> &SubtreeValues);
+                            SetVector<Value *> &SubtreeValues,
+                            SetVector<Function *> &SubtreeFunctions);
 
   /// Create the declaration of a kernel function.
   ///
@@ -389,6 +396,25 @@ private:
   /// @param Kernel The kernel to generate the intrinsic functions for.
   void insertKernelIntrinsics(ppcg_kernel *Kernel);
 
+  /// Set up the creation of functions referenced by the GPU kernel.
+  ///
+  /// 1. Create new function declarations in GPUModule which are the same as
+  ///    SubtreeFunctions.
+  ///
+  /// 2. Populate IslNodeBuilder::ValueMap with mappings from the old
+  ///    functions (that come from the original module) to the new functions
+  ///    (that are created within GPUModule), so that BlockGenerator emits
+  ///    references to the correct functions (in GPUModule).
+  ///
+  /// @see IslNodeBuilder::ValueMap
+  /// @see BlockGenerator::GlobalMap
+  /// @see BlockGenerator::getNewValue
+  /// @see GPUNodeBuilder::getReferencesInKernel
+  ///
+  /// @param SubtreeFunctions The set of llvm::Functions referenced by
+  ///                         this kernel.
+  void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
+
   /// Create a global-to-shared or shared-to-global copy statement.
   ///
   /// @param CopyStmt The copy statement to generate code for
@@ -1109,7 +1135,40 @@ isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
   return isl_bool_true;
 }
 
-SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
+/// Check if F is a function that we can code-generate in a GPU kernel.
+static bool isValidFunctionInKernel(llvm::Function *F) {
+  assert(F && "F is an invalid pointer");
+  // We string-compare against the name of the function to allow
+  // all variants of the intrinsic "llvm.sqrt.*".
+  return F->isIntrinsic() && F->getName().startswith("llvm.sqrt");
+}
+
+/// Do not take `Function` as a subtree value.
+///
+/// We try to take the reference of all subtree values and pass them along
+/// to the kernel from the host. Taking the address of a function and trying
+/// to pass it along is nonsensical, so only allow `Value`s that are not
+/// `Function`s.
+static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }
+
+/// Return the `Function`s from `RawSubtreeValues`.
+static SetVector<Function *>
+getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues) {
+  SetVector<Function *> SubtreeFunctions;
+  for (Value *It : RawSubtreeValues) {
+    Function *F = dyn_cast<Function>(It);
+    if (F) {
+      assert(isValidFunctionInKernel(F) && "Code should have bailed out by "
+                                           "this point if an invalid function "
+                                           "were present in a kernel.");
+      SubtreeFunctions.insert(F);
+    }
+  }
+  return SubtreeFunctions;
+}
+
+std::pair<SetVector<Value *>, SetVector<Function *>>
+GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
   SetVector<Value *> SubtreeValues;
   SetVector<const SCEV *> SCEVs;
   SetVector<const Loop *> Loops;
@@ -1146,7 +1205,19 @@ SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
     isl_id_free(Id);
   }
 
-  return SubtreeValues;
+  // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
+  // SubtreeValues. This is important: we must not lose any SubtreeValues
+  // while constructing the ValidSubtree{Values, Functions} sets, nor may
+  // the two sets have any element in common.
+  auto ValidSubtreeValuesIt =
+      make_filter_range(SubtreeValues, isValidSubtreeValue);
+  SetVector<Value *> ValidSubtreeValues(ValidSubtreeValuesIt.begin(),
+                                        ValidSubtreeValuesIt.end());
+  SetVector<Function *> ValidSubtreeFunctions(
+      getFunctionsFromRawSubtreeValues(SubtreeValues));
+  return std::make_pair(ValidSubtreeValues, ValidSubtreeFunctions);
 }
 
 void GPUNodeBuilder::clearDominators(Function *F) {
@@ -1353,6 +1424,21 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                        Launch + "_params_i8ptr", Location);
 }
 
+void GPUNodeBuilder::setupKernelSubtreeFunctions(
+    SetVector<Function *> SubtreeFunctions) {
+  for (auto Fn : SubtreeFunctions) {
+    const std::string ClonedFnName = Fn->getName();
+    Function *Clone = GPUModule->getFunction(ClonedFnName);
+    if (!Clone)
+      Clone =
+          Function::Create(Fn->getFunctionType(), GlobalValue::ExternalLinkage,
+                           ClonedFnName, GPUModule.get());
+    assert(Clone && "Expected cloned function to be initialized.");
+    assert(ValueMap.find(Fn) == ValueMap.end() &&
+           "Fn already present in ValueMap");
+    ValueMap[Fn] = Clone;
+  }
+}
+
 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
   ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
@@ -1369,7 +1455,9 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   Value *BlockDimX, *BlockDimY, *BlockDimZ;
   std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
 
-  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
+  SetVector<Value *> SubtreeValues;
+  SetVector<Function *> SubtreeFunctions;
+  std::tie(SubtreeValues, SubtreeFunctions) = getReferencesInKernel(Kernel);
 
   assert(Kernel->tree && "Device AST of kernel node is empty");
@@ -1393,7 +1481,8 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
     SubtreeValues.insert(V);
   }
 
-  createKernelFunction(Kernel, SubtreeValues);
+  createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
+  setupKernelSubtreeFunctions(SubtreeFunctions);
 
   create(isl_ast_node_copy(Kernel->tree));
@@ -1721,8 +1810,9 @@ void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
   }
 }
 
-void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
-                                          SetVector<Value *> &SubtreeValues) {
+void GPUNodeBuilder::createKernelFunction(
+    ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
+    SetVector<Function *> &SubtreeFunctions) {
   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
   GPUModule.reset(new Module(Identifier, Builder.getContext()));
@@ -2611,9 +2701,21 @@ public:
     return isl_ast_expr_ge(Iterations, MinComputeExpr);
   }
 
-  /// Check whether the Block contains any Function value.
-  bool ContainsFnPtrValInBlock(const BasicBlock *BB) {
-    for (const Instruction &Inst : *BB)
+  /// Check if the basic block contains a function we cannot codegen for GPU
+  /// kernels.
+  ///
+  /// If this basic block does something with a `Function` other than calling
+  /// a function that we support in a kernel, return true.
+  bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB) {
+    for (const Instruction &Inst : *BB) {
+      const CallInst *Call = dyn_cast<CallInst>(&Inst);
+      // Calls to supported intrinsics are fine; getCalledFunction() is null
+      // for indirect calls, which the operand check below rejects.
+      if (Call && Call->getCalledFunction() &&
+          isValidFunctionInKernel(Call->getCalledFunction())) {
+        continue;
+      }
+
       for (Value *SrcVal : Inst.operands()) {
         PointerType *p = dyn_cast<PointerType>(SrcVal->getType());
         if (!p)
@@ -2621,20 +2720,21 @@ public:
         if (isa<FunctionType>(p->getElementType()))
           return true;
       }
+    }
 
     return false;
   }
 
-  /// Return whether the Scop S has functions.
-  bool ContainsFnPtr(const Scop &S) {
+  /// Return whether the Scop S uses functions in a way that we do not support.
+  bool containsInvalidKernelFunction(const Scop &S) {
     for (auto &Stmt : S) {
       if (Stmt.isBlockStmt()) {
-        if (ContainsFnPtrValInBlock(Stmt.getBasicBlock()))
+        if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock()))
           return true;
       } else {
         assert(Stmt.isRegionStmt() &&
                "Stmt was neither block nor region statement");
         for (const BasicBlock *BB : Stmt.getRegion()->blocks())
-          if (ContainsFnPtrValInBlock(BB))
+          if (containsInvalidKernelFunctionInBlock(BB))
             return true;
       }
     }
@@ -2708,13 +2808,18 @@ public:
     DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
     RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
 
-    // We currently do not support functions inside kernels, as code
-    // generation will need to offload function calls to the kernel.
-    // This may lead to a kernel trying to call a function on the host.
-    if (ContainsFnPtr(CurrentScop))
+    // We currently do not support functions other than intrinsics inside
+    // kernels, as code generation will need to offload function calls to the
+    // kernel. This may lead to a kernel trying to call a function on the host.
+    // This also allows us to prevent codegen from trying to take the
+    // address of an intrinsic function to send to the kernel.
+    if (containsInvalidKernelFunction(CurrentScop)) {
+      DEBUG(
+          dbgs()
+          << "Scop contains function which cannot be materialised in a GPU "
+             "kernel. Bailing out.\n");
       return false;
+    }
 
     auto PPCGScop = createPPCGScop();
     auto PPCGProg = createPPCGProg(PPCGScop);

test/GPGPU/intrinsic-copied-into-kernel.ll (new file)

@@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -analyze -polly-scops < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadPolly -analyze -polly-codegen-ppcg -polly-acc-dump-kernel-ir < %s | FileCheck %s --check-prefix=KERNEL-IR
+; RUN: opt %loadPolly -S -polly-codegen-ppcg < %s | FileCheck %s --check-prefix=HOST-IR
+
+; Test that we do recognise and codegen a kernel that has intrinsics.
+
+; REQUIRES: pollyacc
+
+; Check that we model the kernel as a scop.
+; SCOP:      Function: f
+; SCOP-NEXT: Region: %entry.split---%for.end
+
+; Check that the intrinsic call is present in the kernel IR.
+; KERNEL-IR: %p_sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val_p_scalar_)
+; KERNEL-IR: declare float @llvm.sqrt.f32(float) #2
+
+; Check that the kernel launch is generated in the host IR.
+; (The declare would not be generated unless a call to a kernel exists.)
+; HOST-IR: declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)
+
+; void f(float *A, float *B, int N) {
+;   for(int i = 0; i < N; i++) {
+;     B[i] = sqrt(A[i]);
+;   }
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(float* %A, float* %B, i32 %N) {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %cmp1 = icmp sgt i32 %N, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry.split
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %A.arr.i = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %A.arr.i.val = load float, float* %A.arr.i, align 4
+  ; Call to an intrinsic that should become part of the kernel.
+  %sqrt = tail call float @llvm.sqrt.f32(float %A.arr.i.val)
+  %B.arr.i = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  store float %sqrt, float* %B.arr.i, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %wide.trip.count = zext i32 %N to i64
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.sqrt.f32(float) #0
+
+attributes #0 = { nounwind readnone }