forked from OSchip/llvm-project
GPGPU: create kernel function skeleton
Create for each kernel a separate LLVM-IR module containing a single function marked as kernel function and taking one pointer for each array referenced by this kernel. Add debugging output to verify the kernels are generated correctly. llvm-svn: 275952
This commit is contained in:
parent
651777da8b
commit
32837fe313
|
@ -51,6 +51,11 @@ static cl::opt<bool>
|
|||
cl::desc("Dump C code describing the GPU mapping"), cl::Hidden,
|
||||
cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
|
||||
|
||||
// Hidden command-line flag (off by default). When it is set,
// GPUNodeBuilder::finalizeKernelFunction() prints the LLVM-IR module of each
// generated kernel.
static cl::opt<bool>
    DumpKernelIR("polly-acc-dump-kernel-ir",
                 cl::desc("Dump the kernel LLVM-IR"), cl::Hidden,
                 cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
|
||||
|
||||
/// Create the ast expressions for a ScopStmt.
|
||||
///
|
||||
/// This function is a callback to generate the ast expressions for each
|
||||
|
@ -80,10 +85,18 @@ class GPUNodeBuilder : public IslNodeBuilder {
|
|||
public:
|
||||
GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
|
||||
const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
|
||||
DominatorTree &DT, Scop &S)
|
||||
: IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S) {}
|
||||
DominatorTree &DT, Scop &S, gpu_prog *Prog)
|
||||
: IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {}
|
||||
|
||||
private:
|
||||
/// A module containing GPU code.
|
||||
///
|
||||
/// This pointer is only set in case we are currently generating GPU code.
|
||||
std::unique_ptr<Module> GPUModule;
|
||||
|
||||
/// The GPU program we generate code for.
|
||||
gpu_prog *Prog;
|
||||
|
||||
/// Create code for user-defined AST nodes.
|
||||
///
|
||||
/// These AST nodes can be of type:
|
||||
|
@ -94,13 +107,145 @@ private:
|
|||
///
|
||||
/// @param UserStmt The ast node to generate code for.
|
||||
virtual void createUser(__isl_take isl_ast_node *UserStmt);
|
||||
|
||||
/// Create GPU kernel.
|
||||
///
|
||||
/// Code generate the kernel described by @p KernelStmt.
|
||||
///
|
||||
/// @param KernelStmt The ast node to generate kernel code for.
|
||||
void createKernel(__isl_take isl_ast_node *KernelStmt);
|
||||
|
||||
/// Create kernel function.
|
||||
///
|
||||
/// Create a kernel function located in a newly created module that can serve
|
||||
/// as target for device code generation. Set the Builder to point to the
|
||||
/// start block of this newly created function.
|
||||
///
|
||||
/// @param Kernel The kernel to generate code for.
|
||||
void createKernelFunction(ppcg_kernel *Kernel);
|
||||
|
||||
/// Create the declaration of a kernel function.
|
||||
///
|
||||
/// The kernel function takes as arguments:
|
||||
///
|
||||
/// - One i8 pointer for each external array reference used in the kernel.
|
||||
/// - Host iterators (TODO)
|
||||
/// - Parameters (TODO)
|
||||
/// - Other LLVM Value references (TODO)
|
||||
///
|
||||
/// @param Kernel The kernel to generate the function declaration for.
|
||||
/// @returns The newly declared function.
|
||||
Function *createKernelFunctionDecl(ppcg_kernel *Kernel);
|
||||
|
||||
/// Finalize the generation of the kernel function.
|
||||
///
|
||||
/// Free the LLVM-IR module corresponding to the kernel and -- if requested --
|
||||
/// dump its IR to stdout.
|
||||
void finalizeKernelFunction();
|
||||
};
|
||||
|
||||
/// Create code for a user-defined AST node.
///
/// The name of the id found in the first operand of the node's expression
/// decides what is done: a node named "kernel" is handed over to
/// createKernel(), any other node is freed without emitting code.
///
/// @param UserStmt The ast node to generate code for. Ownership is taken:
///                 the node is either passed on to createKernel() or freed
///                 here.
void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);

  // Inspect the name while we still hold our own reference to Id. Reading it
  // after isl_id_free() would rely on some other object keeping the id alive.
  const bool IsKernel = !strcmp(isl_id_get_name(Id), "kernel");

  // None of the temporaries is needed past this point, so release them once
  // here instead of on every exit path.
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);
  isl_ast_expr_free(Expr);

  if (IsKernel) {
    // createKernel() takes ownership of UserStmt.
    createKernel(UserStmt);
    return;
  }

  isl_ast_node_free(UserStmt);
}
|
||||
|
||||
/// Generate the device-side skeleton for one kernel AST node.
///
/// The ppcg_kernel is read from the user pointer of the node's annotation.
/// A kernel function is then created and finalized, and the Builder is put
/// back at the host instruction it pointed to on entry.
///
/// @param KernelStmt The ast node to generate kernel code for; it is freed
///                   by this function.
void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
  isl_id *Annotation = isl_ast_node_get_annotation(KernelStmt);
  auto *Kernel = static_cast<ppcg_kernel *>(isl_id_get_user(Annotation));
  isl_id_free(Annotation);
  isl_ast_node_free(KernelStmt);

  assert(Kernel->tree && "Device AST of kernel node is empty");

  // createKernelFunction() moves the Builder into the kernel function, so
  // remember where host code generation has to continue.
  Instruction *HostPosition = &*Builder.GetInsertPoint();

  createKernelFunction(Kernel);

  Builder.SetInsertPoint(HostPosition);

  finalizeKernelFunction();
}
|
||||
|
||||
/// Build the DataLayout string used for NVPTX kernel modules.
///
/// The result always starts with "e" and ends with
/// "-i64:64-v16:16-v32:32-n16:32:64"; for a non-64-bit target the pointer
/// specification "-p:32:32" is inserted in between.
///
/// @param is64Bit Are we looking for a 64 bit architecture?
/// @returns The assembled data layout string.
static std::string computeNVPTXDataLayout(bool is64Bit) {
  const char *PointerSpec = is64Bit ? "" : "-p:32:32";

  std::string Layout("e");
  Layout.append(PointerSpec);
  Layout.append("-i64:64-v16:16-v32:32-n16:32:64");
  return Layout;
}
|
||||
|
||||
/// Create the declaration of the kernel function for @p Kernel.
///
/// The function is named "kernel_<id>", returns void, has external linkage,
/// uses the PTX kernel calling convention and is inserted into GPUModule.
/// It receives one i8 pointer for every array of Prog for which
/// ppcg_kernel_requires_array_argument() holds; each of these arguments is
/// named after the corresponding entry of Prog->array.
///
/// @param Kernel The kernel to generate the function declaration for.
/// @returns The newly declared function.
Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
  std::vector<Type *> Args;
  std::string Identifier = "kernel_" + std::to_string(Kernel->id);

  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Args.push_back(Builder.getInt8PtrTy());
  }

  auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
  auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                              GPUModule.get());
  FN->setCallingConv(CallingConv::PTX_Kernel);

  // Name the arguments by walking the arrays with exactly the same bound and
  // filter as the loop that created them. Using a different array count here
  // could advance Arg past the last argument or leave arguments unnamed.
  auto Arg = FN->arg_begin();
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    Arg->setName(Prog->array[i].name);
    ++Arg;
  }

  return FN;
}
|
||||
|
||||
/// Set up a fresh module holding the function for @p Kernel.
///
/// GPUModule is replaced by a new module named "kernel_<id>" that targets
/// nvptx64-nvidia-cuda with the 64 bit NVPTX data layout. The kernel function
/// is declared in it and gets an "entry" block that only returns. On exit
/// the Builder is positioned at the beginning of that block, i.e. in front of
/// the return.
///
/// @param Kernel The kernel to generate code for.
void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel) {
  const std::string ModuleName = "kernel_" + std::to_string(Kernel->id);
  auto &Context = Builder.getContext();

  GPUModule.reset(new Module(ModuleName, Context));
  GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
  GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));

  Function *KernelFn = createKernelFunctionDecl(Kernel);
  BasicBlock *Entry = BasicBlock::Create(Context, "entry", KernelFn);

  // Terminate the block first, then step back in front of the terminator so
  // that later instructions are emitted before the return.
  Builder.SetInsertPoint(Entry);
  Builder.CreateRetVoid();
  Builder.SetInsertPoint(Entry, Entry->begin());
}
|
||||
|
||||
void GPUNodeBuilder::finalizeKernelFunction() {
|
||||
|
||||
if (DumpKernelIR)
|
||||
outs() << *GPUModule << "\n";
|
||||
|
||||
GPUModule.release();
|
||||
}
|
||||
|
||||
namespace {
|
||||
class PPCGCodeGeneration : public ScopPass {
|
||||
public:
|
||||
|
@ -693,8 +838,9 @@ public:
|
|||
|
||||
/// Generate code for a given GPU AST described by @p Root.
|
||||
///
|
||||
/// @param An isl_ast_node pointing to the root of the GPU AST.
|
||||
void generateCode(__isl_take isl_ast_node *Root) {
|
||||
/// @param Root An isl_ast_node pointing to the root of the GPU AST.
|
||||
/// @param Prog The GPU Program to generate code for.
|
||||
void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
|
||||
ScopAnnotator Annotator;
|
||||
Annotator.buildAliasScopes(*S);
|
||||
|
||||
|
@ -706,8 +852,8 @@ public:
|
|||
|
||||
PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);
|
||||
|
||||
GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT,
|
||||
*S);
|
||||
GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT, *S,
|
||||
Prog);
|
||||
|
||||
// Only build the run-time condition and parameters _after_ having
|
||||
// introduced the conditional branch. This is important as the conditional
|
||||
|
@ -741,7 +887,7 @@ public:
|
|||
auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
|
||||
|
||||
if (PPCGGen->tree)
|
||||
generateCode(isl_ast_node_copy(PPCGGen->tree));
|
||||
generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);
|
||||
|
||||
freeOptions(PPCGScop);
|
||||
freePPCGGen(PPCGGen);
|
||||
|
|
|
@ -0,0 +1,76 @@
|
|||
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
|
||||
; RUN: -disable-output < %s | \
|
||||
; RUN: FileCheck -check-prefix=KERNEL %s
|
||||
;
|
||||
; void kernel_params_only_some_arrays(float A[], float B[]) {
|
||||
; for (long i = 0; i < 32; i++)
|
||||
; A[i] += 42;
|
||||
;
|
||||
; for (long i = 0; i < 32; i++)
|
||||
; B[i] += 42;
|
||||
; }
|
||||
|
||||
; KERNEL: ; ModuleID = 'kernel_0'
|
||||
; KERNEL-NEXT: source_filename = "kernel_0"
|
||||
; KERNEL-NEXT: target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
|
||||
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
; KERNEL: define ptx_kernel void @kernel_0(i8* %MemRef_A) {
|
||||
; KERNEL-NEXT: entry:
|
||||
; KERNEL-NEXT: ret void
|
||||
; KERNEL-NEXT: }
|
||||
|
||||
; KERNEL: ; ModuleID = 'kernel_1'
|
||||
; KERNEL-NEXT: source_filename = "kernel_1"
|
||||
; KERNEL-NEXT: target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
|
||||
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
; KERNEL: define ptx_kernel void @kernel_1(i8* %MemRef_B) {
|
||||
; KERNEL-NEXT: entry:
|
||||
; KERNEL-NEXT: ret void
|
||||
; KERNEL-NEXT: }
|
||||
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
; Test input: two consecutive loops with 32 iterations each. The first loop
; adds 42.0 to every element of %A, the second does the same for %B. The
; KERNEL checks above expect one kernel per loop, each taking only the array
; it accesses.
define void @kernel_params_only_some_arrays(float* %A, float* %B) {
|
||||
entry:
|
||||
br label %for.cond
|
||||
|
||||
; First loop: i.0 runs from 0 up to (but not including) 32.
for.cond: ; preds = %for.inc, %entry
|
||||
%i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
|
||||
%exitcond1 = icmp ne i64 %i.0, 32
|
||||
br i1 %exitcond1, label %for.body, label %for.end
|
||||
|
||||
; A[i.0] += 42
for.body: ; preds = %for.cond
|
||||
%arrayidx = getelementptr inbounds float, float* %A, i64 %i.0
|
||||
%tmp = load float, float* %arrayidx, align 4
|
||||
%add = fadd float %tmp, 4.200000e+01
|
||||
store float %add, float* %arrayidx, align 4
|
||||
br label %for.inc
|
||||
|
||||
for.inc: ; preds = %for.body
|
||||
%inc = add nuw nsw i64 %i.0, 1
|
||||
br label %for.cond
|
||||
|
||||
for.end: ; preds = %for.cond
|
||||
br label %for.cond2
|
||||
|
||||
; Second loop: i1.0 runs from 0 up to (but not including) 32.
for.cond2: ; preds = %for.inc7, %for.end
|
||||
%i1.0 = phi i64 [ 0, %for.end ], [ %inc8, %for.inc7 ]
|
||||
%exitcond = icmp ne i64 %i1.0, 32
|
||||
br i1 %exitcond, label %for.body4, label %for.end9
|
||||
|
||||
; B[i1.0] += 42
for.body4: ; preds = %for.cond2
|
||||
%arrayidx5 = getelementptr inbounds float, float* %B, i64 %i1.0
|
||||
%tmp2 = load float, float* %arrayidx5, align 4
|
||||
%add6 = fadd float %tmp2, 4.200000e+01
|
||||
store float %add6, float* %arrayidx5, align 4
|
||||
br label %for.inc7
|
||||
|
||||
for.inc7: ; preds = %for.body4
|
||||
%inc8 = add nuw nsw i64 %i1.0, 1
|
||||
br label %for.cond2
|
||||
|
||||
for.end9: ; preds = %for.cond2
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue