forked from OSchip/llvm-project
GPGPU: Create host control flow
Create LLVM-IR for all host-side control flow of a given GPU AST. We implement this by introducing a new GPUNodeBuilder class derived from IslNodeBuilder. The IslNodeBuilder will take care of generating all general-purpose ast nodes, but we provide our own createUser implementation to handle the different GPU specific user statements. For now, we just skip any user statement and only generate a host-code sceleton, but in subsequent commits we will add handling of normal ScopStmt's performing computations, kernel calls, as well as host-device data transfers. We will also introduce run-time check generation and LICM in subsequent commits. llvm-svn: 275783
This commit is contained in:
parent
52bd8012bd
commit
38fc0aed08
|
@ -13,6 +13,7 @@
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
#include "polly/CodeGen/IslNodeBuilder.h"
|
#include "polly/CodeGen/IslNodeBuilder.h"
|
||||||
|
#include "polly/CodeGen/Utils.h"
|
||||||
#include "polly/DependenceInfo.h"
|
#include "polly/DependenceInfo.h"
|
||||||
#include "polly/LinkAllPasses.h"
|
#include "polly/LinkAllPasses.h"
|
||||||
#include "polly/Options.h"
|
#include "polly/Options.h"
|
||||||
|
@ -68,6 +69,35 @@ static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Generate code for a GPU specific isl AST.
|
||||||
|
///
|
||||||
|
/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
|
||||||
|
/// generates code for general-prupose AST nodes, with special functionality
|
||||||
|
/// for generating GPU specific user nodes.
|
||||||
|
///
|
||||||
|
/// @see GPUNodeBuilder::createUser
|
||||||
|
class GPUNodeBuilder : public IslNodeBuilder {
|
||||||
|
public:
|
||||||
|
GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
|
||||||
|
const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
|
||||||
|
DominatorTree &DT, Scop &S)
|
||||||
|
: IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S) {}
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// Create code for user-defined AST nodes.
|
||||||
|
///
|
||||||
|
/// These AST nodes can be of type:
|
||||||
|
///
|
||||||
|
/// - ScopStmt: A computational statement (TODO)
|
||||||
|
/// - Kernel: A GPU kernel call (TODO)
|
||||||
|
/// - Data-Transfer: A GPU <-> CPU data-transfer (TODO)
|
||||||
|
///
|
||||||
|
virtual void createUser(__isl_take isl_ast_node *User) {
|
||||||
|
isl_ast_node_free(User);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
class PPCGCodeGeneration : public ScopPass {
|
class PPCGCodeGeneration : public ScopPass {
|
||||||
public:
|
public:
|
||||||
|
@ -76,6 +106,12 @@ public:
|
||||||
/// The scop that is currently processed.
|
/// The scop that is currently processed.
|
||||||
Scop *S;
|
Scop *S;
|
||||||
|
|
||||||
|
LoopInfo *LI;
|
||||||
|
DominatorTree *DT;
|
||||||
|
ScalarEvolution *SE;
|
||||||
|
const DataLayout *DL;
|
||||||
|
RegionInfo *RI;
|
||||||
|
|
||||||
PPCGCodeGeneration() : ScopPass(ID) {}
|
PPCGCodeGeneration() : ScopPass(ID) {}
|
||||||
|
|
||||||
/// Construct compilation options for PPCG.
|
/// Construct compilation options for PPCG.
|
||||||
|
@ -650,12 +686,58 @@ public:
|
||||||
PPCGScop->options = nullptr;
|
PPCGScop->options = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Generate code for a given GPU AST described by @p Root.
|
||||||
|
///
|
||||||
|
/// @param An isl_ast_node pointing to the root of the GPU AST.
|
||||||
|
void generateCode(__isl_take isl_ast_node *Root) {
|
||||||
|
ScopAnnotator Annotator;
|
||||||
|
Annotator.buildAliasScopes(*S);
|
||||||
|
|
||||||
|
Region *R = &S->getRegion();
|
||||||
|
|
||||||
|
simplifyRegion(R, DT, LI, RI);
|
||||||
|
|
||||||
|
BasicBlock *EnteringBB = R->getEnteringBlock();
|
||||||
|
|
||||||
|
PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);
|
||||||
|
|
||||||
|
GPUNodeBuilder NodeBuilder(Builder, Annotator, this, *DL, *LI, *SE, *DT,
|
||||||
|
*S);
|
||||||
|
|
||||||
|
// Only build the run-time condition and parameters _after_ having
|
||||||
|
// introduced the conditional branch. This is important as the conditional
|
||||||
|
// branch will guard the original scop from new induction variables that
|
||||||
|
// the SCEVExpander may introduce while code generating the parameters and
|
||||||
|
// which may introduce scalar dependences that prevent us from correctly
|
||||||
|
// code generating this scop.
|
||||||
|
BasicBlock *StartBlock =
|
||||||
|
executeScopConditionally(*S, this, Builder.getTrue());
|
||||||
|
|
||||||
|
// TODO: Handle LICM
|
||||||
|
// TODO: Verify run-time checks
|
||||||
|
auto SplitBlock = StartBlock->getSinglePredecessor();
|
||||||
|
Builder.SetInsertPoint(SplitBlock->getTerminator());
|
||||||
|
NodeBuilder.addParameters(S->getContext());
|
||||||
|
Builder.SetInsertPoint(&*StartBlock->begin());
|
||||||
|
NodeBuilder.create(Root);
|
||||||
|
NodeBuilder.finalizeSCoP(*S);
|
||||||
|
}
|
||||||
|
|
||||||
bool runOnScop(Scop &CurrentScop) override {
|
bool runOnScop(Scop &CurrentScop) override {
|
||||||
S = &CurrentScop;
|
S = &CurrentScop;
|
||||||
|
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
||||||
|
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||||
|
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
|
||||||
|
DL = &S->getRegion().getEntry()->getParent()->getParent()->getDataLayout();
|
||||||
|
RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
|
||||||
|
|
||||||
auto PPCGScop = createPPCGScop();
|
auto PPCGScop = createPPCGScop();
|
||||||
auto PPCGProg = createPPCGProg(PPCGScop);
|
auto PPCGProg = createPPCGProg(PPCGScop);
|
||||||
auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
|
auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
|
||||||
|
|
||||||
|
if (PPCGGen->tree)
|
||||||
|
generateCode(isl_ast_node_copy(PPCGGen->tree));
|
||||||
|
|
||||||
freeOptions(PPCGScop);
|
freeOptions(PPCGScop);
|
||||||
freePPCGGen(PPCGGen);
|
freePPCGGen(PPCGGen);
|
||||||
gpu_prog_free(PPCGProg);
|
gpu_prog_free(PPCGProg);
|
||||||
|
|
|
@ -7,6 +7,9 @@
|
||||||
; RUN: -disable-output < %s | \
|
; RUN: -disable-output < %s | \
|
||||||
; RUN: FileCheck -check-prefix=CODE %s
|
; RUN: FileCheck -check-prefix=CODE %s
|
||||||
|
|
||||||
|
; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
|
||||||
|
; RUN: FileCheck %s -check-prefix=IR
|
||||||
|
|
||||||
; REQUIRES: pollyacc
|
; REQUIRES: pollyacc
|
||||||
|
|
||||||
; CHECK: Stmt_bb5
|
; CHECK: Stmt_bb5
|
||||||
|
@ -77,7 +80,14 @@
|
||||||
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
|
; CODE-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1)
|
||||||
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
|
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
|
||||||
|
|
||||||
|
; IR: polly.split_new_and_old:
|
||||||
|
; IR-NEXT: br i1 true, label %polly.start, label %bb2
|
||||||
|
|
||||||
|
; IR: polly.start:
|
||||||
|
; IR-NEXT: br label %polly.exiting
|
||||||
|
|
||||||
|
; IR: polly.exiting:
|
||||||
|
; IR-NEXT: br label %polly.merge_new_and_old
|
||||||
|
|
||||||
; void double_parallel_loop(float A[][1024]) {
|
; void double_parallel_loop(float A[][1024]) {
|
||||||
; for (long i = 0; i < 1024; i++)
|
; for (long i = 0; i < 1024; i++)
|
||||||
|
|
|
@ -0,0 +1,86 @@
|
||||||
|
; RUN: opt %loadPolly -polly-codegen-ppcg -disable-output \
|
||||||
|
; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE
|
||||||
|
|
||||||
|
; RUN: opt %loadPolly -polly-codegen-ppcg \
|
||||||
|
; RUN: -S < %s | FileCheck %s -check-prefix=IR
|
||||||
|
; void foo(float A[2][100]) {
|
||||||
|
; for (long t = 0; t < 100; t++)
|
||||||
|
; for (long i = 1; i < 99; i++)
|
||||||
|
; A[(t + 1) % 2][i] += A[t % 2][i - 1] + A[t % 2][i] + A[t % 2][i + 1];
|
||||||
|
; }
|
||||||
|
|
||||||
|
; CODE: # host
|
||||||
|
; CODE-NEXT: {
|
||||||
|
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
|
||||||
|
; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
|
||||||
|
; CODE-NEXT: {
|
||||||
|
; CODE-NEXT: dim3 k0_dimBlock(32);
|
||||||
|
; CODE-NEXT: dim3 k0_dimGrid(4);
|
||||||
|
; CODE-NEXT: kernel0 <<<k0_dimGrid, k0_dimBlock>>> (c0);
|
||||||
|
; CODE-NEXT: cudaCheckKernel();
|
||||||
|
; CODE-NEXT: }
|
||||||
|
|
||||||
|
; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
|
||||||
|
; CODE-NEXT: }
|
||||||
|
|
||||||
|
; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
|
||||||
|
; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
|
||||||
|
; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
|
||||||
|
; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98
|
||||||
|
; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
|
||||||
|
|
||||||
|
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||||
|
|
||||||
|
define void @foo([100 x float]* %A) {
|
||||||
|
entry:
|
||||||
|
br label %for.cond
|
||||||
|
|
||||||
|
for.cond: ; preds = %for.inc18, %entry
|
||||||
|
%t.0 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]
|
||||||
|
%exitcond1 = icmp ne i64 %t.0, 100
|
||||||
|
br i1 %exitcond1, label %for.body, label %for.end20
|
||||||
|
|
||||||
|
for.body: ; preds = %for.cond
|
||||||
|
br label %for.cond1
|
||||||
|
|
||||||
|
for.cond1: ; preds = %for.inc, %for.body
|
||||||
|
%i.0 = phi i64 [ 1, %for.body ], [ %inc, %for.inc ]
|
||||||
|
%exitcond = icmp ne i64 %i.0, 99
|
||||||
|
br i1 %exitcond, label %for.body3, label %for.end
|
||||||
|
|
||||||
|
for.body3: ; preds = %for.cond1
|
||||||
|
%sub = add nsw i64 %i.0, -1
|
||||||
|
%rem = srem i64 %t.0, 2
|
||||||
|
%arrayidx4 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem, i64 %sub
|
||||||
|
%tmp = load float, float* %arrayidx4, align 4
|
||||||
|
%rem5 = srem i64 %t.0, 2
|
||||||
|
%arrayidx7 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem5, i64 %i.0
|
||||||
|
%tmp2 = load float, float* %arrayidx7, align 4
|
||||||
|
%add = fadd float %tmp, %tmp2
|
||||||
|
%add8 = add nuw nsw i64 %i.0, 1
|
||||||
|
%rem9 = srem i64 %t.0, 2
|
||||||
|
%arrayidx11 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem9, i64 %add8
|
||||||
|
%tmp3 = load float, float* %arrayidx11, align 4
|
||||||
|
%add12 = fadd float %add, %tmp3
|
||||||
|
%add13 = add nuw nsw i64 %t.0, 1
|
||||||
|
%rem14 = srem i64 %add13, 2
|
||||||
|
%arrayidx16 = getelementptr inbounds [100 x float], [100 x float]* %A, i64 %rem14, i64 %i.0
|
||||||
|
%tmp4 = load float, float* %arrayidx16, align 4
|
||||||
|
%add17 = fadd float %tmp4, %add12
|
||||||
|
store float %add17, float* %arrayidx16, align 4
|
||||||
|
br label %for.inc
|
||||||
|
|
||||||
|
for.inc: ; preds = %for.body3
|
||||||
|
%inc = add nuw nsw i64 %i.0, 1
|
||||||
|
br label %for.cond1
|
||||||
|
|
||||||
|
for.end: ; preds = %for.cond1
|
||||||
|
br label %for.inc18
|
||||||
|
|
||||||
|
for.inc18: ; preds = %for.end
|
||||||
|
%inc19 = add nuw nsw i64 %t.0, 1
|
||||||
|
br label %for.cond
|
||||||
|
|
||||||
|
for.end20: ; preds = %for.cond
|
||||||
|
ret void
|
||||||
|
}
|
Loading…
Reference in New Issue