GPGPU: Dynamically ensure 'sufficient compute'

Offloading to a GPU is only beneficial if there is a sufficient amount of
compute that can be accelerated. Many kernels perform only a very small amount
of dynamic compute, which means GPU acceleration is not beneficial. We
compute at run-time an approximation of how many dynamic instructions will be
executed and fall back to CPU code in case this number is not sufficiently
large. To keep the run-time checking code simple, we over-approximate the
number of instructions executed in each statement by computing the volume of
the rectangular hull of its iteration space.

llvm-svn: 281848
This commit is contained in:
Tobias Grosser 2016-09-18 06:50:35 +00:00
parent cfdee6582b
commit 82f2af3508
2 changed files with 132 additions and 2 deletions

View File

@ -92,6 +92,11 @@ static cl::opt<std::string>
cl::desc("The CUDA version to compile for"), cl::Hidden,
cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
// Threshold used by the run-time "sufficient compute" check. In
// createSufficientComputeCheck the option value is turned into an ast
// expression and compared (>=) against the approximated number of dynamic
// instructions of the scop, so despite the wording of the description string
// the unit is approximated dynamic instructions. The default evaluates to
// 10 * 512 * 512 = 2621440.
static cl::opt<int>
MinCompute("polly-acc-mincompute",
cl::desc("Minimal number of compute statements to run on GPU."),
cl::Hidden, cl::init(10 * 512 * 512));
/// Create the ast expressions for a ScopStmt.
///
/// This function is a callback to generate the ast expressions for each
@ -2261,6 +2266,109 @@ public:
PPCGScop->options = nullptr;
}
/// Approximate the number of points in the set.
///
/// This function returns an ast expression that overapproximates the number
/// of points in an isl set through the rectangular hull surrounding this set.
/// For every set dimension the extent (max - min + 1) is computed and the
/// extents of all dimensions are multiplied together.
///
/// @param Set   The set to count. The set is taken and freed by this function.
/// @param Build The isl ast build object to use for creating the ast
///              expression.
///
/// @returns An approximation of the number of points in the set. For a set
///          without set dimensions the loop body never runs and the result is
///          the constant one.
__isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
                                           __isl_keep isl_ast_build *Build) {
  isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);

  // The running product starts at the constant one.
  auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));

  // Build the constant '1' as a piecewise affine expression over the
  // parameter space of Set, so it can be added to each dimension's extent.
  isl_space *Space = isl_set_get_space(Set);
  Space = isl_space_params(Space);
  auto *Univ = isl_set_universe(Space);
  isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);

  // The number of set dimensions does not change inside the loop. Query it
  // once and keep it in a signed variable, which avoids both the repeated call
  // and a comparison between a signed index and the value returned by
  // isl_set_dim.
  for (long i = 0, n = isl_set_dim(Set, isl_dim_set); i < n; i++) {
    isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
    isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);

    // Extent of dimension i in the rectangular hull: max - min + 1.
    isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
    DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));

    auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
    Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
  }

  isl_set_free(Set);
  isl_pw_aff_free(OneAff);

  return Expr;
}
/// Approximate the number of dynamic instructions a statement executes.
///
/// The estimate is the static instruction count of the statement (of its
/// basic block, or summed over all blocks of its region) multiplied by the
/// approximated number of points in the statement's domain.
///
/// @param Stmt  The statement for which to compute the number of dynamic
///              instructions.
/// @param Build The isl ast build object to use for creating the ast
///              expression.
/// @returns An approximation of the number of dynamic instructions executed
///          by @p Stmt.
__isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
                                           __isl_keep isl_ast_build *Build) {
  isl_ast_expr *DomainPoints = approxPointsInSet(Stmt.getDomain(), Build);

  // Static instruction count of the statement.
  long NumInsts = 0;
  if (!Stmt.isBlockStmt()) {
    // Region statement: accumulate over every block of the region.
    for (auto *Block : Stmt.getRegion()->blocks())
      NumInsts += std::distance(Block->begin(), Block->end());
  } else {
    auto *Block = Stmt.getBasicBlock();
    NumInsts = std::distance(Block->begin(), Block->end());
  }

  isl_ast_expr *NumInstsExpr =
      isl_ast_expr_from_val(isl_val_int_from_si(S->getIslCtx(), NumInsts));

  return isl_ast_expr_mul(NumInstsExpr, DomainPoints);
}
/// Approximate the dynamic instructions executed in a scop.
///
/// Sums the per-statement estimates obtained from approxDynamicInst over all
/// statements of the scop, starting from the constant zero.
///
/// @param S     The scop for which to approximate dynamic instructions.
/// @param Build The isl ast build object to use for creating the ast
///              expression.
/// @returns An approximation of the number of dynamic instructions executed
///          in @p S.
__isl_give isl_ast_expr *
getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
  isl_ast_expr *Total =
      isl_ast_expr_from_val(isl_val_int_from_si(S.getIslCtx(), 0));

  for (ScopStmt &Stmt : S)
    Total = isl_ast_expr_add(Total, approxDynamicInst(Stmt, Build));

  return Total;
}
/// Create a check that ensures sufficient compute in a scop.
///
/// The check compares the approximated number of dynamic instructions of the
/// scop against the value of the MinCompute option.
///
/// @param S     The scop for which to ensure sufficient compute.
/// @param Build The isl ast build object to use for creating the ast
///              expression.
/// @returns An expression that evaluates to TRUE in case of sufficient
///          compute and to FALSE, otherwise.
__isl_give isl_ast_expr *
createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
  isl_ast_expr *Estimate = getNumberOfIterations(S, Build);

  // Threshold taken from the MinCompute command line option.
  isl_ast_expr *Threshold =
      isl_ast_expr_from_val(isl_val_int_from_si(S.getIslCtx(), MinCompute));

  return isl_ast_expr_ge(Estimate, Threshold);
}
/// Generate code for a given GPU AST described by @p Root.
///
/// @param Root An isl_ast_node pointing to the root of the GPU AST.
@ -2296,6 +2404,8 @@ public:
isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
isl_ast_expr *Condition = IslAst::buildRunCondition(S, Build);
isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
Condition = isl_ast_expr_and(Condition, SufficientCompute);
isl_ast_build_free(Build);
Value *RTC = NodeBuilder.createRTC(Condition);

View File

@ -89,7 +89,27 @@
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
; IR: polly.split_new_and_old:
; IR-NEXT: br i1 true, label %polly.start, label %bb2
; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1
; IR-NEXT: %polly.overflow.state = or i1 false, %.obit
; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0
; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1
; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0
; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1
; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0
; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1
; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0
; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440
; IR-NEXT: %5 = and i1 true, %4
; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown
; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2
; IR: polly.start:
; IR-NEXT: br label %polly.acc.initialize
@ -105,7 +125,7 @@
; IR-NEXT: [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
; IR-NEXT: store i8* [[ParamTyped]], i8** [[ParamSlot]]
; IR-NEXT: call i8* @polly_getKernel
; IR-NEXT: call void @polly_launchKernel(i8* %5, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
; IR-NEXT: call void @polly_launchKernel(i8* %11, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
; IR-NEXT: call void @polly_freeKernel
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)