forked from OSchip/llvm-project
GPGPU: Dynamically ensure 'sufficient compute'
Offloading to a GPU is only beneficial if there is a sufficient amount of compute that can be accelerated. Many kernels have only a very small amount of dynamic compute, which means GPU acceleration is not beneficial. At run time, we compute an approximation of how many dynamic instructions will be executed and fall back to CPU code if this number is not sufficiently large. To keep the run-time checking code simple, we over-approximate the number of instructions executed in each statement by computing the volume of the rectangular hull of its iteration space. llvm-svn: 281848
This commit is contained in:
parent
cfdee6582b
commit
82f2af3508
|
@ -92,6 +92,11 @@ static cl::opt<std::string>
|
|||
cl::desc("The CUDA version to compile for"), cl::Hidden,
|
||||
cl::init("sm_30"), cl::ZeroOrMore, cl::cat(PollyCategory));
|
||||
|
||||
// Threshold used by the run-time "sufficient compute" check: the GPU code
// path is only taken when the approximated number of dynamic instructions is
// greater than or equal to this value. Registered like the neighbouring
// options (repeatable on the command line, listed in the Polly category).
static cl::opt<int>
    MinCompute("polly-acc-mincompute",
               cl::desc("Minimal number of compute statements to run on GPU."),
               cl::Hidden, cl::init(10 * 512 * 512), cl::ZeroOrMore,
               cl::cat(PollyCategory));
|
||||
|
||||
/// Create the ast expressions for a ScopStmt.
|
||||
///
|
||||
/// This function is a callback to generate the ast expressions for each
|
||||
|
@ -2261,6 +2266,109 @@ public:
|
|||
PPCGScop->options = nullptr;
|
||||
}
|
||||
|
||||
/// Approximate the number of points in the set.
|
||||
///
|
||||
/// This function returns an ast expression that overapproximates the number
|
||||
/// of points in an isl set through the rectangular hull surrounding this set.
|
||||
///
|
||||
/// @param Set The set to count.
|
||||
/// @param Build The isl ast build object to use for creating the ast
|
||||
/// expression.
|
||||
///
|
||||
/// @returns An approximation of the number of points in the set.
|
||||
__isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
|
||||
__isl_keep isl_ast_build *Build) {
|
||||
|
||||
isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
|
||||
auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));
|
||||
|
||||
isl_space *Space = isl_set_get_space(Set);
|
||||
Space = isl_space_params(Space);
|
||||
auto *Univ = isl_set_universe(Space);
|
||||
isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);
|
||||
|
||||
for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) {
|
||||
isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
|
||||
isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
|
||||
isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
|
||||
DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
|
||||
auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
|
||||
Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
|
||||
}
|
||||
|
||||
isl_set_free(Set);
|
||||
isl_pw_aff_free(OneAff);
|
||||
|
||||
return Expr;
|
||||
}
|
||||
|
||||
/// Approximate a number of dynamic instructions executed by a given
|
||||
/// statement.
|
||||
///
|
||||
/// @param Stmt The statement for which to compute the number of dynamic
|
||||
/// instructions.
|
||||
/// @param Build The isl ast build object to use for creating the ast
|
||||
/// expression.
|
||||
/// @returns An approximation of the number of dynamic instructions executed
|
||||
/// by @p Stmt.
|
||||
__isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
|
||||
__isl_keep isl_ast_build *Build) {
|
||||
auto Iterations = approxPointsInSet(Stmt.getDomain(), Build);
|
||||
|
||||
long InstCount = 0;
|
||||
|
||||
if (Stmt.isBlockStmt()) {
|
||||
auto *BB = Stmt.getBasicBlock();
|
||||
InstCount = std::distance(BB->begin(), BB->end());
|
||||
} else {
|
||||
auto *R = Stmt.getRegion();
|
||||
|
||||
for (auto *BB : R->blocks()) {
|
||||
InstCount += std::distance(BB->begin(), BB->end());
|
||||
}
|
||||
}
|
||||
|
||||
isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount);
|
||||
auto *InstExpr = isl_ast_expr_from_val(InstVal);
|
||||
return isl_ast_expr_mul(InstExpr, Iterations);
|
||||
}
|
||||
|
||||
/// Approximate dynamic instructions executed in scop.
|
||||
///
|
||||
/// @param S The scop for which to approximate dynamic instructions.
|
||||
/// @param Build The isl ast build object to use for creating the ast
|
||||
/// expression.
|
||||
/// @returns An approximation of the number of dynamic instructions executed
|
||||
/// in @p S.
|
||||
__isl_give isl_ast_expr *
|
||||
getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
|
||||
isl_ast_expr *Instructions;
|
||||
|
||||
isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0);
|
||||
Instructions = isl_ast_expr_from_val(Zero);
|
||||
|
||||
for (ScopStmt &Stmt : S) {
|
||||
isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
|
||||
Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
|
||||
}
|
||||
return Instructions;
|
||||
}
|
||||
|
||||
/// Create a check that ensures sufficient compute in scop.
|
||||
///
|
||||
/// @param S The scop for which to ensure sufficient compute.
|
||||
/// @param Build The isl ast build object to use for creating the ast
|
||||
/// expression.
|
||||
/// @returns An expression that evaluates to TRUE in case of sufficient
|
||||
/// compute and to FALSE, otherwise.
|
||||
__isl_give isl_ast_expr *
|
||||
createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
|
||||
auto Iterations = getNumberOfIterations(S, Build);
|
||||
auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute);
|
||||
auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
|
||||
return isl_ast_expr_ge(Iterations, MinComputeExpr);
|
||||
}
|
||||
|
||||
/// Generate code for a given GPU AST described by @p Root.
|
||||
///
|
||||
/// @param Root An isl_ast_node pointing to the root of the GPU AST.
|
||||
|
@ -2296,6 +2404,8 @@ public:
|
|||
|
||||
isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
|
||||
isl_ast_expr *Condition = IslAst::buildRunCondition(S, Build);
|
||||
isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
|
||||
Condition = isl_ast_expr_and(Condition, SufficientCompute);
|
||||
isl_ast_build_free(Build);
|
||||
|
||||
Value *RTC = NodeBuilder.createRTC(Condition);
|
||||
|
|
|
@ -89,7 +89,27 @@
|
|||
; CODE-NEXT: Stmt_bb5(32 * b0 + t0, 32 * b1 + t1 + 16 * c3);
|
||||
|
||||
; IR: polly.split_new_and_old:
|
||||
; IR-NEXT: br i1 true, label %polly.start, label %bb2
|
||||
; IR-NEXT: %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 1024)
|
||||
; IR-NEXT: %.obit = extractvalue { i64, i1 } %0, 1
|
||||
; IR-NEXT: %polly.overflow.state = or i1 false, %.obit
|
||||
; IR-NEXT: %.res = extractvalue { i64, i1 } %0, 0
|
||||
; IR-NEXT: %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %.res, i64 1024)
|
||||
; IR-NEXT: %.obit1 = extractvalue { i64, i1 } %1, 1
|
||||
; IR-NEXT: %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
|
||||
; IR-NEXT: %.res3 = extractvalue { i64, i1 } %1, 0
|
||||
; IR-NEXT: %2 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 7, i64 %.res3)
|
||||
; IR-NEXT: %.obit4 = extractvalue { i64, i1 } %2, 1
|
||||
; IR-NEXT: %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
|
||||
; IR-NEXT: %.res6 = extractvalue { i64, i1 } %2, 0
|
||||
; IR-NEXT: %3 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res6)
|
||||
; IR-NEXT: %.obit7 = extractvalue { i64, i1 } %3, 1
|
||||
; IR-NEXT: %polly.overflow.state8 = or i1 %polly.overflow.state5, %.obit7
|
||||
; IR-NEXT: %.res9 = extractvalue { i64, i1 } %3, 0
|
||||
; IR-NEXT: %4 = icmp sge i64 %.res9, 2621440
|
||||
; IR-NEXT: %5 = and i1 true, %4
|
||||
; IR-NEXT: %polly.rtc.overflown = xor i1 %polly.overflow.state8, true
|
||||
; IR-NEXT: %polly.rtc.result = and i1 %5, %polly.rtc.overflown
|
||||
; IR-NEXT: br i1 %polly.rtc.result, label %polly.start, label %bb2
|
||||
|
||||
; IR: polly.start:
|
||||
; IR-NEXT: br label %polly.acc.initialize
|
||||
|
@ -105,7 +125,7 @@
|
|||
; IR-NEXT: [[ParamTyped:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
|
||||
; IR-NEXT: store i8* [[ParamTyped]], i8** [[ParamSlot]]
|
||||
; IR-NEXT: call i8* @polly_getKernel
|
||||
; IR-NEXT: call void @polly_launchKernel(i8* %5, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
|
||||
; IR-NEXT: call void @polly_launchKernel(i8* %11, i32 32, i32 32, i32 32, i32 16, i32 1, i8* %polly_launch_0_params_i8ptr)
|
||||
; IR-NEXT: call void @polly_freeKernel
|
||||
; IR-NEXT: [[HostPtr2:%.*]] = bitcast [1024 x float]* %A to i8*
|
||||
; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_A, i8* [[HostPtr2]], i64 4194304)
|
||||
|
|
Loading…
Reference in New Issue