[OPENMP]Add call to __kmpc_push_target_tripcount() function.

Each we create the target regions with the teams distribute inner
region, we can better estimate number of the teams required to execute
the target region. Function __kmpc_push_target_tripcount() is used for
purpose, which accepts device_id and the number of the iterations,
performed by the associated loop.

llvm-svn: 350571
This commit is contained in:
Alexey Bataev 2019-01-07 21:30:43 +00:00
parent 347b0804bc
commit 7bb3353f6a
12 changed files with 247 additions and 4 deletions

View File

@ -673,6 +673,9 @@ enum OpenMPRTLFunction {
//
// Offloading related calls
//
// Call to void __kmpc_push_target_tripcount(int64_t device_id, kmp_uint64
// size);
OMPRTL__kmpc_push_target_tripcount,
// Call to int32_t __tgt_target(int64_t device_id, void *host_ptr, int32_t
// arg_num, void** args_base, void **args, size_t *arg_sizes, int64_t
// *arg_types);
@ -2163,6 +2166,15 @@ CGOpenMPRuntime::createRuntimeFunction(unsigned Function) {
FnTy, /*Name=*/"__kmpc_task_reduction_get_th_data");
break;
}
case OMPRTL__kmpc_push_target_tripcount: {
// Build void __kmpc_push_target_tripcount(int64_t device_id, kmp_uint64
// size);
llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int64Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_push_target_tripcount");
break;
}
case OMPRTL__tgt_target: {
// Build int32_t __tgt_target(int64_t device_id, void *host_ptr, int32_t
// arg_num, void** args_base, void **args, size_t *arg_sizes, int64_t
@ -8053,6 +8065,183 @@ static void emitOffloadingArraysArgument(
}
}
/// Checks if the expression is constant or does not have non-trivial function
/// calls.
static bool isTrivial(ASTContext &Ctx, const Expr * E) {
// We can skip constant expressions.
// We can skip expressions with trivial calls or simple expressions.
return (E->isEvaluatable(Ctx, Expr::SE_AllowUndefinedBehavior) ||
!E->hasNonTrivialCall(Ctx)) &&
!E->HasSideEffects(Ctx, /*IncludePossibleEffects=*/true);
}
/// Checks if the \p Body is the \a CompoundStmt and returns its child statement
/// iff there is only one that is not evaluatable at the compile time.
static const Stmt *getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body) {
if (const auto *C = dyn_cast<CompoundStmt>(Body)) {
const Stmt *Child = nullptr;
for (const Stmt *S : C->body()) {
if (const auto *E = dyn_cast<Expr>(S)) {
if (isTrivial(Ctx, E))
continue;
}
// Some of the statements can be ignored.
if (isa<AsmStmt>(S) || isa<NullStmt>(S) || isa<OMPFlushDirective>(S) ||
isa<OMPBarrierDirective>(S) || isa<OMPTaskyieldDirective>(S))
continue;
// Analyze declarations.
if (const auto *DS = dyn_cast<DeclStmt>(S)) {
if (llvm::all_of(DS->decls(), [&Ctx](const Decl *D) {
if (isa<EmptyDecl>(D) || isa<DeclContext>(D) ||
isa<TypeDecl>(D) || isa<PragmaCommentDecl>(D) ||
isa<PragmaDetectMismatchDecl>(D) || isa<UsingDecl>(D) ||
isa<UsingDirectiveDecl>(D) ||
isa<OMPDeclareReductionDecl>(D) ||
isa<OMPThreadPrivateDecl>(D))
return true;
const auto *VD = dyn_cast<VarDecl>(D);
if (!VD)
return false;
return VD->isConstexpr() ||
((VD->getType().isTrivialType(Ctx) ||
VD->getType()->isReferenceType()) &&
(!VD->hasInit() || isTrivial(Ctx, VD->getInit())));
}))
continue;
}
// Found multiple children - cannot get the one child only.
if (Child)
return Body;
Child = S;
}
if (Child)
return Child;
}
return Body;
}
/// Check for inner distribute directive.
static const OMPExecutableDirective *
getNestedDistributeDirective(ASTContext &Ctx, const OMPExecutableDirective &D) {
const auto *CS = D.getInnermostCapturedStmt();
const auto *Body =
CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);
if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
switch (D.getDirectiveKind()) {
case OMPD_target:
if (isOpenMPDistributeDirective(DKind))
return NestedDir;
if (DKind == OMPD_teams) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
/*IgnoreCaptured=*/true);
if (!Body)
return nullptr;
ChildStmt = getSingleCompoundChild(Ctx, Body);
if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
DKind = NND->getDirectiveKind();
if (isOpenMPDistributeDirective(DKind))
return NND;
}
}
return nullptr;
case OMPD_target_teams:
if (isOpenMPDistributeDirective(DKind))
return NestedDir;
return nullptr;
case OMPD_target_parallel:
case OMPD_target_simd:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
return nullptr;
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
case OMPD_parallel_sections:
case OMPD_for_simd:
case OMPD_parallel_for_simd:
case OMPD_cancel:
case OMPD_cancellation_point:
case OMPD_ordered:
case OMPD_threadprivate:
case OMPD_task:
case OMPD_simd:
case OMPD_sections:
case OMPD_section:
case OMPD_single:
case OMPD_master:
case OMPD_critical:
case OMPD_taskyield:
case OMPD_barrier:
case OMPD_taskwait:
case OMPD_taskgroup:
case OMPD_atomic:
case OMPD_flush:
case OMPD_teams:
case OMPD_target_data:
case OMPD_target_exit_data:
case OMPD_target_enter_data:
case OMPD_distribute:
case OMPD_distribute_simd:
case OMPD_distribute_parallel_for:
case OMPD_distribute_parallel_for_simd:
case OMPD_teams_distribute:
case OMPD_teams_distribute_simd:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
case OMPD_target_update:
case OMPD_declare_simd:
case OMPD_declare_target:
case OMPD_end_declare_target:
case OMPD_declare_reduction:
case OMPD_taskloop:
case OMPD_taskloop_simd:
case OMPD_requires:
case OMPD_unknown:
llvm_unreachable("Unexpected directive.");
}
}
return nullptr;
}
void CGOpenMPRuntime::emitTargetNumIterationsCall(
CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *Device,
const llvm::function_ref<llvm::Value *(
CodeGenFunction &CGF, const OMPLoopDirective &D)> &SizeEmitter) {
OpenMPDirectiveKind Kind = D.getDirectiveKind();
const OMPExecutableDirective *TD = &D;
// Get nested teams distribute kind directive, if any.
if (!isOpenMPDistributeDirective(Kind) || !isOpenMPTeamsDirective(Kind))
TD = getNestedDistributeDirective(CGM.getContext(), D);
if (!TD)
return;
const auto *LD = cast<OMPLoopDirective>(TD);
auto &&CodeGen = [LD, &Device, &SizeEmitter, this](CodeGenFunction &CGF,
PrePostActionTy &) {
llvm::Value *NumIterations = SizeEmitter(CGF, *LD);
// Emit device ID if any.
llvm::Value *DeviceID;
if (Device)
DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device),
CGF.Int64Ty, /*isSigned=*/true);
else
DeviceID = CGF.Builder.getInt64(OMP_DEVICEID_UNDEF);
llvm::Value *Args[] = {DeviceID, NumIterations};
CGF.EmitRuntimeCall(
createRuntimeFunction(OMPRTL__kmpc_push_target_tripcount), Args);
};
emitInlinedDirective(CGF, OMPD_unknown, CodeGen);
}
void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
llvm::Value *OutlinedFn,

View File

@ -1367,6 +1367,15 @@ public:
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen);
/// Emit code that pushes the trip count of loops associated with constructs
/// 'target teams distribute' and 'teams distribute parallel for'.
/// \param SizeEmitter Emits the int64 value for the number of iterations of
/// the associated loop.
virtual void emitTargetNumIterationsCall(
CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *Device,
const llvm::function_ref<llvm::Value *(
CodeGenFunction &CGF, const OMPLoopDirective &D)> &SizeEmitter);
/// Emit the target offloading code associated with \a D. The emitted
/// code attempts offloading the execution to the device, an the event of
/// a failure it executes the host version outlined in \a OutlinedFn.

View File

@ -705,8 +705,8 @@ getDataSharingMode(CodeGenModule &CGM) {
: CGOpenMPRuntimeNVPTX::Generic;
}
// Checks if the expression is constant or does not have non-trivial function
// calls.
/// Checks if the expression is constant or does not have non-trivial function
/// calls.
static bool isTrivial(ASTContext &Ctx, const Expr * E) {
// We can skip constant expressions.
// We can skip expressions with trivial calls or simple expressions.

View File

@ -4071,6 +4071,16 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF,
CGM.getOpenMPRuntime().emitTargetOutlinedFunction(S, ParentName, Fn, FnID,
IsOffloadEntry, CodeGen);
OMPLexicalScope Scope(CGF, S, OMPD_task);
auto &&SizeEmitter = [](CodeGenFunction &CGF, const OMPLoopDirective &D) {
OMPLoopScope(CGF, D);
// Emit calculation of the iterations count.
llvm::Value *NumIterations = CGF.EmitScalarExpr(D.getNumIterations());
NumIterations = CGF.Builder.CreateIntCast(NumIterations, CGF.Int64Ty,
/*IsSigned=*/false);
return NumIterations;
};
CGM.getOpenMPRuntime().emitTargetNumIterationsCall(CGF, S, Device,
SizeEmitter);
CGM.getOpenMPRuntime().emitTargetCall(CGF, S, Fn, FnID, IfCond, Device);
}

View File

@ -51,9 +51,13 @@ int target_teams_fun(int *g){
// discard capture expressions for te and th
// HCK1: = alloca i32,
// HCK1: = alloca i32,
// HCK1: = alloca i32,
// HCK1: = alloca i32,
// HCK1: = alloca i32,
// HCK1: [[N_CAST:%.+]] = alloca i{{32|64}},
// HCK1: [[TE_CAST:%.+]] = alloca i{{32|64}},
// HCK1: [[TH_CAST:%.+]] = alloca i{{32|64}},
// HCK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}})
// HCK1: [[N_PAR:%.+]] = load{{.+}}, {{.+}} [[N_CAST]],
// HCK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
// HCK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],

View File

@ -22,8 +22,10 @@ int Arg;
// CHECK-LABEL: define {{.*}}void @{{.+}}gtid_test
void gtid_test() {
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
#pragma omp target teams distribute parallel for
@ -78,9 +80,12 @@ int tmain(T Arg) {
// CHECK-LABEL: define {{.*}}i{{[0-9]+}} @main()
int main() {
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
// CHECK: call void [[OFFLOADING_FUN_2:@.+]](
// CHECK: = call {{.*}}i{{.+}} @{{.+}}tmain

View File

@ -52,10 +52,14 @@ int target_teams_fun(int *g){
// discard capture expressions for te and th
// HCK1: = alloca i32,
// HCK1: = alloca i32,
// HCK1: = alloca i32,
// HCK1: = alloca i32,
// HCK1: = alloca i32,
// HCK1: [[I_CAST:%.+]] = alloca i{{32|64}},
// HCK1: [[N_CAST:%.+]] = alloca i{{32|64}},
// HCK1: [[TE_CAST:%.+]] = alloca i{{32|64}},
// HCK1: [[TH_CAST:%.+]] = alloca i{{32|64}},
// HCK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}})
// HCK1: [[I_PAR:%.+]] = load{{.+}}, {{.+}} [[I_CAST]],
// HCK1: [[N_PAR:%.+]] = load{{.+}}, {{.+}} [[N_CAST]],
// HCK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
@ -73,7 +77,8 @@ int target_teams_fun(int *g){
// HCK1: call void @[[OFFL2:.+]](i{{64|32}} %{{.+}})
{{{
#pragma omp target teams distribute parallel for simd is_device_ptr(g) simdlen(8)
for(int i = 0; i < n; i++) {
for(
int i = 0; i < n; i++) {
a[i] = g[0];
}
}}}

View File

@ -22,8 +22,10 @@ int Arg;
// CHECK-LABEL: define {{.*}}void @{{.+}}gtid_test
void gtid_test() {
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
#pragma omp target teams distribute parallel for simd
@ -78,9 +80,12 @@ int tmain(T Arg) {
// CHECK-LABEL: define {{.*}}i{{[0-9]+}} @main()
int main() {
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
// CHECK: call void [[OFFLOADING_FUN_0:@.+]](
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call void [[OFFLOADING_FUN_1:@.+]](
// CHECK: call void @__kmpc_push_target_tripcount(i64 -1, i64 100)
// CHECK: call i{{[0-9]+}} @__tgt_target_teams(
// CHECK: call void [[OFFLOADING_FUN_2:@.+]](
// CHECK: = call {{.*}}i{{.+}} @{{.+}}tmain

View File

@ -21,15 +21,19 @@
int a[100];
// CK1: define {{.*}}i32 @{{.+}}teams_argument_globali(
int teams_argument_global(int n){
int teams_argument_global(int n){
int te = n / 128;
int th = 128;
// discard n_addr
// CK1: alloca i32,
// CK1: [[TE:%.+]] = alloca i32,
// CK1: [[TH:%.+]] = alloca i32,
// CK1: alloca i32,
// CK1: alloca i32,
// CK1: alloca i32,
// CK1: [[TE_CAST:%.+]] = alloca i{{32|64}},
// CK1: [[TH_CAST:%.+]] = alloca i{{32|64}},
// CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}})
// CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
// CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],

View File

@ -28,8 +28,12 @@ int teams_argument_global(int n){
// CK1: alloca i32,
// CK1: [[TE:%.+]] = alloca i32,
// CK1: [[TH:%.+]] = alloca i32,
// CK1: alloca i32,
// CK1: alloca i32,
// CK1: alloca i32,
// CK1: [[TE_CAST:%.+]] = alloca i{{32|64}},
// CK1: [[TH_CAST:%.+]] = alloca i{{32|64}},
// CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}})
// CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
// CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],
// CK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 {{.+}}, i32 {{.+}})

View File

@ -28,8 +28,12 @@ int teams_argument_global(int n){
// CK1: alloca i32,
// CK1: [[TE:%.+]] = alloca i32,
// CK1: [[TH:%.+]] = alloca i32,
// CK1: alloca i32,
// CK1: alloca i32,
// CK1: alloca i32,
// CK1: [[TE_CAST:%.+]] = alloca i{{32|64}},
// CK1: [[TH_CAST:%.+]] = alloca i{{32|64}},
// CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}})
// CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
// CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],

View File

@ -30,8 +30,12 @@ int teams_argument_global(int n) {
// CK1: alloca i32,
// CK1: [[TE:%.+]] = alloca i32,
// CK1: [[TH:%.+]] = alloca i32,
// CK1: alloca i32,
// CK1: alloca i32,
// CK1: alloca i32,
// CK1: [[TE_CAST:%.+]] = alloca i{{32|64}},
// CK1: [[TH_CAST:%.+]] = alloca i{{32|64}},
// CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}})
// CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]],
// CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]],