[OPENMP] Codegen for teams directive for NVPTX

This patch implements the teams directive for the NVPTX backend. It is different from the host code generation path as it:

Does not call kmpc_fork_teams. All necessary teams and threads are started upon touching the target region, when launching a CUDA kernel, and their execution is coordinated through sequential and parallel regions within the target region.
Does not call kmpc_push_num_teams even if a num_teams of thread_limit clause is present. Setting the number of teams and the thread limit is implemented by the nvptx-related runtime.
Please note that I am now passing a Clang Expr * to emitPushNumTeams instead of the originally chosen llvm::Value * type. The reason for that is that I want to avoid emitting expressions for num_teams and thread_limit if they are not needed in the target region.

http://reviews.llvm.org/D17963

llvm-svn: 265304
This commit is contained in:
Carlo Bertolli 2016-04-04 15:55:02 +00:00
parent 1eec3f01f0
commit c687225b43
6 changed files with 239 additions and 20 deletions

View File

@ -4832,17 +4832,29 @@ void CGOpenMPRuntime::emitTeamsCall(CodeGenFunction &CGF,
}
void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF,
llvm::Value *NumTeams,
llvm::Value *ThreadLimit,
const Expr *NumTeams,
const Expr *ThreadLimit,
SourceLocation Loc) {
if (!CGF.HaveInsertPoint())
return;
auto *RTLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *NumTeamsVal =
(NumTeams)
? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(NumTeams),
CGF.CGM.Int32Ty, /* isSigned = */ true)
: CGF.Builder.getInt32(0);
llvm::Value *ThreadLimitVal =
(ThreadLimit)
? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(ThreadLimit),
CGF.CGM.Int32Ty, /* isSigned = */ true)
: CGF.Builder.getInt32(0);
// Build call __kmpc_push_num_teamss(&loc, global_tid, num_teams, thread_limit)
llvm::Value *PushNumTeamsArgs[] = {
RTLoc, getThreadID(CGF, Loc), NumTeams, ThreadLimit};
llvm::Value *PushNumTeamsArgs[] = {RTLoc, getThreadID(CGF, Loc), NumTeamsVal,
ThreadLimitVal};
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams),
PushNumTeamsArgs);
}

View File

@ -912,11 +912,10 @@ public:
/// \brief Emits call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32
/// global_tid, kmp_int32 num_teams, kmp_int32 thread_limit) to generate code
/// for num_teams clause.
/// \param NumTeams An integer value of teams.
/// \param ThreadLimit An integer value of threads.
virtual void emitNumTeamsClause(CodeGenFunction &CGF, llvm::Value *NumTeams,
llvm::Value *ThreadLimit, SourceLocation Loc);
/// \param NumTeams An integer expression of teams.
/// \param ThreadLimit An integer expression of threads.
virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
const Expr *ThreadLimit, SourceLocation Loc);
};
} // namespace CodeGen

View File

@ -14,6 +14,8 @@
#include "CGOpenMPRuntimeNVPTX.h"
#include "clang/AST/DeclOpenMP.h"
#include "CodeGenFunction.h"
#include "clang/AST/StmtOpenMP.h"
using namespace clang;
using namespace CodeGen;
@ -350,3 +352,45 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
// Called once per module during initialization.
initializeEnvironment();
}
void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
const Expr *NumTeams,
const Expr *ThreadLimit,
SourceLocation Loc) {}
llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
llvm::Function *OutlinedFun = nullptr;
if (isa<OMPTeamsDirective>(D)) {
llvm::Value *OutlinedFunVal =
CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
D, ThreadIDVar, InnermostKind, CodeGen);
OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
} else
llvm_unreachable("parallel directive is not yet supported for nvptx "
"backend.");
return OutlinedFun;
}
void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
const OMPExecutableDirective &D,
SourceLocation Loc,
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars) {
if (!CGF.HaveInsertPoint())
return;
Address ZeroAddr =
CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
/*Name*/ ".zero.addr");
CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
}

View File

@ -136,6 +136,41 @@ private:
public:
explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
/// \brief This function ought to emit, in the general case, a call to
// the openmp runtime kmpc_push_num_teams. In NVPTX backend it is not needed
// as these numbers are obtained through the PTX grid and block configuration.
/// \param NumTeams An integer expression of teams.
/// \param ThreadLimit An integer expression of threads.
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
const Expr *ThreadLimit, SourceLocation Loc) override;
/// \brief Emits inlined function for the specified OpenMP parallel
// directive but an inlined function for teams.
/// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
/// kmp_int32 BoundID, struct context_vars*).
/// \param D OpenMP directive.
/// \param ThreadIDVar Variable for thread id in the current OpenMP region.
/// \param InnermostKind Kind of innermost directive (for simple directives it
/// is a directive itself, for combined - its innermost directive).
/// \param CodeGen Code generation sequence for the \a D directive.
llvm::Value *
emitParallelOrTeamsOutlinedFunction(const OMPExecutableDirective &D,
const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind,
const RegionCodeGenTy &CodeGen) override;
/// \brief Emits code for teams call of the \a OutlinedFn with
/// variables captured in a record which address is stored in \a
/// CapturedStruct.
/// \param OutlinedFn Outlined function to be run by team masters. Type of
/// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
/// \param CapturedVars A pointer to the record with the references to
/// variables used in \a OutlinedFn function.
///
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
SourceLocation Loc, llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars) override;
};
} // CodeGen namespace.

View File

@ -3098,18 +3098,11 @@ static void emitCommonOMPTeamsDirective(CodeGenFunction &CGF,
const OMPNumTeamsClause *NT = TD.getSingleClause<OMPNumTeamsClause>();
const OMPThreadLimitClause *TL = TD.getSingleClause<OMPThreadLimitClause>();
if (NT || TL) {
llvm::Value *NumTeamsVal = (NT) ? CGF.Builder.CreateIntCast(
CGF.EmitScalarExpr(NT->getNumTeams()), CGF.CGM.Int32Ty,
/* isSigned = */ true) :
CGF.Builder.getInt32(0);
Expr *NumTeams = (NT) ? NT->getNumTeams() : nullptr;
Expr *ThreadLimit = (TL) ? TL->getThreadLimit() : nullptr;
llvm::Value *ThreadLimitVal = (TL) ? CGF.Builder.CreateIntCast(
CGF.EmitScalarExpr(TL->getThreadLimit()), CGF.CGM.Int32Ty,
/* isSigned = */ true) :
CGF.Builder.getInt32(0);
CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeamsVal,
ThreadLimitVal, S.getLocStart());
CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeams, ThreadLimit,
S.getLocStart());
}
OMPLexicalScope Scope(CGF, S);

View File

@ -0,0 +1,136 @@
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
#ifdef CK1
template <typename T>
int tmain(T argc) {
#pragma omp target
#pragma omp teams
argc = 0;
return 0;
}
int main (int argc, char **argv) {
#pragma omp target
#pragma omp teams
{
argc = 0;
}
return tmain(argv);
}
// only nvptx side: do not outline teams region and do not call fork_teams
// CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]])
// CK1: {{.+}} = alloca i{{[0-9]+}}*,
// CK1: {{.+}} = alloca i{{[0-9]+}}*,
// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
// CK1: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
// CK1: store {{.+}} 0, {{.+}},
// CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
// CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
// CK1-64: store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
// CK1-32: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
// CK1: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
// CK1: ret void
// CK1-NEXT: }
// target region in template
// CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}***{{.+}} [[ARGC:%.+]])
// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
// CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}***,
// CK1: store i{{.+}}*** [[ARGC]], i{{.+}}**** [[ARGCADDR]]
// CK1: [[ARGCADDR_REF:%.+]] = load i{{.+}}***, i{{.+}}**** [[ARGCADDR]],
// CK1: store i8*** [[ARGCADDR_REF]], i8**** [[ARGCADDR_PTR]],
// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}***, i{{.+}}**** [[ARGCADDR_PTR]],
// CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
// CK1: ret void
// CK1-NEXT: }
#endif // CK1
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
// expected-no-diagnostics
#ifdef CK2
template <typename T>
int tmain(T argc) {
int a = 10;
int b = 5;
#pragma omp target
#pragma omp teams num_teams(a) thread_limit(b)
{
argc = 0;
}
return 0;
}
int main (int argc, char **argv) {
int a = 20;
int b = 5;
#pragma omp target
#pragma omp teams num_teams(a) thread_limit(b)
{
argc = 0;
}
return tmain(argv);
}
// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]])
// CK2: {{.}} = alloca i{{[0-9]+}}*,
// CK2: {{.}} = alloca i{{[0-9]+}}*,
// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}},
// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}},
// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num(
// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
// CK2: store i{{[0-9]+}} [[ARGC_IN]], i{{[0-9]+}}* [[ARGCADDR]],
// CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
// CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
// CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
// CK2-64: store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
// CK2-32: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
// CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams(
// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
// CK2: ret
// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}}*{{.+}} [[A_IN:%.+]], i{{[0-9]+}}*{{.+}} [[BP:%.+]], i{{[0-9]+}}***{{.+}} [[ARGC:%.+]])
// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}***,
// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}*,
// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}*,
// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}***,
// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num(
// CK2: store i{{[0-9]+}}* [[A_IN]], i{{[0-9]+}}** [[AADDR]],
// CK2: store i{{[0-9]+}}* [[B_IN]], i{{[0-9]+}}** [[BADDR]],
// CK2: store i{{[0-9]+}}*** [[ARGC]], i{{[0-9]+}}**** [[ARGCADDR]],
// CK2: [[A_ADDR_VAL:%.+]] = load i32*, i32** [[AADDR]]
// CK2: [[B_ADDR_VAL:%.+]] = load i32*, i32** [[BADDR]]
// CK2: [[ARGC_ADDR_VAL:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR]]
// CK2: store i{{[0-9]+}}*** [[ARGC_ADDR_VAL]], i{{[0-9]+}}**** [[ARGCADDR_PTR]],
// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR_PTR]],
// CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams(
// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
// CK2: ret void
#endif // CK2
#endif