forked from OSchip/llvm-project
[OPENMP] Codegen for teams directive for NVPTX
This patch implements the teams directive for the NVPTX backend. It is different from the host code generation path as it: Does not call kmpc_fork_teams. All necessary teams and threads are started upon touching the target region, when launching a CUDA kernel, and their execution is coordinated through sequential and parallel regions within the target region. Does not call kmpc_push_num_teams even if a num_teams or thread_limit clause is present. Setting the number of teams and the thread limit is implemented by the nvptx-related runtime. Please note that I am now passing a Clang Expr * to emitPushNumTeams instead of the originally chosen llvm::Value * type. The reason for that is that I want to avoid emitting expressions for num_teams and thread_limit if they are not needed in the target region. http://reviews.llvm.org/D17963 llvm-svn: 265304
This commit is contained in:
parent
1eec3f01f0
commit
c687225b43
|
@ -4832,17 +4832,29 @@ void CGOpenMPRuntime::emitTeamsCall(CodeGenFunction &CGF,
|
|||
}
|
||||
|
||||
void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF,
|
||||
llvm::Value *NumTeams,
|
||||
llvm::Value *ThreadLimit,
|
||||
const Expr *NumTeams,
|
||||
const Expr *ThreadLimit,
|
||||
SourceLocation Loc) {
|
||||
if (!CGF.HaveInsertPoint())
|
||||
return;
|
||||
|
||||
auto *RTLoc = emitUpdateLocation(CGF, Loc);
|
||||
|
||||
llvm::Value *NumTeamsVal =
|
||||
(NumTeams)
|
||||
? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(NumTeams),
|
||||
CGF.CGM.Int32Ty, /* isSigned = */ true)
|
||||
: CGF.Builder.getInt32(0);
|
||||
|
||||
llvm::Value *ThreadLimitVal =
|
||||
(ThreadLimit)
|
||||
? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(ThreadLimit),
|
||||
CGF.CGM.Int32Ty, /* isSigned = */ true)
|
||||
: CGF.Builder.getInt32(0);
|
||||
|
||||
// Build call __kmpc_push_num_teams(&loc, global_tid, num_teams, thread_limit)
|
||||
llvm::Value *PushNumTeamsArgs[] = {
|
||||
RTLoc, getThreadID(CGF, Loc), NumTeams, ThreadLimit};
|
||||
llvm::Value *PushNumTeamsArgs[] = {RTLoc, getThreadID(CGF, Loc), NumTeamsVal,
|
||||
ThreadLimitVal};
|
||||
CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams),
|
||||
PushNumTeamsArgs);
|
||||
}
|
||||
|
|
|
@ -912,11 +912,10 @@ public:
|
|||
/// \brief Emits call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32
|
||||
/// global_tid, kmp_int32 num_teams, kmp_int32 thread_limit) to generate code
|
||||
/// for num_teams clause.
|
||||
/// \param NumTeams An integer value of teams.
|
||||
/// \param ThreadLimit An integer value of threads.
|
||||
virtual void emitNumTeamsClause(CodeGenFunction &CGF, llvm::Value *NumTeams,
|
||||
llvm::Value *ThreadLimit, SourceLocation Loc);
|
||||
|
||||
/// \param NumTeams An integer expression of teams.
|
||||
/// \param ThreadLimit An integer expression of threads.
|
||||
virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
|
||||
const Expr *ThreadLimit, SourceLocation Loc);
|
||||
};
|
||||
|
||||
} // namespace CodeGen
|
||||
|
|
|
@ -14,6 +14,8 @@
|
|||
|
||||
#include "CGOpenMPRuntimeNVPTX.h"
|
||||
#include "clang/AST/DeclOpenMP.h"
|
||||
#include "CodeGenFunction.h"
|
||||
#include "clang/AST/StmtOpenMP.h"
|
||||
|
||||
using namespace clang;
|
||||
using namespace CodeGen;
|
||||
|
@ -350,3 +352,45 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
|
|||
// Called once per module during initialization.
|
||||
initializeEnvironment();
|
||||
}
|
||||
|
||||
void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
|
||||
const Expr *NumTeams,
|
||||
const Expr *ThreadLimit,
|
||||
SourceLocation Loc) {}
|
||||
|
||||
llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
|
||||
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
|
||||
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
|
||||
|
||||
llvm::Function *OutlinedFun = nullptr;
|
||||
if (isa<OMPTeamsDirective>(D)) {
|
||||
llvm::Value *OutlinedFunVal =
|
||||
CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
|
||||
D, ThreadIDVar, InnermostKind, CodeGen);
|
||||
OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
|
||||
OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
} else
|
||||
llvm_unreachable("parallel directive is not yet supported for nvptx "
|
||||
"backend.");
|
||||
|
||||
return OutlinedFun;
|
||||
}
|
||||
|
||||
void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
|
||||
const OMPExecutableDirective &D,
|
||||
SourceLocation Loc,
|
||||
llvm::Value *OutlinedFn,
|
||||
ArrayRef<llvm::Value *> CapturedVars) {
|
||||
if (!CGF.HaveInsertPoint())
|
||||
return;
|
||||
|
||||
Address ZeroAddr =
|
||||
CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
|
||||
/*Name*/ ".zero.addr");
|
||||
CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
|
||||
llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
|
||||
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
|
||||
OutlinedFnArgs.push_back(ZeroAddr.getPointer());
|
||||
OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
|
||||
CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
|
||||
}
|
||||
|
|
|
@ -136,6 +136,41 @@ private:
|
|||
|
||||
public:
|
||||
explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
|
||||
|
||||
/// \brief This function ought to emit, in the general case, a call to
|
||||
/// the openmp runtime kmpc_push_num_teams. In NVPTX backend it is not needed
|
||||
/// as these numbers are obtained through the PTX grid and block configuration.
|
||||
/// \param NumTeams An integer expression of teams.
|
||||
/// \param ThreadLimit An integer expression of threads.
|
||||
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
|
||||
const Expr *ThreadLimit, SourceLocation Loc) override;
|
||||
|
||||
/// \brief Emits inlined function for the specified OpenMP parallel
|
||||
/// directive but an inlined function for teams.
|
||||
/// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
|
||||
/// kmp_int32 BoundID, struct context_vars*).
|
||||
/// \param D OpenMP directive.
|
||||
/// \param ThreadIDVar Variable for thread id in the current OpenMP region.
|
||||
/// \param InnermostKind Kind of innermost directive (for simple directives it
|
||||
/// is a directive itself, for combined - its innermost directive).
|
||||
/// \param CodeGen Code generation sequence for the \a D directive.
|
||||
llvm::Value *
|
||||
emitParallelOrTeamsOutlinedFunction(const OMPExecutableDirective &D,
|
||||
const VarDecl *ThreadIDVar,
|
||||
OpenMPDirectiveKind InnermostKind,
|
||||
const RegionCodeGenTy &CodeGen) override;
|
||||
|
||||
/// \brief Emits code for teams call of the \a OutlinedFn with
|
||||
/// variables captured in a record which address is stored in \a
|
||||
/// CapturedStruct.
|
||||
/// \param OutlinedFn Outlined function to be run by team masters. Type of
|
||||
/// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
|
||||
/// \param CapturedVars A pointer to the record with the references to
|
||||
/// variables used in \a OutlinedFn function.
|
||||
///
|
||||
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
|
||||
SourceLocation Loc, llvm::Value *OutlinedFn,
|
||||
ArrayRef<llvm::Value *> CapturedVars) override;
|
||||
};
|
||||
|
||||
} // CodeGen namespace.
|
||||
|
|
|
@ -3098,18 +3098,11 @@ static void emitCommonOMPTeamsDirective(CodeGenFunction &CGF,
|
|||
const OMPNumTeamsClause *NT = TD.getSingleClause<OMPNumTeamsClause>();
|
||||
const OMPThreadLimitClause *TL = TD.getSingleClause<OMPThreadLimitClause>();
|
||||
if (NT || TL) {
|
||||
llvm::Value *NumTeamsVal = (NT) ? CGF.Builder.CreateIntCast(
|
||||
CGF.EmitScalarExpr(NT->getNumTeams()), CGF.CGM.Int32Ty,
|
||||
/* isSigned = */ true) :
|
||||
CGF.Builder.getInt32(0);
|
||||
Expr *NumTeams = (NT) ? NT->getNumTeams() : nullptr;
|
||||
Expr *ThreadLimit = (TL) ? TL->getThreadLimit() : nullptr;
|
||||
|
||||
llvm::Value *ThreadLimitVal = (TL) ? CGF.Builder.CreateIntCast(
|
||||
CGF.EmitScalarExpr(TL->getThreadLimit()), CGF.CGM.Int32Ty,
|
||||
/* isSigned = */ true) :
|
||||
CGF.Builder.getInt32(0);
|
||||
|
||||
CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeamsVal,
|
||||
ThreadLimitVal, S.getLocStart());
|
||||
CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeams, ThreadLimit,
|
||||
S.getLocStart());
|
||||
}
|
||||
|
||||
OMPLexicalScope Scope(CGF, S);
|
||||
|
|
|
@ -0,0 +1,136 @@
|
|||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
|
||||
#ifdef CK1
|
||||
|
||||
template <typename T>
|
||||
int tmain(T argc) {
|
||||
#pragma omp target
|
||||
#pragma omp teams
|
||||
argc = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int main (int argc, char **argv) {
|
||||
#pragma omp target
|
||||
#pragma omp teams
|
||||
{
|
||||
argc = 0;
|
||||
}
|
||||
return tmain(argv);
|
||||
}
|
||||
|
||||
// only nvptx side: do not outline teams region and do not call fork_teams
|
||||
// CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]])
|
||||
// CK1: {{.+}} = alloca i{{[0-9]+}}*,
|
||||
// CK1: {{.+}} = alloca i{{[0-9]+}}*,
|
||||
// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
|
||||
// CK1: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
|
||||
// CK1: store {{.+}} 0, {{.+}},
|
||||
// CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
|
||||
// CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
|
||||
// CK1-64: store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
|
||||
// CK1-32: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
|
||||
// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
|
||||
// CK1: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
|
||||
// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
|
||||
// CK1: ret void
|
||||
// CK1-NEXT: }
|
||||
|
||||
// target region in template
|
||||
// CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}***{{.+}} [[ARGC:%.+]])
|
||||
// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
|
||||
// CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}***,
|
||||
// CK1: store i{{.+}}*** [[ARGC]], i{{.+}}**** [[ARGCADDR]]
|
||||
// CK1: [[ARGCADDR_REF:%.+]] = load i{{.+}}***, i{{.+}}**** [[ARGCADDR]],
|
||||
// CK1: store i8*** [[ARGCADDR_REF]], i8**** [[ARGCADDR_PTR]],
|
||||
// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}***, i{{.+}}**** [[ARGCADDR_PTR]],
|
||||
// CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
|
||||
// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
|
||||
// CK1: ret void
|
||||
// CK1-NEXT: }
|
||||
|
||||
|
||||
#endif // CK1
|
||||
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
|
||||
// expected-no-diagnostics
|
||||
#ifdef CK2
|
||||
|
||||
template <typename T>
|
||||
int tmain(T argc) {
|
||||
int a = 10;
|
||||
int b = 5;
|
||||
#pragma omp target
|
||||
#pragma omp teams num_teams(a) thread_limit(b)
|
||||
{
|
||||
argc = 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main (int argc, char **argv) {
|
||||
int a = 20;
|
||||
int b = 5;
|
||||
#pragma omp target
|
||||
#pragma omp teams num_teams(a) thread_limit(b)
|
||||
{
|
||||
argc = 0;
|
||||
}
|
||||
return tmain(argv);
|
||||
}
|
||||
|
||||
// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]])
|
||||
// CK2: {{.}} = alloca i{{[0-9]+}}*,
|
||||
// CK2: {{.}} = alloca i{{[0-9]+}}*,
|
||||
// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
|
||||
// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}},
|
||||
// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}},
|
||||
// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
|
||||
// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num(
|
||||
// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
|
||||
// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
|
||||
// CK2: store i{{[0-9]+}} [[ARGC_IN]], i{{[0-9]+}}* [[ARGCADDR]],
|
||||
// CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
|
||||
// CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
|
||||
// CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
|
||||
// CK2-64: store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
|
||||
// CK2-32: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
|
||||
// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
|
||||
// CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
|
||||
// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams(
|
||||
// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
|
||||
// CK2: ret
|
||||
|
||||
// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}}*{{.+}} [[A_IN:%.+]], i{{[0-9]+}}*{{.+}} [[BP:%.+]], i{{[0-9]+}}***{{.+}} [[ARGC:%.+]])
|
||||
// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}***,
|
||||
// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}*,
|
||||
// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}*,
|
||||
// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}***,
|
||||
// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num(
|
||||
// CK2: store i{{[0-9]+}}* [[A_IN]], i{{[0-9]+}}** [[AADDR]],
|
||||
// CK2: store i{{[0-9]+}}* [[B_IN]], i{{[0-9]+}}** [[BADDR]],
|
||||
// CK2: store i{{[0-9]+}}*** [[ARGC]], i{{[0-9]+}}**** [[ARGCADDR]],
|
||||
// CK2: [[A_ADDR_VAL:%.+]] = load i32*, i32** [[AADDR]]
|
||||
// CK2: [[B_ADDR_VAL:%.+]] = load i32*, i32** [[BADDR]]
|
||||
// CK2: [[ARGC_ADDR_VAL:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR]]
|
||||
// CK2: store i{{[0-9]+}}*** [[ARGC_ADDR_VAL]], i{{[0-9]+}}**** [[ARGCADDR_PTR]],
|
||||
// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR_PTR]],
|
||||
// CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
|
||||
// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams(
|
||||
// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
|
||||
// CK2: ret void
|
||||
|
||||
#endif // CK2
|
||||
#endif
|
Loading…
Reference in New Issue