[OPENMP] Codegen for teams directive for NVPTX

This patch implements the teams directive for the NVPTX backend. It is different from the host code generation path as it: Does not call kmpc_fork_teams. All necessary teams and threads are started upon touching the target region, when launching a CUDA kernel, and their execution is coordinated through sequential and parallel regions within the target region. Does not call kmpc_push_num_teams even if a num_teams of thread_limit clause is present. Setting the number of teams and the thread limit is implemented by the nvptx-related runtime. Please note that I am now passing a Clang Expr * to emitPushNumTeams instead of the originally chosen llvm::Value * type. The reason for that is that I want to avoid emitting expressions for num_teams and thread_limit if they are not needed in the target region. http://reviews.llvm.org/D17963 llvm-svn: 265304
2016-04-04 15:55:02 +00:00 · 2016-04-04 15:55:02 +00:00 · c687225b43
parent 1eec3f01f0
commit c687225b43
6 changed files with 239 additions and 20 deletions
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@ -4832,17 +4832,29 @@ void CGOpenMPRuntime::emitTeamsCall(CodeGenFunction &CGF,
 }

 void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF,
-                                         llvm::Value *NumTeams,
-                                         llvm::Value *ThreadLimit,
+                                         const Expr *NumTeams,
+                                         const Expr *ThreadLimit,
                                         SourceLocation Loc) {
  if (!CGF.HaveInsertPoint())
    return;

  auto *RTLoc = emitUpdateLocation(CGF, Loc);

+  llvm::Value *NumTeamsVal =
+      (NumTeams)
+          ? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(NumTeams),
+                                      CGF.CGM.Int32Ty, /* isSigned = */ true)
+          : CGF.Builder.getInt32(0);
+
+  llvm::Value *ThreadLimitVal =
+      (ThreadLimit)
+          ? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(ThreadLimit),
+                                      CGF.CGM.Int32Ty, /* isSigned = */ true)
+          : CGF.Builder.getInt32(0);
+
  // Build call __kmpc_push_num_teamss(&loc, global_tid, num_teams, thread_limit)
-  llvm::Value *PushNumTeamsArgs[] = {
-      RTLoc, getThreadID(CGF, Loc), NumTeams, ThreadLimit};
+  llvm::Value *PushNumTeamsArgs[] = {RTLoc, getThreadID(CGF, Loc), NumTeamsVal,
+                                     ThreadLimitVal};
  CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_push_num_teams),
                      PushNumTeamsArgs);
 }
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@ -912,11 +912,10 @@ public:
  /// \brief Emits call to void __kmpc_push_num_teams(ident_t *loc, kmp_int32
  /// global_tid, kmp_int32 num_teams, kmp_int32 thread_limit) to generate code
  /// for num_teams clause.
-  /// \param NumTeams An integer value of teams.
-  /// \param ThreadLimit An integer value of threads.
-  virtual void emitNumTeamsClause(CodeGenFunction &CGF, llvm::Value *NumTeams,
-                                  llvm::Value *ThreadLimit, SourceLocation Loc);
-
+  /// \param NumTeams An integer expression of teams.
+  /// \param ThreadLimit An integer expression of threads.
+  virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
+                                  const Expr *ThreadLimit, SourceLocation Loc);
 };

 } // namespace CodeGen
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@ -14,6 +14,8 @@

 #include "CGOpenMPRuntimeNVPTX.h"
 #include "clang/AST/DeclOpenMP.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"

 using namespace clang;
 using namespace CodeGen;
@ -350,3 +352,45 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
  // Called once per module during initialization.
  initializeEnvironment();
 }
+
+void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
+                                              const Expr *NumTeams,
+                                              const Expr *ThreadLimit,
+                                              SourceLocation Loc) {}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+  llvm::Function *OutlinedFun = nullptr;
+  if (isa<OMPTeamsDirective>(D)) {
+    llvm::Value *OutlinedFunVal =
+        CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
+            D, ThreadIDVar, InnermostKind, CodeGen);
+    OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
+    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
+  } else
+    llvm_unreachable("parallel directive is not yet supported for nvptx "
+                     "backend.");
+
+  return OutlinedFun;
+}
+
+void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
+                                         const OMPExecutableDirective &D,
+                                         SourceLocation Loc,
+                                         llvm::Value *OutlinedFn,
+                                         ArrayRef<llvm::Value *> CapturedVars) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  Address ZeroAddr =
+      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
+                           /*Name*/ ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
+}
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@ -136,6 +136,41 @@ private:

 public:
  explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
+
+  /// \brief This function ought to emit, in the general case, a call to
+  // the openmp runtime kmpc_push_num_teams. In NVPTX backend it is not needed
+  // as these numbers are obtained through the PTX grid and block configuration.
+  /// \param NumTeams An integer expression of teams.
+  /// \param ThreadLimit An integer expression of threads.
+  void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
+                          const Expr *ThreadLimit, SourceLocation Loc) override;
+
+  /// \brief Emits inlined function for the specified OpenMP parallel
+  //  directive but an inlined function for teams.
+  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Value *
+  emitParallelOrTeamsOutlinedFunction(const OMPExecutableDirective &D,
+                                      const VarDecl *ThreadIDVar,
+                                      OpenMPDirectiveKind InnermostKind,
+                                      const RegionCodeGenTy &CodeGen) override;
+
+  /// \brief Emits code for teams call of the \a OutlinedFn with
+  /// variables captured in a record which address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run by team masters. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  ///
+  void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
+                     SourceLocation Loc, llvm::Value *OutlinedFn,
+                     ArrayRef<llvm::Value *> CapturedVars) override;
 };

 } // CodeGen namespace.
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@ -3098,18 +3098,11 @@ static void emitCommonOMPTeamsDirective(CodeGenFunction &CGF,
  const OMPNumTeamsClause *NT = TD.getSingleClause<OMPNumTeamsClause>();
  const OMPThreadLimitClause *TL = TD.getSingleClause<OMPThreadLimitClause>();
  if (NT || TL) {
-    llvm::Value *NumTeamsVal = (NT) ? CGF.Builder.CreateIntCast(
-        CGF.EmitScalarExpr(NT->getNumTeams()), CGF.CGM.Int32Ty,
-        /* isSigned = */ true) :
-        CGF.Builder.getInt32(0);
+    Expr *NumTeams = (NT) ? NT->getNumTeams() : nullptr;
+    Expr *ThreadLimit = (TL) ? TL->getThreadLimit() : nullptr;

-    llvm::Value *ThreadLimitVal = (TL) ? CGF.Builder.CreateIntCast(
-        CGF.EmitScalarExpr(TL->getThreadLimit()), CGF.CGM.Int32Ty,
-        /* isSigned = */ true) :
-        CGF.Builder.getInt32(0);
-
-    CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeamsVal,
-        ThreadLimitVal, S.getLocStart());
+    CGF.CGM.getOpenMPRuntime().emitNumTeamsClause(CGF, NumTeams, ThreadLimit,
+                                                  S.getLocStart());
  }

  OMPLexicalScope Scope(CGF, S);
--- a/clang/test/OpenMP/nvptx_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp
@ -0,0 +1,136 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+#ifdef CK1
+
+template <typename T>
+int tmain(T argc) {
+#pragma omp target
+#pragma omp teams
+  argc = 0;
+  return 0;
+}
+
+
+int main (int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+  {
+  argc = 0;
+  }
+  return tmain(argv);
+}
+
+// only nvptx side: do not outline teams region and do not call fork_teams
+// CK1:  define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]])
+// CK1:  {{.+}} = alloca i{{[0-9]+}}*,
+// CK1:  {{.+}} = alloca i{{[0-9]+}}*,
+// CK1:  [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
+// CK1:  [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
+// CK1:  store {{.+}} 0, {{.+}},
+// CK1:  store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
+// CK1-64:  [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
+// CK1-64:  store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1-32:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1:  store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
+// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK1:  ret void
+// CK1-NEXT: }
+
+// target region in template
+// CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}***{{.+}} [[ARGC:%.+]])
+// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
+// CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}***,
+// CK1: store i{{.+}}*** [[ARGC]], i{{.+}}**** [[ARGCADDR]]
+// CK1: [[ARGCADDR_REF:%.+]] = load i{{.+}}***, i{{.+}}**** [[ARGCADDR]],
+// CK1: store i8*** [[ARGCADDR_REF]], i8**** [[ARGCADDR_PTR]],
+// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}***, i{{.+}}**** [[ARGCADDR_PTR]],
+// CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
+// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK1:  ret void
+// CK1-NEXT: }
+
+
+#endif // CK1
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fomptargets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fomptargets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fomp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// expected-no-diagnostics
+#ifdef CK2
+
+template <typename T>
+int tmain(T argc) {
+  int a = 10;
+  int b = 5;
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b)
+  {
+  argc = 0;
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) {
+  int a = 20;
+  int b = 5;
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b)
+  {
+  argc = 0;
+  }
+  return tmain(argv);
+}
+
+// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]])
+// CK2: {{.}} = alloca i{{[0-9]+}}*,
+// CK2: {{.}} = alloca i{{[0-9]+}}*,
+// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
+// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2-NOT:  {{%.+}} = call i32 @__kmpc_global_thread_num(
+// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
+// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
+// CK2: store i{{[0-9]+}} [[ARGC_IN]], i{{[0-9]+}}* [[ARGCADDR]],
+// CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
+// CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
+// CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
+// CK2-64:  store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2-32:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
+// CK2-NOT:  {{.+}} = call i32 @__kmpc_push_num_teams(
+// CK2-NOT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK2: ret
+
+// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}}*{{.+}} [[A_IN:%.+]], i{{[0-9]+}}*{{.+}} [[BP:%.+]], i{{[0-9]+}}***{{.+}}  [[ARGC:%.+]])
+// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}***,
+// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}*,
+// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}*,
+// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}***,
+// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num(
+// CK2: store i{{[0-9]+}}* [[A_IN]], i{{[0-9]+}}** [[AADDR]],
+// CK2: store i{{[0-9]+}}* [[B_IN]], i{{[0-9]+}}** [[BADDR]],
+// CK2: store i{{[0-9]+}}*** [[ARGC]], i{{[0-9]+}}**** [[ARGCADDR]],
+// CK2: [[A_ADDR_VAL:%.+]] = load i32*, i32** [[AADDR]]
+// CK2: [[B_ADDR_VAL:%.+]] = load i32*, i32** [[BADDR]]
+// CK2: [[ARGC_ADDR_VAL:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR]]
+// CK2: store i{{[0-9]+}}*** [[ARGC_ADDR_VAL]], i{{[0-9]+}}**** [[ARGCADDR_PTR]],
+// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR_PTR]],
+// CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
+// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams(
+// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK2:  ret void
+
+#endif // CK2
+#endif