[OPENMP, NVPTX] Initial support for L2 parallelism in SPMD mode.

Added initial support for L2 parallelism in SPMD mode. Note, though, that the orphaned parallel directives are not currently supported in SPMD mode. llvm-svn: 332016
2018-05-10 18:32:08 +00:00 · 2018-05-10 18:32:08 +00:00 · bf5c84861c
parent 0aae2bc260
commit bf5c84861c
6 changed files with 216 additions and 73 deletions
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@ -140,13 +140,15 @@ public:
 /// to emit optimized code.
 class ExecutionModeRAII {
 private:
-  bool SavedMode;
-  bool &Mode;
+  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
+  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

 public:
-  ExecutionModeRAII(bool &Mode, bool NewMode) : Mode(Mode) {
+  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, bool IsSPMD)
+      : Mode(Mode) {
    SavedMode = Mode;
-    Mode = NewMode;
+    Mode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD
+                  : CGOpenMPRuntimeNVPTX::EM_NonSPMD;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
 };
@ -579,8 +581,9 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
  WorkerFn->setDoesNotRecurse();
 }

-bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
-  return IsInSPMDExecutionMode;
+CGOpenMPRuntimeNVPTX::ExecutionMode
+CGOpenMPRuntimeNVPTX::getExecutionMode() const {
+  return CurrentExecutionMode;
 }

 static CGOpenMPRuntimeNVPTX::DataSharingMode
@ -589,34 +592,96 @@ getDataSharingMode(CodeGenModule &CGM) {
                                          : CGOpenMPRuntimeNVPTX::Generic;
 }

-/// Check for inner (nested) SPMD construct, if any
-static bool hasNestedSPMDDirective(const OMPExecutableDirective &D) {
-  const auto *CS = D.getCapturedStmt(OMPD_target);
-  const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
-  const Stmt *ChildStmt = nullptr;
+/// Checks if the \p Body is the \a CompoundStmt and returns its child statement
+/// iff there is only one.
+static const Stmt *getSingleCompoundChild(const Stmt *Body) {
  if (const auto *C = dyn_cast<CompoundStmt>(Body))
    if (C->size() == 1)
-      ChildStmt = C->body_front();
-  if (!ChildStmt)
-    return false;
+      return C->body_front();
+  return Body;
+}
+
+/// Check if the parallel directive has an 'if' clause with non-constant or
+/// false condition.
+static bool hasParallelIfClause(ASTContext &Ctx,
+                                const OMPExecutableDirective &D) {
+  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
+    OpenMPDirectiveKind NameModifier = C->getNameModifier();
+    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
+      continue;
+    const Expr *Cond = C->getCondition();
+    bool Result;
+    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
+      return true;
+  }
+  return false;
+}
+
+/// Check for inner (nested) SPMD construct, if any
+static bool hasNestedSPMDDirective(ASTContext &Ctx,
+                                   const OMPExecutableDirective &D) {
+  const auto *CS = D.getInnermostCapturedStmt();
+  const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
+  const Stmt *ChildStmt = getSingleCompoundChild(Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
-    // TODO: add further analysis for inner teams|distribute directives, if any.
    switch (D.getDirectiveKind()) {
    case OMPD_target:
-      return (isOpenMPParallelDirective(DKind) &&
-              !isOpenMPTeamsDirective(DKind) &&
-              !isOpenMPDistributeDirective(DKind)) ||
-             isOpenMPSimdDirective(DKind) ||
-             DKind == OMPD_teams_distribute_parallel_for;
+      if ((isOpenMPParallelDirective(DKind) &&
+           !hasParallelIfClause(Ctx, *NestedDir)) ||
+          isOpenMPSimdDirective(DKind))
+        return true;
+      if (DKind == OMPD_teams || DKind == OMPD_teams_distribute) {
+        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
+        if (!Body)
+          return false;
+        ChildStmt = getSingleCompoundChild(Body);
+        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+          DKind = NND->getDirectiveKind();
+          if ((isOpenMPParallelDirective(DKind) &&
+               !hasParallelIfClause(Ctx, *NND)) ||
+              isOpenMPSimdDirective(DKind))
+            return true;
+          if (DKind == OMPD_distribute) {
+            Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
+            if (!Body)
+              return false;
+            ChildStmt = getSingleCompoundChild(Body);
+            if (!ChildStmt)
+              return false;
+            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+              DKind = NND->getDirectiveKind();
+              return (isOpenMPParallelDirective(DKind) &&
+                      !hasParallelIfClause(Ctx, *NND)) ||
+                     isOpenMPSimdDirective(DKind);
+            }
+          }
+        }
+      }
+      return false;
    case OMPD_target_teams:
-      return (isOpenMPParallelDirective(DKind) &&
-              !isOpenMPDistributeDirective(DKind)) ||
-             isOpenMPSimdDirective(DKind) ||
-             DKind == OMPD_distribute_parallel_for;
+      if ((isOpenMPParallelDirective(DKind) &&
+           !hasParallelIfClause(Ctx, *NestedDir)) ||
+          isOpenMPSimdDirective(DKind))
+        return true;
+      if (DKind == OMPD_distribute) {
+        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
+        if (!Body)
+          return false;
+        ChildStmt = getSingleCompoundChild(Body);
+        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+          DKind = NND->getDirectiveKind();
+          return (isOpenMPParallelDirective(DKind) &&
+                  !hasParallelIfClause(Ctx, *NND)) ||
+                 isOpenMPSimdDirective(DKind);
+        }
+      }
+      return false;
    case OMPD_target_teams_distribute:
-      return isOpenMPParallelDirective(DKind) || isOpenMPSimdDirective(DKind);
+      return (isOpenMPParallelDirective(DKind) &&
+              !hasParallelIfClause(Ctx, *NestedDir)) ||
+             isOpenMPSimdDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
@ -674,20 +739,22 @@ static bool hasNestedSPMDDirective(const OMPExecutableDirective &D) {
  return false;
 }

-static bool supportsSPMDExecutionMode(const OMPExecutableDirective &D) {
+static bool supportsSPMDExecutionMode(ASTContext &Ctx,
+                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_teams_distribute:
-    return hasNestedSPMDDirective(D);
-  case OMPD_target_simd:
+    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
-  case OMPD_target_teams_distribute_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
+    return !hasParallelIfClause(Ctx, D);
+  case OMPD_target_simd:
+  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_parallel:
  case OMPD_for:
@ -744,7 +811,7 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
-  ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/false);
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/false);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getLocStart());
  Work.clear();
@ -858,7 +925,7 @@ void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
-  ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/true);
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/true);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
@ -905,11 +972,13 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(

  CGF.EmitBlock(ExecuteBB);

+  IsInTargetMasterThreadRegion = true;
  emitGenericVarsProlog(CGF, D.getLocStart());
 }

 void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
+  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

@ -1380,7 +1449,7 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(

  assert(!ParentName.empty() && "Invalid target region parent name!");

-  bool Mode = supportsSPMDExecutionMode(D);
+  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
@ -1401,8 +1470,8 @@ void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
-  // TODO: If in Spmd mode and L1 parallel emit the clause.
-  if (isInSpmdExecutionMode())
+  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD &&
+      IsInTargetMasterThreadRegion)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
@ -1412,8 +1481,8 @@ void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
-  // TODO: If in Spmd mode and L1 parallel emit the clause.
-  if (isInSpmdExecutionMode())
+  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD &&
+      IsInTargetMasterThreadRegion)
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
@ -1457,7 +1526,8 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
-  if (!isInSpmdExecutionMode() && !IsInParallelRegion) {
+  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
+      !IsInParallelRegion) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
@ -1635,7 +1705,7 @@ void CGOpenMPRuntimeNVPTX::emitParallelCall(
  if (!CGF.HaveInsertPoint())
    return;

-  if (isInSpmdExecutionMode())
+  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
  else
    emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
@ -1759,6 +1829,8 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
      SeqGen(CGF, Action);
    } else if (IsInTargetMasterThreadRegion) {
      L0ParallelGen(CGF, Action);
+    } else if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD) {
+      RCG(CGF);
    } else {
      // Check for master and then parallelism:
      // if (is_master) {
@ -1770,20 +1842,18 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
      // }
      CGBuilderTy &Bld = CGF.Builder;
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-      if (!isInSpmdExecutionMode()) {
-        llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
-        llvm::BasicBlock *ParallelCheckBB =
-            CGF.createBasicBlock(".parallelcheck");
-        llvm::Value *IsMaster =
-            Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
-        Bld.CreateCondBr(IsMaster, MasterCheckBB, ParallelCheckBB);
-        CGF.EmitBlock(MasterCheckBB);
-        L0ParallelGen(CGF, Action);
-        CGF.EmitBranch(ExitBB);
-        // There is no need to emit line number for unconditional branch.
-        (void)ApplyDebugLocation::CreateEmpty(CGF);
-        CGF.EmitBlock(ParallelCheckBB);
-      }
+      llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
+      llvm::BasicBlock *ParallelCheckBB =
+          CGF.createBasicBlock(".parallelcheck");
+      llvm::Value *IsMaster =
+          Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+      Bld.CreateCondBr(IsMaster, MasterCheckBB, ParallelCheckBB);
+      CGF.EmitBlock(MasterCheckBB);
+      L0ParallelGen(CGF, Action);
+      CGF.EmitBranch(ExitBB);
+      // There is no need to emit line number for unconditional branch.
+      (void)ApplyDebugLocation::CreateEmpty(CGF);
+      CGF.EmitBlock(ParallelCheckBB);
      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
      llvm::Value *ThreadID = getThreadID(CGF, Loc);
      llvm::Value *PL = CGF.EmitRuntimeCall(
@ -1827,14 +1897,49 @@ void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
  // is added on Spmd target directives.
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;

-  Address ZeroAddr = CGF.CreateMemTemp(
-      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
-      ".zero.addr");
+  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
+                                           /*DestWidth=*/32, /*Signed=*/1),
+                                       ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
-  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
-  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
-  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
+  Address ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
+  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
+                    ThreadIDAddr](CodeGenFunction &CGF,
+                                  PrePostActionTy &Action) {
+    Action.Enter(CGF);
+
+    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
+    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+    emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
+  };
+  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
+                                        PrePostActionTy &) {
+
+    RegionCodeGenTy RCG(CodeGen);
+    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+    llvm::Value *ThreadID = getThreadID(CGF, Loc);
+    llvm::Value *Args[] = {RTLoc, ThreadID};
+
+    NVPTXActionTy Action(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
+        Args,
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
+        Args);
+    RCG.setAction(Action);
+    RCG(CGF);
+  };
+
+  if (IsInTargetMasterThreadRegion) {
+    RegionCodeGenTy RCG(CodeGen);
+    RCG(CGF);
+  } else {
+    // If we are not in the target region, it is definitely L2 parallelism or
+    // more, because for SPMD mode we always has L1 parallel level, sowe don't
+    // need to check for orphaned directives.
+    RegionCodeGenTy RCG(SeqGen);
+    RCG(CGF);
+  }
 }

 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@ -24,6 +24,16 @@ namespace clang {
 namespace CodeGen {

 class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
+public:
+  /// Defines the execution mode.
+  enum ExecutionMode {
+    /// SPMD execution mode (all threads are worker threads).
+    EM_SPMD,
+    /// Non-SPMD execution mode (1 master thread, others are workers).
+    EM_NonSPMD,
+    /// Unknown execution mode (orphaned directive).
+    EM_Unknown,
+  };
 private:
  /// Parallel outlined function work for workers to execute.
  llvm::SmallVector<llvm::Function *, 16> Work;
@ -44,7 +54,7 @@ private:
    void createWorkerFunction(CodeGenModule &CGM);
  };

-  bool isInSpmdExecutionMode() const;
+  ExecutionMode getExecutionMode() const;

  /// Emit the worker function for the current target region.
  void emitWorkerFunction(WorkerFunctionState &WST);
@ -334,7 +344,7 @@ private:
  /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
  /// target region and used by containing directives such as 'parallel'
  /// to emit optimized code.
-  bool IsInSPMDExecutionMode = false;
+  ExecutionMode CurrentExecutionMode = EM_Unknown;

  /// true if we're emitting the code for the target region and next parallel
  /// region is L0 for sure.
--- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp
@ -58,6 +58,7 @@ tx ftemplate(int n) {
    #pragma omp critical
    ++a;
    }
+    ++a;
  }
  return a;
 }
--- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp
@ -9,8 +9,9 @@
 #define HEADER

 // Check that the execution mode of all 2 target regions is set to Generic Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0

 template<typename tx>
 tx ftemplate(int n) {
@ -33,6 +34,13 @@ tx ftemplate(int n) {
    aa = 1;
  }

+  #pragma omp target teams
+  {
+#pragma omp parallel
+#pragma omp parallel
+    aa = 1;
+  }
+
  return a;
 }

@ -44,14 +52,14 @@ int bar(int n){
  return a;
 }

-  // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l21}}_worker()
+  // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l22}}_worker()






-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker()
  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@ -86,7 +94,7 @@ int bar(int n){
  // CHECK: [[EXIT]]
  // CHECK: ret void

-  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]] [[A:%[^)]+]])
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]](i[[SZ:32|64]] [[A:%[^)]+]])
  // CHECK: store i[[SZ]] [[A]], i[[SZ]]* [[A_ADDR:%.+]], align
  // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i8*

@ -137,7 +145,7 @@ int bar(int n){



-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l32}}_worker()
  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@ -172,7 +180,7 @@ int bar(int n){
  // CHECK: [[EXIT]]
  // CHECK: ret void

-  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l31]](i[[SZ:32|64]] [[AA:%[^)]+]])
+  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l32]](i[[SZ:32|64]] [[AA:%[^)]+]])
  // CHECK: store i[[SZ]] [[AA]], i[[SZ]]* [[AA_ADDR:%.+]], align
  // CHECK: [[CONV:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*

@ -218,5 +226,24 @@ int bar(int n){
  // CHECK: [[EXIT]]
  // CHECK: ret void

+// CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37(
+// CHECK: call void @__kmpc_spmd_kernel_init(
+// CHECK: call i8* @__kmpc_data_sharing_push_stack(
+// CHECK-NOT: call void @__kmpc_serialized_parallel(
+// CHECK: call void [[L0:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.*}})
+// CHECK-NOT: call void @__kmpc_end_serialized_parallel(
+// CHECK: call void @__kmpc_data_sharing_pop_stack(
+// CHECK: call void @__kmpc_spmd_kernel_deinit()
+// CHECK: ret
+
+// CHECK: define internal void [[L0]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* dereferenceable
+// CHECK: call void @__kmpc_serialized_parallel(
+// CHECK: call void [[L1:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.+}})
+// CHECK: call void @__kmpc_end_serialized_parallel(
+// CHECK: ret void
+
+// CHECK: define internal void [[L1]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i16* dereferenceable
+// CHECK: store i16 1, i16* %
+// CHECK: ret void

 #endif
--- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
@ -13,7 +13,7 @@ int a;
 int foo(int *a);

 int main(int argc, char **argv) {
-#pragma omp target teams distribute parallel for map(tofrom:a) if(parallel:argc)
+#pragma omp target teams distribute parallel for map(tofrom:a) if(target:argc) schedule(static, a)
  for (int i= 0; i < argc; ++i)
    a = foo(&i) + foo(&a) + foo(&argc);
  return 0;
--- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp
@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -debug-info-kind=limited | FileCheck %s
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm-bc %s -o %t-ppc-host.bc -fopenmp-version=45
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -debug-info-kind=limited -fopenmp-version=45 | FileCheck %s
 // expected-no-diagnostics

 int main() {
@ -11,7 +11,7 @@ int main() {
  int c[10][10][10];
 #pragma omp target parallel firstprivate(a, b) map(tofrom          \
                                                   : c) map(tofrom \
-                                                            : bb) if (a)
+                                                            : bb) if (target:a)
  {
    int &f = c[1][1][1];
    int &g = a;
@ -54,7 +54,7 @@ int main() {
  return 0;
 }

-// CHECK: define internal void @__omp_offloading{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* noalias{{[^,]+}}, i1 {{[^)]+}})
+// CHECK: define internal void @__omp_offloading{{[^(]+}}([10 x [10 x [10 x i32]]] addrspace(1)* {{[^,]+}}, i32 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8 addrspace(1)* noalias{{[^,]+}})
 // CHECK: addrspacecast [10 x [10 x [10 x i32]]] addrspace(1)* %{{.+}} to [10 x [10 x [10 x i32]]]*
 // CHECK: call void [[NONDEBUG_WRAPPER:.+]](i32* {{[^,]+}}, i32* {{[^,]+}}, [10 x [10 x [10 x i32]]]* {{[^,]+}}, i64 {{[^,]+}}, [10 x [10 x i32]]* {{[^,]+}}, i8* {{[^)]+}})