From bcadb1f2e6afe51d5646c6e98faa14aa1a1c669c Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Tue, 18 Feb 2020 14:23:59 -0500
Subject: [PATCH] Revert "[CUDA][HIP][OpenMP] Emit deferred diagnostics by a
 post-parsing AST travese"

This reverts commit 1b978ddba05cb15e22b4e75adb5e7362ad861987.
---
 clang/include/clang/Sema/Sema.h               |  46 +++--
 clang/lib/Sema/Sema.cpp                       | 185 +++++++-----------
 clang/lib/Sema/SemaCUDA.cpp                   |  19 ++
 clang/lib/Sema/SemaDecl.cpp                   |  18 +-
 clang/lib/Sema/SemaExpr.cpp                   |  90 ++++++---
 clang/lib/Sema/SemaOpenMP.cpp                 | 158 ++++++++++++---
 clang/test/OpenMP/declare_target_messages.cpp |  12 +-
 .../nvptx_target_exceptions_messages.cpp      |   4 +-
 clang/test/SemaCUDA/bad-calls-on-same-line.cu |   4 +-
 .../test/SemaCUDA/call-device-fn-from-host.cu |   4 +-
 .../test/SemaCUDA/call-host-fn-from-device.cu |   4 +-
 clang/test/SemaCUDA/openmp-target.cu          |   4 +-
 clang/test/SemaCUDA/trace-through-global.cu   |   2 +-
 13 files changed, 349 insertions(+), 201 deletions(-)

diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 6a3add109b09..a6430353b241 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -1464,12 +1464,6 @@ public:
 
   void emitAndClearUnusedLocalTypedefWarnings();
 
-  // Emit all deferred diagnostics.
-  void emitDeferredDiags();
-  // Emit any deferred diagnostics for FD and erase them from the map in which
-  // they're stored.
-  void emitDeferredDiags(FunctionDecl *FD, bool ShowCallStack);
-
   enum TUFragmentKind {
     /// The global module fragment, between 'module;' and a module-declaration.
     Global,
@@ -3689,8 +3683,7 @@ public:
     TemplateDiscarded, // Discarded due to uninstantiated templates
     Unknown,
   };
-  FunctionEmissionStatus getEmissionStatus(FunctionDecl *Decl,
-                                           bool Final = false);
+  FunctionEmissionStatus getEmissionStatus(FunctionDecl *Decl);
 
   // Whether the callee should be ignored in CUDA/HIP/OpenMP host/device check.
   bool shouldIgnoreInHostDeviceCheck(FunctionDecl *Callee);
@@ -9684,10 +9677,22 @@ private:
   /// Pop OpenMP function region for non-capturing function.
   void popOpenMPFunctionRegion(const sema::FunctionScopeInfo *OldFSI);
 
+  /// Check whether we're allowed to call Callee from the current function.
+  void checkOpenMPDeviceFunction(SourceLocation Loc, FunctionDecl *Callee,
+                                 bool CheckForDelayedContext = true);
+
+  /// Check whether we're allowed to call Callee from the current function.
+  void checkOpenMPHostFunction(SourceLocation Loc, FunctionDecl *Callee,
+                               bool CheckCaller = true);
+
   /// Check if the expression is allowed to be used in expressions for the
   /// OpenMP devices.
   void checkOpenMPDeviceExpr(const Expr *E);
 
+  /// Finishes analysis of the deferred functions calls that may be declared as
+  /// host/nohost during device/host compilation.
+  void finalizeOpenMPDelayedAnalysis();
+
   /// Checks if a type or a declaration is disabled due to the owning extension
   /// being disabled, and emits diagnostic messages if it is disabled.
   /// \param D type or declaration to be checked.
@@ -9870,11 +9875,6 @@ public:
   void
   checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D,
                                    SourceLocation IdLoc = SourceLocation());
-  /// Finishes analysis of the deferred functions calls that may be declared as
-  /// host/nohost during device/host compilation.
-  void finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller,
-                                     const FunctionDecl *Callee,
-                                     SourceLocation Loc);
   /// Return true inside OpenMP declare target region.
   bool isInOpenMPDeclareTargetContext() const {
     return DeclareTargetNestingLevel > 0;
@@ -11223,6 +11223,18 @@ public:
                  /* Caller = */ FunctionDeclAndLoc>
       DeviceKnownEmittedFns;
 
+  /// A partial call graph maintained during CUDA/OpenMP device code compilation
+  /// to support deferred diagnostics.
+  ///
+  /// Functions are only added here if, at the time they're considered, they are
+  /// not known-emitted.  As soon as we discover that a function is
+  /// known-emitted, we remove it and everything it transitively calls from this
+  /// set and add those functions to DeviceKnownEmittedFns.
+  llvm::DenseMap</* Caller = */ CanonicalDeclPtr<FunctionDecl>,
+                 /* Callees = */ llvm::MapVector<CanonicalDeclPtr<FunctionDecl>,
+                                                 SourceLocation>>
+      DeviceCallGraph;
+
   /// Diagnostic builder for CUDA/OpenMP devices errors which may or may not be
   /// deferred.
   ///
@@ -11297,6 +11309,14 @@ public:
     llvm::Optional<unsigned> PartialDiagId;
   };
 
+  /// Indicate that this function (and thus everything it transtively calls)
+  /// will be codegen'ed, and emit any deferred diagnostics on this function and
+  /// its (transitive) callees.
+  void markKnownEmitted(
+      Sema &S, FunctionDecl *OrigCaller, FunctionDecl *OrigCallee,
+      SourceLocation OrigLoc,
+      const llvm::function_ref<bool(Sema &, FunctionDecl *)> IsKnownEmitted);
+
   /// Creates a DeviceDiagBuilder that emits the diagnostic if the current context
   /// is "used as device code".
   ///
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index c3c2baa3e904..e24db7190496 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "UsedDeclVisitor.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTDiagnostic.h"
 #include "clang/AST/DeclCXX.h"
@@ -955,7 +954,9 @@ void Sema::ActOnEndOfTranslationUnitFragment(TUFragmentKind Kind) {
     PerformPendingInstantiations();
   }
 
-  emitDeferredDiags();
+  // Finalize analysis of OpenMP-specific constructs.
+  if (LangOpts.OpenMP)
+    finalizeOpenMPDelayedAnalysis();
 
   assert(LateParsedInstantiations.empty() &&
          "end of TU template instantiation should not create more "
@@ -1450,128 +1451,27 @@ static void emitCallStackNotes(Sema &S, FunctionDecl *FD) {
 
 // Emit any deferred diagnostics for FD and erase them from the map in which
 // they're stored.
-void Sema::emitDeferredDiags(FunctionDecl *FD, bool ShowCallStack) {
-  auto It = DeviceDeferredDiags.find(FD);
-  if (It == DeviceDeferredDiags.end())
+static void emitDeferredDiags(Sema &S, FunctionDecl *FD, bool ShowCallStack) {
+  auto It = S.DeviceDeferredDiags.find(FD);
+  if (It == S.DeviceDeferredDiags.end())
     return;
   bool HasWarningOrError = false;
   for (PartialDiagnosticAt &PDAt : It->second) {
     const SourceLocation &Loc = PDAt.first;
     const PartialDiagnostic &PD = PDAt.second;
-    HasWarningOrError |= getDiagnostics().getDiagnosticLevel(
+    HasWarningOrError |= S.getDiagnostics().getDiagnosticLevel(
                              PD.getDiagID(), Loc) >= DiagnosticsEngine::Warning;
-    DiagnosticBuilder Builder(Diags.Report(Loc, PD.getDiagID()));
+    DiagnosticBuilder Builder(S.Diags.Report(Loc, PD.getDiagID()));
     Builder.setForceEmit();
     PD.Emit(Builder);
   }
+  S.DeviceDeferredDiags.erase(It);
 
   // FIXME: Should this be called after every warning/error emitted in the loop
   // above, instead of just once per function?  That would be consistent with
   // how we handle immediate errors, but it also seems like a bit much.
   if (HasWarningOrError && ShowCallStack)
-    emitCallStackNotes(*this, FD);
-}
-
-namespace {
-/// Helper class that emits deferred diagnostic messages if an entity directly
-/// or indirectly using the function that causes the deferred diagnostic
-/// messages is known to be emitted.
-class DeferredDiagnosticsEmitter
-    : public UsedDeclVisitor<DeferredDiagnosticsEmitter> {
-public:
-  typedef UsedDeclVisitor<DeferredDiagnosticsEmitter> Inherited;
-  llvm::SmallSet<CanonicalDeclPtr<Decl>, 4> Visited;
-  llvm::SmallVector<CanonicalDeclPtr<FunctionDecl>, 4> UseStack;
-  bool ShouldEmit;
-  unsigned InOMPDeviceContext;
-
-  DeferredDiagnosticsEmitter(Sema &S)
-      : Inherited(S), ShouldEmit(false), InOMPDeviceContext(0) {}
-
-  void VisitDeclRefExpr(DeclRefExpr *E) {
-    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(E->getDecl())) {
-      visitUsedDecl(E->getLocation(), FD);
-    }
-  }
-
-  void VisitMemberExpr(MemberExpr *E) {
-    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(E->getMemberDecl()))
-      visitUsedDecl(E->getMemberLoc(), FD);
-  }
-
-  void VisitOMPTargetDirective(OMPTargetDirective *Node) {
-    ++InOMPDeviceContext;
-    Inherited::VisitOMPTargetDirective(Node);
-    --InOMPDeviceContext;
-  }
-
-  void VisitCapturedStmt(CapturedStmt *Node) {
-    visitUsedDecl(Node->getBeginLoc(), Node->getCapturedDecl());
-    Inherited::VisitCapturedStmt(Node);
-  }
-
-  void visitUsedDecl(SourceLocation Loc, Decl *D) {
-    if (auto *TD = dyn_cast<TranslationUnitDecl>(D)) {
-      for (auto *DD : TD->decls()) {
-        visitUsedDecl(Loc, DD);
-      }
-    } else if (auto *FTD = dyn_cast<FunctionTemplateDecl>(D)) {
-      for (auto *DD : FTD->specializations()) {
-        visitUsedDecl(Loc, DD);
-      }
-    } else if (auto *FD = dyn_cast<FunctionDecl>(D)) {
-      FunctionDecl *Caller = UseStack.empty() ? nullptr : UseStack.back();
-      auto IsKnownEmitted = S.getEmissionStatus(FD, /*Final=*/true) ==
-                            Sema::FunctionEmissionStatus::Emitted;
-      if (!Caller)
-        ShouldEmit = IsKnownEmitted;
-      if ((!ShouldEmit && !S.getLangOpts().OpenMP && !Caller) ||
-          S.shouldIgnoreInHostDeviceCheck(FD) || Visited.count(D))
-        return;
-      // Finalize analysis of OpenMP-specific constructs.
-      if (Caller && S.LangOpts.OpenMP && UseStack.size() == 1)
-        S.finalizeOpenMPDelayedAnalysis(Caller, FD, Loc);
-      if (Caller)
-        S.DeviceKnownEmittedFns[FD] = {Caller, Loc};
-      if (ShouldEmit || InOMPDeviceContext)
-        S.emitDeferredDiags(FD, Caller);
-      Visited.insert(D);
-      UseStack.push_back(FD);
-      if (auto *S = FD->getBody()) {
-        this->Visit(S);
-      }
-      UseStack.pop_back();
-      Visited.erase(D);
-    } else if (auto *RD = dyn_cast<RecordDecl>(D)) {
-      for (auto *DD : RD->decls()) {
-        visitUsedDecl(Loc, DD);
-      }
-    } else if (auto *CD = dyn_cast<CapturedDecl>(D)) {
-      if (auto *S = CD->getBody()) {
-        this->Visit(S);
-      }
-    } else if (auto *VD = dyn_cast<VarDecl>(D)) {
-      if (auto *Init = VD->getInit()) {
-        auto DevTy = OMPDeclareTargetDeclAttr::getDeviceType(VD);
-        bool IsDev = DevTy && (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost ||
-                               *DevTy == OMPDeclareTargetDeclAttr::DT_Any);
-        if (IsDev)
-          ++InOMPDeviceContext;
-        this->Visit(Init);
-        if (IsDev)
-          --InOMPDeviceContext;
-      }
-    }
-  }
-};
-} // namespace
-
-void Sema::emitDeferredDiags() {
-  if (DeviceDeferredDiags.empty() && !LangOpts.OpenMP)
-    return;
-
-  DeferredDiagnosticsEmitter(*this).visitUsedDecl(
-      SourceLocation(), Context.getTranslationUnitDecl());
+    emitCallStackNotes(S, FD);
 }
 
 // In CUDA, there are some constructs which may appear in semantically-valid
@@ -1644,6 +1544,71 @@ Sema::DeviceDiagBuilder::~DeviceDiagBuilder() {
   }
 }
 
+// Indicate that this function (and thus everything it transtively calls) will
+// be codegen'ed, and emit any deferred diagnostics on this function and its
+// (transitive) callees.
+void Sema::markKnownEmitted(
+    Sema &S, FunctionDecl *OrigCaller, FunctionDecl *OrigCallee,
+    SourceLocation OrigLoc,
+    const llvm::function_ref<bool(Sema &, FunctionDecl *)> IsKnownEmitted) {
+  // Nothing to do if we already know that FD is emitted.
+  if (IsKnownEmitted(S, OrigCallee)) {
+    assert(!S.DeviceCallGraph.count(OrigCallee));
+    return;
+  }
+
+  // We've just discovered that OrigCallee is known-emitted.  Walk our call
+  // graph to see what else we can now discover also must be emitted.
+
+  struct CallInfo {
+    FunctionDecl *Caller;
+    FunctionDecl *Callee;
+    SourceLocation Loc;
+  };
+  llvm::SmallVector<CallInfo, 4> Worklist = {{OrigCaller, OrigCallee, OrigLoc}};
+  llvm::SmallSet<CanonicalDeclPtr<FunctionDecl>, 4> Seen;
+  Seen.insert(OrigCallee);
+  while (!Worklist.empty()) {
+    CallInfo C = Worklist.pop_back_val();
+    assert(!IsKnownEmitted(S, C.Callee) &&
+           "Worklist should not contain known-emitted functions.");
+    S.DeviceKnownEmittedFns[C.Callee] = {C.Caller, C.Loc};
+    emitDeferredDiags(S, C.Callee, C.Caller);
+
+    // If this is a template instantiation, explore its callgraph as well:
+    // Non-dependent calls are part of the template's callgraph, while dependent
+    // calls are part of to the instantiation's call graph.
+    if (auto *Templ = C.Callee->getPrimaryTemplate()) {
+      FunctionDecl *TemplFD = Templ->getAsFunction();
+      if (!Seen.count(TemplFD) && !S.DeviceKnownEmittedFns.count(TemplFD)) {
+        Seen.insert(TemplFD);
+        Worklist.push_back(
+            {/* Caller = */ C.Caller, /* Callee = */ TemplFD, C.Loc});
+      }
+    }
+
+    // Add all functions called by Callee to our worklist.
+    auto CGIt = S.DeviceCallGraph.find(C.Callee);
+    if (CGIt == S.DeviceCallGraph.end())
+      continue;
+
+    for (std::pair<CanonicalDeclPtr<FunctionDecl>, SourceLocation> FDLoc :
+         CGIt->second) {
+      FunctionDecl *NewCallee = FDLoc.first;
+      SourceLocation CallLoc = FDLoc.second;
+      if (Seen.count(NewCallee) || IsKnownEmitted(S, NewCallee))
+        continue;
+      Seen.insert(NewCallee);
+      Worklist.push_back(
+          {/* Caller = */ C.Callee, /* Callee = */ NewCallee, CallLoc});
+    }
+
+    // C.Callee is now known-emitted, so we no longer need to maintain its list
+    // of callees in DeviceCallGraph.
+    S.DeviceCallGraph.erase(CGIt);
+  }
+}
+
 Sema::DeviceDiagBuilder Sema::targetDiag(SourceLocation Loc, unsigned DiagID) {
   if (LangOpts.OpenMP)
     return LangOpts.OpenMPIsDevice ? diagIfOpenMPDeviceCode(Loc, DiagID)
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 13b72c4c8574..0c61057e1072 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -674,6 +674,25 @@ bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
   // Otherwise, mark the call in our call graph so we can traverse it later.
   bool CallerKnownEmitted =
       getEmissionStatus(Caller) == FunctionEmissionStatus::Emitted;
+  if (CallerKnownEmitted) {
+    // Host-side references to a __global__ function refer to the stub, so the
+    // function itself is never emitted and therefore should not be marked.
+    if (!shouldIgnoreInHostDeviceCheck(Callee))
+      markKnownEmitted(
+          *this, Caller, Callee, Loc, [](Sema &S, FunctionDecl *FD) {
+            return S.getEmissionStatus(FD) == FunctionEmissionStatus::Emitted;
+          });
+  } else {
+    // If we have
+    //   host fn calls kernel fn calls host+device,
+    // the HD function does not get instantiated on the host.  We model this by
+    // omitting at the call to the kernel from the callgraph.  This ensures
+    // that, when compiling for host, only HD functions actually called from the
+    // host get marked as known-emitted.
+    if (!shouldIgnoreInHostDeviceCheck(Callee))
+      DeviceCallGraph[Caller].insert({Callee, Loc});
+  }
+
   DeviceDiagBuilder::Kind DiagKind = [this, Caller, Callee,
                                       CallerKnownEmitted] {
     switch (IdentifyCUDAPreference(Caller, Callee)) {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index d6b6b0603346..4c088aa47f55 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -17929,8 +17929,7 @@ Decl *Sema::getObjCDeclContext() const {
   return (dyn_cast_or_null<ObjCContainerDecl>(CurContext));
 }
 
-Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD,
-                                                     bool Final) {
+Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD) {
   // Templates are emitted when they're instantiated.
   if (FD->isDependentContext())
     return FunctionEmissionStatus::TemplateDiscarded;
@@ -17942,10 +17941,8 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD,
     if (DevTy.hasValue()) {
       if (*DevTy == OMPDeclareTargetDeclAttr::DT_Host)
         OMPES = FunctionEmissionStatus::OMPDiscarded;
-      else if (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost ||
-               *DevTy == OMPDeclareTargetDeclAttr::DT_Any) {
+      else if (DeviceKnownEmittedFns.count(FD) > 0)
         OMPES = FunctionEmissionStatus::Emitted;
-      }
     }
   } else if (LangOpts.OpenMP) {
     // In OpenMP 4.5 all the functions are host functions.
@@ -17961,11 +17958,10 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD,
       if (DevTy.hasValue()) {
         if (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) {
           OMPES = FunctionEmissionStatus::OMPDiscarded;
-        } else if (*DevTy == OMPDeclareTargetDeclAttr::DT_Host ||
-                   *DevTy == OMPDeclareTargetDeclAttr::DT_Any)
+        } else if (DeviceKnownEmittedFns.count(FD) > 0) {
           OMPES = FunctionEmissionStatus::Emitted;
-      } else if (Final)
-        OMPES = FunctionEmissionStatus::Emitted;
+        }
+      }
     }
   }
   if (OMPES == FunctionEmissionStatus::OMPDiscarded ||
@@ -18000,7 +17996,9 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD,
 
   // Otherwise, the function is known-emitted if it's in our set of
   // known-emitted functions.
-  return FunctionEmissionStatus::Unknown;
+  return (DeviceKnownEmittedFns.count(FD) > 0)
+             ? FunctionEmissionStatus::Emitted
+             : FunctionEmissionStatus::Unknown;
 }
 
 bool Sema::shouldIgnoreInHostDeviceCheck(FunctionDecl *Callee) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 591d079784fe..9485906cf243 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "TreeTransform.h"
-#include "UsedDeclVisitor.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTLambda.h"
@@ -15899,8 +15898,13 @@ void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func,
     Func->markUsed(Context);
   }
 
-  if (LangOpts.OpenMP)
+  if (LangOpts.OpenMP) {
     markOpenMPDeclareVariantFuncsReferenced(Loc, Func, MightBeOdrUse);
+    if (LangOpts.OpenMPIsDevice)
+      checkOpenMPDeviceFunction(Loc, Func);
+    else
+      checkOpenMPHostFunction(Loc, Func);
+  }
 }
 
 /// Directly mark a variable odr-used. Given a choice, prefer to use
@@ -17292,33 +17296,71 @@ void Sema::MarkDeclarationsReferencedInType(SourceLocation Loc, QualType T) {
 }
 
 namespace {
-/// Helper class that marks all of the declarations referenced by
-/// potentially-evaluated subexpressions as "referenced".
-class EvaluatedExprMarker : public UsedDeclVisitor<EvaluatedExprMarker> {
-public:
-  typedef UsedDeclVisitor<EvaluatedExprMarker> Inherited;
-  bool SkipLocalVariables;
+  /// Helper class that marks all of the declarations referenced by
+  /// potentially-evaluated subexpressions as "referenced".
+  class EvaluatedExprMarker : public EvaluatedExprVisitor<EvaluatedExprMarker> {
+    Sema &S;
+    bool SkipLocalVariables;
 
-  EvaluatedExprMarker(Sema &S, bool SkipLocalVariables)
-      : Inherited(S), SkipLocalVariables(SkipLocalVariables) {}
+  public:
+    typedef EvaluatedExprVisitor<EvaluatedExprMarker> Inherited;
 
-  void visitUsedDecl(SourceLocation Loc, Decl *D) {
-    S.MarkFunctionReferenced(Loc, cast<FunctionDecl>(D));
-  }
+    EvaluatedExprMarker(Sema &S, bool SkipLocalVariables)
+      : Inherited(S.Context), S(S), SkipLocalVariables(SkipLocalVariables) { }
 
-  void VisitDeclRefExpr(DeclRefExpr *E) {
-    // If we were asked not to visit local variables, don't.
-    if (SkipLocalVariables) {
-      if (VarDecl *VD = dyn_cast<VarDecl>(E->getDecl()))
-        if (VD->hasLocalStorage())
-          return;
+    void VisitDeclRefExpr(DeclRefExpr *E) {
+      // If we were asked not to visit local variables, don't.
+      if (SkipLocalVariables) {
+        if (VarDecl *VD = dyn_cast<VarDecl>(E->getDecl()))
+          if (VD->hasLocalStorage())
+            return;
+      }
+
+      S.MarkDeclRefReferenced(E);
     }
-    S.MarkDeclRefReferenced(E);
-  }
 
-  void VisitMemberExpr(MemberExpr *E) { S.MarkMemberReferenced(E); }
-};
-} // namespace
+    void VisitMemberExpr(MemberExpr *E) {
+      S.MarkMemberReferenced(E);
+      Inherited::VisitMemberExpr(E);
+    }
+
+    void VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E) {
+      S.MarkFunctionReferenced(
+          E->getBeginLoc(),
+          const_cast<CXXDestructorDecl *>(E->getTemporary()->getDestructor()));
+      Visit(E->getSubExpr());
+    }
+
+    void VisitCXXNewExpr(CXXNewExpr *E) {
+      if (E->getOperatorNew())
+        S.MarkFunctionReferenced(E->getBeginLoc(), E->getOperatorNew());
+      if (E->getOperatorDelete())
+        S.MarkFunctionReferenced(E->getBeginLoc(), E->getOperatorDelete());
+      Inherited::VisitCXXNewExpr(E);
+    }
+
+    void VisitCXXDeleteExpr(CXXDeleteExpr *E) {
+      if (E->getOperatorDelete())
+        S.MarkFunctionReferenced(E->getBeginLoc(), E->getOperatorDelete());
+      QualType Destroyed = S.Context.getBaseElementType(E->getDestroyedType());
+      if (const RecordType *DestroyedRec = Destroyed->getAs<RecordType>()) {
+        CXXRecordDecl *Record = cast<CXXRecordDecl>(DestroyedRec->getDecl());
+        S.MarkFunctionReferenced(E->getBeginLoc(), S.LookupDestructor(Record));
+      }
+
+      Inherited::VisitCXXDeleteExpr(E);
+    }
+
+    void VisitCXXConstructExpr(CXXConstructExpr *E) {
+      S.MarkFunctionReferenced(E->getBeginLoc(), E->getConstructor());
+      Inherited::VisitCXXConstructExpr(E);
+    }
+
+    void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *E) {
+      Visit(E->getExpr());
+    }
+  };
+}
 
 /// Mark any declarations that appear within this expression or any
 /// potentially-evaluated subexpressions as "referenced".
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index b6afabda578d..ea1011067130 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -1710,6 +1710,92 @@ Sema::DeviceDiagBuilder Sema::diagIfOpenMPHostCode(SourceLocation Loc,
   return DeviceDiagBuilder(Kind, Loc, DiagID, getCurFunctionDecl(), *this);
 }
 
+void Sema::checkOpenMPDeviceFunction(SourceLocation Loc, FunctionDecl *Callee,
+                                     bool CheckForDelayedContext) {
+  assert(LangOpts.OpenMP && LangOpts.OpenMPIsDevice &&
+         "Expected OpenMP device compilation.");
+  assert(Callee && "Callee may not be null.");
+  Callee = Callee->getMostRecentDecl();
+  FunctionDecl *Caller = getCurFunctionDecl();
+
+  // host only function are not available on the device.
+  if (Caller) {
+    FunctionEmissionStatus CallerS = getEmissionStatus(Caller);
+    FunctionEmissionStatus CalleeS = getEmissionStatus(Callee);
+    assert(CallerS != FunctionEmissionStatus::CUDADiscarded &&
+           CalleeS != FunctionEmissionStatus::CUDADiscarded &&
+           "CUDADiscarded unexpected in OpenMP device function check");
+    if ((CallerS == FunctionEmissionStatus::Emitted ||
+         (!isOpenMPDeviceDelayedContext(*this) &&
+          CallerS == FunctionEmissionStatus::Unknown)) &&
+        CalleeS == FunctionEmissionStatus::OMPDiscarded) {
+      StringRef HostDevTy = getOpenMPSimpleClauseTypeName(
+          OMPC_device_type, OMPC_DEVICE_TYPE_host);
+      Diag(Loc, diag::err_omp_wrong_device_function_call) << HostDevTy << 0;
+      Diag(Callee->getAttr<OMPDeclareTargetDeclAttr>()->getLocation(),
+           diag::note_omp_marked_device_type_here)
+          << HostDevTy;
+      return;
+    }
+  }
+  // If the caller is known-emitted, mark the callee as known-emitted.
+  // Otherwise, mark the call in our call graph so we can traverse it later.
+  if ((CheckForDelayedContext && !isOpenMPDeviceDelayedContext(*this)) ||
+      (!Caller && !CheckForDelayedContext) ||
+      (Caller && getEmissionStatus(Caller) == FunctionEmissionStatus::Emitted))
+    markKnownEmitted(*this, Caller, Callee, Loc,
+                     [CheckForDelayedContext](Sema &S, FunctionDecl *FD) {
+                       return CheckForDelayedContext &&
+                              S.getEmissionStatus(FD) ==
+                                  FunctionEmissionStatus::Emitted;
+                     });
+  else if (Caller)
+    DeviceCallGraph[Caller].insert({Callee, Loc});
+}
+
+void Sema::checkOpenMPHostFunction(SourceLocation Loc, FunctionDecl *Callee,
+                                   bool CheckCaller) {
+  assert(LangOpts.OpenMP && !LangOpts.OpenMPIsDevice &&
+         "Expected OpenMP host compilation.");
+  assert(Callee && "Callee may not be null.");
+  Callee = Callee->getMostRecentDecl();
+  FunctionDecl *Caller = getCurFunctionDecl();
+
+  // device only function are not available on the host.
+  if (Caller) {
+    FunctionEmissionStatus CallerS = getEmissionStatus(Caller);
+    FunctionEmissionStatus CalleeS = getEmissionStatus(Callee);
+    assert(
+        (LangOpts.CUDA || (CallerS != FunctionEmissionStatus::CUDADiscarded &&
+                           CalleeS != FunctionEmissionStatus::CUDADiscarded)) &&
+        "CUDADiscarded unexpected in OpenMP host function check");
+    if (CallerS == FunctionEmissionStatus::Emitted &&
+        CalleeS == FunctionEmissionStatus::OMPDiscarded) {
+      StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName(
+          OMPC_device_type, OMPC_DEVICE_TYPE_nohost);
+      Diag(Loc, diag::err_omp_wrong_device_function_call) << NoHostDevTy << 1;
+      Diag(Callee->getAttr<OMPDeclareTargetDeclAttr>()->getLocation(),
+           diag::note_omp_marked_device_type_here)
+          << NoHostDevTy;
+      return;
+    }
+  }
+  // If the caller is known-emitted, mark the callee as known-emitted.
+  // Otherwise, mark the call in our call graph so we can traverse it later.
+  if (!shouldIgnoreInHostDeviceCheck(Callee)) {
+    if ((!CheckCaller && !Caller) ||
+        (Caller &&
+         getEmissionStatus(Caller) == FunctionEmissionStatus::Emitted))
+      markKnownEmitted(
+          *this, Caller, Callee, Loc, [CheckCaller](Sema &S, FunctionDecl *FD) {
+            return CheckCaller &&
+                   S.getEmissionStatus(FD) == FunctionEmissionStatus::Emitted;
+          });
+    else if (Caller)
+      DeviceCallGraph[Caller].insert({Callee, Loc});
+  }
+}
+
 void Sema::checkOpenMPDeviceExpr(const Expr *E) {
   assert(getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice &&
          "OpenMP device compilation mode is expected.");
@@ -2122,43 +2208,52 @@ bool Sema::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level,
 
 void Sema::DestroyDataSharingAttributesStack() { delete DSAStack; }
 
-void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller,
-                                         const FunctionDecl *Callee,
-                                         SourceLocation Loc) {
+void Sema::finalizeOpenMPDelayedAnalysis() {
   assert(LangOpts.OpenMP && "Expected OpenMP compilation mode.");
-  Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
-      OMPDeclareTargetDeclAttr::getDeviceType(Caller->getMostRecentDecl());
-  // Ignore host functions during device analyzis.
-  if (LangOpts.OpenMPIsDevice && DevTy &&
-      *DevTy == OMPDeclareTargetDeclAttr::DT_Host)
-    return;
-  // Ignore nohost functions during host analyzis.
-  if (!LangOpts.OpenMPIsDevice && DevTy &&
-      *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost)
-    return;
-  const FunctionDecl *FD = Callee->getMostRecentDecl();
-  DevTy = OMPDeclareTargetDeclAttr::getDeviceType(FD);
-  if (LangOpts.OpenMPIsDevice && DevTy &&
-      *DevTy == OMPDeclareTargetDeclAttr::DT_Host) {
-    // Diagnose host function called during device codegen.
-    StringRef HostDevTy =
-        getOpenMPSimpleClauseTypeName(OMPC_device_type, OMPC_DEVICE_TYPE_host);
-    Diag(Loc, diag::err_omp_wrong_device_function_call) << HostDevTy << 0;
-    Diag(FD->getAttr<OMPDeclareTargetDeclAttr>()->getLocation(),
-         diag::note_omp_marked_device_type_here)
-        << HostDevTy;
-    return;
-  }
+  // Diagnose implicit declare target functions and their callees.
+  for (const auto &CallerCallees : DeviceCallGraph) {
+    Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
+        OMPDeclareTargetDeclAttr::getDeviceType(
+            CallerCallees.getFirst()->getMostRecentDecl());
+    // Ignore host functions during device analyzis.
+    if (LangOpts.OpenMPIsDevice && DevTy &&
+        *DevTy == OMPDeclareTargetDeclAttr::DT_Host)
+      continue;
+    // Ignore nohost functions during host analyzis.
+    if (!LangOpts.OpenMPIsDevice && DevTy &&
+        *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost)
+      continue;
+    for (const std::pair<CanonicalDeclPtr<FunctionDecl>, SourceLocation>
+             &Callee : CallerCallees.getSecond()) {
+      const FunctionDecl *FD = Callee.first->getMostRecentDecl();
+      Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
+          OMPDeclareTargetDeclAttr::getDeviceType(FD);
+      if (LangOpts.OpenMPIsDevice && DevTy &&
+          *DevTy == OMPDeclareTargetDeclAttr::DT_Host) {
+        // Diagnose host function called during device codegen.
+        StringRef HostDevTy = getOpenMPSimpleClauseTypeName(
+            OMPC_device_type, OMPC_DEVICE_TYPE_host);
+        Diag(Callee.second, diag::err_omp_wrong_device_function_call)
+            << HostDevTy << 0;
+        Diag(FD->getAttr<OMPDeclareTargetDeclAttr>()->getLocation(),
+             diag::note_omp_marked_device_type_here)
+            << HostDevTy;
+        continue;
+      }
       if (!LangOpts.OpenMPIsDevice && DevTy &&
           *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) {
         // Diagnose nohost function called during host codegen.
         StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName(
             OMPC_device_type, OMPC_DEVICE_TYPE_nohost);
-        Diag(Loc, diag::err_omp_wrong_device_function_call) << NoHostDevTy << 1;
+        Diag(Callee.second, diag::err_omp_wrong_device_function_call)
+            << NoHostDevTy << 1;
         Diag(FD->getAttr<OMPDeclareTargetDeclAttr>()->getLocation(),
              diag::note_omp_marked_device_type_here)
             << NoHostDevTy;
+        continue;
       }
+    }
+  }
 }
 
 void Sema::StartOpenMPDSABlock(OpenMPDirectiveKind DKind,
@@ -17077,6 +17172,15 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D,
       Diag(FD->getLocation(), diag::note_defined_here) << FD;
       return;
     }
+    // Mark the function as must be emitted for the device.
+    Optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
+        OMPDeclareTargetDeclAttr::getDeviceType(FD);
+    if (LangOpts.OpenMPIsDevice && Res.hasValue() && IdLoc.isValid() &&
+        *DevTy != OMPDeclareTargetDeclAttr::DT_Host)
+      checkOpenMPDeviceFunction(IdLoc, FD, /*CheckForDelayedContext=*/false);
+    if (!LangOpts.OpenMPIsDevice && Res.hasValue() && IdLoc.isValid() &&
+        *DevTy != OMPDeclareTargetDeclAttr::DT_NoHost)
+      checkOpenMPHostFunction(IdLoc, FD, /*CheckCaller=*/false);
   }
   if (auto *VD = dyn_cast<ValueDecl>(D)) {
     // Problem if any with var declared with incomplete type will be reported
diff --git a/clang/test/OpenMP/declare_target_messages.cpp b/clang/test/OpenMP/declare_target_messages.cpp
index 1a371d699789..cc6558debde6 100644
--- a/clang/test/OpenMP/declare_target_messages.cpp
+++ b/clang/test/OpenMP/declare_target_messages.cpp
@@ -162,17 +162,17 @@ namespace {
 #pragma omp declare target link(x) // expected-error {{'x' must not appear in both clauses 'to' and 'link'}}
 
 void bazz() {}
-#pragma omp declare target to(bazz) device_type(nohost) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} host5-note 3{{marked as 'device_type(nohost)' here}}
+#pragma omp declare target to(bazz) device_type(nohost) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} host5-note {{marked as 'device_type(nohost)' here}}
 void bazzz() {bazz();}
 #pragma omp declare target to(bazzz) device_type(nohost) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}}
 void any() {bazz();} // host5-error {{function with 'device_type(nohost)' is not available on host}}
-void host1() {bazz();} // host5-error {{function with 'device_type(nohost)' is not available on host}}
-#pragma omp declare target to(host1) device_type(host) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} dev5-note 4 {{marked as 'device_type(host)' here}}
-void host2() {bazz();} //host5-error {{function with 'device_type(nohost)' is not available on host}}
+void host1() {bazz();}
+#pragma omp declare target to(host1) device_type(host) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} dev5-note 2 {{marked as 'device_type(host)' here}}
+void host2() {bazz();}
 #pragma omp declare target to(host2)
-void device() {host1();} // dev5-error {{function with 'device_type(host)' is not available on device}}
+void device() {host1();}
 #pragma omp declare target to(device) device_type(nohost) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} host5-note 2 {{marked as 'device_type(nohost)' here}}
-void host3() {host1();} // dev5-error {{function with 'device_type(host)' is not available on device}}
+void host3() {host1();}
 #pragma omp declare target to(host3)
 
 #pragma omp declare target
diff --git a/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp b/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp
index faff77e0a43b..433ba13f73d6 100644
--- a/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp
+++ b/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp
@@ -38,7 +38,7 @@ int d;
 #pragma omp end declare target
 int c;
 
-int bar() { return 1 + foo() + bar() + baz1() + baz2(); } // expected-note {{called by 'bar'}}
+int bar() { return 1 + foo() + bar() + baz1() + baz2(); }
 
 int maini1() {
   int a;
@@ -49,7 +49,7 @@ int maini1() {
   {
     S s(a);
     static long aaa = 23;
-    a = foo() + bar() + b + c + d + aa + aaa + FA<int>(); // expected-note{{called by 'maini1'}}
+    a = foo() + bar() + b + c + d + aa + aaa + FA<int>();
     if (!a)
       throw "Error"; // expected-error {{cannot use 'throw' with exceptions disabled}}
   }
diff --git a/clang/test/SemaCUDA/bad-calls-on-same-line.cu b/clang/test/SemaCUDA/bad-calls-on-same-line.cu
index 941452470dc7..67923323a94f 100644
--- a/clang/test/SemaCUDA/bad-calls-on-same-line.cu
+++ b/clang/test/SemaCUDA/bad-calls-on-same-line.cu
@@ -33,8 +33,8 @@ inline __host__ __device__ void hd() {
 
 void host_fn() {
   hd<int>();
-  hd<double>();
+  hd<double>();  // expected-note {{function template specialization 'hd<double>'}}
   // expected-note@-1 {{called by 'host_fn'}}
-  hd<float>();
+  hd<float>();  // expected-note {{function template specialization 'hd<float>'}}
   // expected-note@-1 {{called by 'host_fn'}}
 }
diff --git a/clang/test/SemaCUDA/call-device-fn-from-host.cu b/clang/test/SemaCUDA/call-device-fn-from-host.cu
index 4d66fccd84d5..5d506d65ea58 100644
--- a/clang/test/SemaCUDA/call-device-fn-from-host.cu
+++ b/clang/test/SemaCUDA/call-device-fn-from-host.cu
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 %s --std=c++11 -triple x86_64-unknown-linux -emit-llvm -o - \
 // RUN:   -verify -verify-ignore-unexpected=note
 // RUN: %clang_cc1 %s --std=c++11 -triple x86_64-unknown-linux -emit-llvm -o - \
-// RUN:   -verify=expected,omp -verify-ignore-unexpected=note -fopenmp
+// RUN:   -verify -verify-ignore-unexpected=note -fopenmp
 
 // Note: This test won't work with -fsyntax-only, because some of these errors
 // are emitted during codegen.
@@ -39,7 +39,7 @@ __host__ __device__ void T::hd3() {
 }
 
 template <typename T> __host__ __device__ void hd2() { device_fn(); }
-// expected-error@-1 {{reference to __device__ function 'device_fn' in __host__ __device__ function}}
+// expected-error@-1 2 {{reference to __device__ function 'device_fn' in __host__ __device__ function}}
 void host_fn() { hd2<int>(); }
 
 __host__ __device__ void hd() { device_fn(); }
diff --git a/clang/test/SemaCUDA/call-host-fn-from-device.cu b/clang/test/SemaCUDA/call-host-fn-from-device.cu
index acdd291b6645..c5bbd63d8e06 100644
--- a/clang/test/SemaCUDA/call-host-fn-from-device.cu
+++ b/clang/test/SemaCUDA/call-host-fn-from-device.cu
@@ -56,14 +56,14 @@ __host__ __device__ void T::hd3() {
 }
 
 template <typename T> __host__ __device__ void hd2() { host_fn(); }
-// expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
+// expected-error@-1 2 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
 __global__ void kernel() { hd2<int>(); }
 
 __host__ __device__ void hd() { host_fn(); }
 // expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
 
 template <typename T> __host__ __device__ void hd3() { host_fn(); }
-// expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
+// expected-error@-1 2 {{reference to __host__ function 'host_fn' in __host__ __device__ function}}
 __device__ void device_fn() { hd3<int>(); }
 
 // No error because this is never instantiated.
diff --git a/clang/test/SemaCUDA/openmp-target.cu b/clang/test/SemaCUDA/openmp-target.cu
index c32aed44fb62..2775dc1e2c5b 100644
--- a/clang/test/SemaCUDA/openmp-target.cu
+++ b/clang/test/SemaCUDA/openmp-target.cu
@@ -16,9 +16,9 @@ void bazz() {}
 void bazzz() {bazz();}
 #pragma omp declare target to(bazzz) device_type(nohost)
 void any() {bazz();} // expected-error {{function with 'device_type(nohost)' is not available on host}}
-void host1() {bazz();} // expected-error {{function with 'device_type(nohost)' is not available on host}}
+void host1() {bazz();}
 #pragma omp declare target to(host1) device_type(host)
-void host2() {bazz();} // expected-error {{function with 'device_type(nohost)' is not available on host}}
+void host2() {bazz();}
 #pragma omp declare target to(host2)
 void device() {host1();}
 #pragma omp declare target to(device) device_type(nohost)
diff --git a/clang/test/SemaCUDA/trace-through-global.cu b/clang/test/SemaCUDA/trace-through-global.cu
index 0555afea0280..f73570fa6645 100644
--- a/clang/test/SemaCUDA/trace-through-global.cu
+++ b/clang/test/SemaCUDA/trace-through-global.cu
@@ -38,7 +38,7 @@ void launch_kernel() {
   // Notice that these two diagnostics are different: Because the call to hd1
   // is not dependent on T, the call to hd1 comes from 'launch_kernel', while
   // the call to hd3, being dependent, comes from 'launch_kernel<int>'.
-  hd1(); // expected-note {{called by 'launch_kernel<int>'}}
+  hd1(); // expected-note {{called by 'launch_kernel'}}
   hd3(T()); // expected-note {{called by 'launch_kernel<int>'}}
 }