forked from OSchip/llvm-project
[OpenMP] Create custom state machines for generic target regions
In the spirit of TRegions [0], this patch creates a custom state machine for a
generic target region based on the potentially called parallel regions.

The code analysis is done interprocedurally via an abstract attribute
(AAKernelInfo). All outermost parallel regions are collected and we check if
there might be unknown outermost parallel regions for which we need an
indirect call. Other AAKernelInfo extensions are expected.

[0] https://link.springer.com/chapter/10.1007/978-3-030-28596-8_11

Differential Revision: https://reviews.llvm.org/D101977
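For orientation, below is a minimal, self-contained C mock of the worker loop
the patch emits. This is a hedged sketch, not the patch itself: the __kmpc_*
stubs, ParFn0/ParFn1, and WorkFnTy are illustrative stand-ins whose signatures
differ from the real device runtime. It mirrors the pseudocode comment
embedded in buildCustomStateMachine below:

    /* sketch.c -- illustrative mock of a specialized GPU state machine */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    typedef void (*WorkFnTy)(short, int);

    /* Stubs standing in for the device runtime. */
    static void *PendingWorkFn; /* work queued by the main thread */
    static bool __kmpc_kernel_parallel(void **WorkFn) {
      *WorkFn = PendingWorkFn; /* NULL once the kernel is done */
      PendingWorkFn = NULL;
      return *WorkFn != NULL;  /* "active": this thread participates */
    }
    static void __kmpc_kernel_end_parallel(void) {}
    static void __kmpc_barrier_simple_spmd(void) {}

    /* Stand-ins for the known outlined parallel regions. */
    static void ParFn0(short Unused, int Tid) { printf("ParFn0(%d)\n", Tid); }
    static void ParFn1(short Unused, int Tid) { printf("ParFn1(%d)\n", Tid); }

    /* The specialized worker loop: direct calls guarded by pointer
     * comparisons for the known parallel regions; the indirect call
     * survives only as a fallback for unknown ones. */
    static void WorkerStateMachine(int Tid) {
      while (true) {
        __kmpc_barrier_simple_spmd();
        void *WorkFn;
        bool Active = __kmpc_kernel_parallel(&WorkFn);
        if (!WorkFn)
          return; /* kernel is done */
        if (Active) {
          if (WorkFn == (void *)&ParFn0)
            ParFn0(0, Tid); /* known region: direct call */
          else if (WorkFn == (void *)&ParFn1)
            ParFn1(0, Tid);
          else
            ((WorkFnTy)WorkFn)(0, Tid); /* unknown region: fallback */
          __kmpc_kernel_end_parallel();
        }
        __kmpc_barrier_simple_spmd();
      }
    }

    int main(void) {
      PendingWorkFn = (void *)&ParFn0; /* pretend work was queued */
      WorkerStateMachine(0);           /* prints "ParFn0(0)", then exits */
      return 0;
    }

If no parallel region is reachable at all, the patch drops the loop entirely
and the worker threads exit right after __kmpc_target_init.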
This commit is contained in:
parent 1d5711c3ee
commit f0628c6ff7
@@ -610,8 +610,20 @@ void IRPosition::verify() {

Optional<Constant *>
Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA,
                               bool &UsedAssumedInformation) {
  const auto &ValueSimplifyAA = getAAFor<AAValueSimplify>(
      AA, IRPosition::value(V, AA.getCallBaseContext()), DepClassTy::NONE);
  // First check all callbacks provided by outside AAs. If any of them returns
  // a non-null value that is different from the associated value, or None, we
  // assume it's simplified.
  IRPosition IRP = IRPosition::value(V, AA.getCallBaseContext());
  for (auto &CB : SimplificationCallbacks[IRP]) {
    Optional<Value *> SimplifiedV = CB(IRP, &AA, UsedAssumedInformation);
    if (!SimplifiedV.hasValue())
      return llvm::None;
    if (*SimplifiedV && *SimplifiedV != &IRP.getAssociatedValue() &&
        isa<Constant>(*SimplifiedV))
      return cast<Constant>(*SimplifiedV);
  }
  const auto &ValueSimplifyAA =
      getAAFor<AAValueSimplify>(AA, IRP, DepClassTy::NONE);
  Optional<Value *> SimplifiedV =
      ValueSimplifyAA.getAssumedSimplifiedValue(*this);
  bool IsKnown = ValueSimplifyAA.isAtFixpoint();

@@ -25,6 +25,9 @@

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"

@@ -70,6 +73,15 @@ STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
          "Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
          "Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode without a state machine");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines with fallback");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines without fallback");
STATISTIC(
    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
    "Number of OpenMP parallel regions replaced with ID in GPU state machines");

@@ -228,6 +240,11 @@ struct OMPInformationCache : public InformationCache {
    /// Map from functions to all uses of this runtime function contained in
    /// them.
    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;

  public:
    /// Iterators for the uses of this runtime function.
    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
    decltype(UsesMap)::iterator end() { return UsesMap.end(); }
  };

  /// An OpenMP-IR-Builder instance

@@ -238,6 +255,9 @@ struct OMPInformationCache : public InformationCache {
                  RuntimeFunction::OMPRTL___last>
      RFIs;

  /// Map from function declarations/definitions to their runtime enum type.
  DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;

  /// Map from ICV kind to the ICV description.
  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
                  InternalControlVar::ICV___last>

@@ -380,6 +400,7 @@ struct OMPInformationCache : public InformationCache {
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
    Function *F = M.getFunction(_Name);                                        \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
      RuntimeFunctionIDMap[F] = _Enum;                                         \
      auto &RFI = RFIs[_Enum];                                                 \
      RFI.Kind = _Enum;                                                        \
      RFI.Name = _Name;                                                        \

@@ -408,6 +429,141 @@ struct OMPInformationCache : public InformationCache {
  SmallPtrSetImpl<Kernel> &Kernels;
};

template <typename Ty, bool InsertInvalidates = true>
struct BooleanStateWithPtrSetVector : public BooleanState {

  bool contains(Ty *Elem) const { return Set.contains(Elem); }
  bool insert(Ty *Elem) {
    if (InsertInvalidates)
      BooleanState::indicatePessimisticFixpoint();
    return Set.insert(Elem);
  }

  Ty *operator[](int Idx) const { return Set[Idx]; }
  bool operator==(const BooleanStateWithPtrSetVector &RHS) const {
    return BooleanState::operator==(RHS) && Set == RHS.Set;
  }
  bool operator!=(const BooleanStateWithPtrSetVector &RHS) const {
    return !(*this == RHS);
  }

  bool empty() const { return Set.empty(); }
  size_t size() const { return Set.size(); }

  /// "Clamp" this state with \p RHS.
  BooleanStateWithPtrSetVector &
  operator^=(const BooleanStateWithPtrSetVector &RHS) {
    BooleanState::operator^=(RHS);
    Set.insert(RHS.Set.begin(), RHS.Set.end());
    return *this;
  }

private:
  /// A set to keep track of elements.
  SetVector<Ty *> Set;

public:
  typename decltype(Set)::iterator begin() { return Set.begin(); }
  typename decltype(Set)::iterator end() { return Set.end(); }
  typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
  typename decltype(Set)::const_iterator end() const { return Set.end(); }
};

struct KernelInfoState : AbstractState {
  /// Flag to track if we reached a fixpoint.
  bool IsAtFixpoint = false;

  /// The parallel regions (identified by the outlined parallel functions) that
  /// can be reached from the associated function.
  BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
      ReachedKnownParallelRegions;

  /// State to track what parallel region we might reach.
  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;

  /// The __kmpc_target_init call in this kernel, if any. If we find more than
  /// one we abort as the kernel is malformed.
  CallBase *KernelInitCB = nullptr;

  /// The __kmpc_target_deinit call in this kernel, if any. If we find more
  /// than one we abort as the kernel is malformed.
  CallBase *KernelDeinitCB = nullptr;

  /// Abstract State interface
  ///{

  KernelInfoState() {}
  KernelInfoState(bool BestState) {
    if (!BestState)
      indicatePessimisticFixpoint();
  }

  /// See AbstractState::isValidState(...)
  bool isValidState() const override { return true; }

  /// See AbstractState::isAtFixpoint(...)
  bool isAtFixpoint() const override { return IsAtFixpoint; }

  /// See AbstractState::indicatePessimisticFixpoint(...)
  ChangeStatus indicatePessimisticFixpoint() override {
    IsAtFixpoint = true;
    ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
    return ChangeStatus::CHANGED;
  }

  /// See AbstractState::indicateOptimisticFixpoint(...)
  ChangeStatus indicateOptimisticFixpoint() override {
    IsAtFixpoint = true;
    return ChangeStatus::UNCHANGED;
  }

  /// Return the assumed state
  KernelInfoState &getAssumed() { return *this; }
  const KernelInfoState &getAssumed() const { return *this; }

  bool operator==(const KernelInfoState &RHS) const {
    if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
      return false;
    if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
      return false;
    return true;
  }

  /// Return empty set as the best state of potential values.
  static KernelInfoState getBestState() { return KernelInfoState(true); }

  static KernelInfoState getBestState(KernelInfoState &KIS) {
    return getBestState();
  }

  /// Return full set as the worst state of potential values.
  static KernelInfoState getWorstState() { return KernelInfoState(false); }

  /// "Clamp" this state with \p KIS.
  KernelInfoState operator^=(const KernelInfoState &KIS) {
    // Do not merge two different _init and _deinit call sites.
    if (KIS.KernelInitCB) {
      if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
        indicatePessimisticFixpoint();
      KernelInitCB = KIS.KernelInitCB;
    }
    if (KIS.KernelDeinitCB) {
      if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
        indicatePessimisticFixpoint();
      KernelDeinitCB = KIS.KernelDeinitCB;
    }
    ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
    ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
    return *this;
  }

  KernelInfoState operator&=(const KernelInfoState &KIS) {
    return (*this ^= KIS);
  }

  ///}
};

/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {

@@ -522,7 +678,7 @@ struct OpenMPOpt {
                      << OMPInfoCache.ModuleSlice.size() << " functions\n");

    if (IsModulePass) {
      Changed |= runAttributor();
      Changed |= runAttributor(IsModulePass);

      // Recollect uses, in case Attributor deleted any.
      OMPInfoCache.recollectUses();

@@ -535,14 +691,14 @@ struct OpenMPOpt {
      if (PrintOpenMPKernels)
        printKernels();

      Changed |= rewriteDeviceCodeStateMachine();

      Changed |= runAttributor();
      Changed |= runAttributor(IsModulePass);

      // Recollect uses, in case Attributor deleted any.
      OMPInfoCache.recollectUses();

      Changed |= deleteParallelRegions();
      Changed |= rewriteDeviceCodeStateMachine();

      if (HideMemoryTransferLatency)
        Changed |= hideMemTransfersLatency();
      Changed |= deduplicateRuntimeCalls();

@@ -1573,11 +1729,11 @@ private:
  Attributor &A;

  /// Helper function to run Attributor on SCC.
  bool runAttributor() {
  bool runAttributor(bool IsModulePass) {
    if (SCC.empty())
      return false;

    registerAAs();
    registerAAs(IsModulePass);

    ChangeStatus Changed = A.run();

@@ -1589,46 +1745,7 @@ private:

  /// Populate the Attributor with abstract attribute opportunities in the
  /// function.
  void registerAAs() {
    if (SCC.empty())
      return;

    // Create CallSite AA for all Getters.
    for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
      auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];

      auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];

      auto CreateAA = [&](Use &U, Function &Caller) {
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
        if (!CI)
          return false;

        auto &CB = cast<CallBase>(*CI);

        IRPosition CBPos = IRPosition::callsite_function(CB);
        A.getOrCreateAAFor<AAICVTracker>(CBPos);
        return false;
      };

      GetterRFI.foreachUse(SCC, CreateAA);
    }
    auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
    auto CreateAA = [&](Use &U, Function &F) {
      A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
      return false;
    };
    GlobalizationRFI.foreachUse(SCC, CreateAA);

    // Create an ExecutionDomain AA for every function and a HeapToStack AA for
    // every function if there is a device kernel.
    for (auto *F : SCC) {
      if (!F->isDeclaration())
        A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
      if (isOpenMPDevice(M))
        A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
    }
  }
  void registerAAs(bool IsModulePass);
};

Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {

@@ -1766,7 +1883,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
    // TODO: Checking the number of uses is not a necessary restriction and
    // should be lifted.
    if (UnknownUse || NumDirectCalls != 1 ||
        ToBeReplacedStateMachineUses.size() != 2) {
        ToBeReplacedStateMachineUses.size() > 2) {
      {
        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
          return ORA << "Parallel region is used in "

@@ -2541,9 +2658,587 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
  SmallPtrSet<CallBase *, 4> MallocCalls;
};

struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
  using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
  AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Statistics are tracked as part of manifest for now.
  void trackStatistics() const override {}

  /// See AbstractAttribute::getAsStr()
  const std::string getAsStr() const override {
    if (!isValidState())
      return "<invalid>";
    return std::string(" #PRs: ") +
           std::to_string(ReachedKnownParallelRegions.size()) +
           ", #Unknown PRs: " +
           std::to_string(ReachedUnknownParallelRegions.size());
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAKernelInfo"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is AAKernelInfo
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  static const char ID;
};

/// The function kernel info abstract attribute, basically, what can we say
/// about a function with regards to the KernelInfoState.
struct AAKernelInfoFunction : AAKernelInfo {
  AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
      : AAKernelInfo(IRP, A) {}

  /// See AbstractAttribute::initialize(...).
  void initialize(Attributor &A) override {
    // This is a high-level transform that might change the constant arguments
    // of the init and deinit calls. We need to tell the Attributor about this
    // to avoid other parts using the current constant value for simplification.
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

    Function *Fn = getAnchorScope();
    if (!OMPInfoCache.Kernels.count(Fn))
      return;

    OMPInformationCache::RuntimeFunctionInfo &InitRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
    OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];

    // For kernels we perform more initialization work, first we find the init
    // and deinit calls.
    auto StoreCallBase = [](Use &U,
                            OMPInformationCache::RuntimeFunctionInfo &RFI,
                            CallBase *&Storage) {
      CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
      assert(CB &&
             "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
      assert(!Storage &&
             "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
      Storage = CB;
      return false;
    };
    InitRFI.foreachUse(
        [&](Use &U, Function &) {
          StoreCallBase(U, InitRFI, KernelInitCB);
          return false;
        },
        Fn);
    DeinitRFI.foreachUse(
        [&](Use &U, Function &) {
          StoreCallBase(U, DeinitRFI, KernelDeinitCB);
          return false;
        },
        Fn);

    assert((KernelInitCB && KernelDeinitCB) &&
           "Kernel without __kmpc_target_init or __kmpc_target_deinit!");

    // For kernels we need to register a simplification callback so that the
    // Attributor knows the constant arguments to __kmpc_target_init and
    // __kmpc_target_deinit might actually change.

    Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
        [&](const IRPosition &IRP, const AbstractAttribute *AA,
            bool &UsedAssumedInformation) -> Optional<Value *> {
      // IRP represents the "use generic state machine" argument of an
      // __kmpc_target_init call. We will answer this one with the internal
      // state. As long as we are not in an invalid state, we will create a
      // custom state machine so the value should be a `i1 false`. If we are
      // in an invalid state, we won't change the value that is in the IR.
      if (!isValidState())
        return nullptr;
      if (AA)
        A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
      UsedAssumedInformation = !isAtFixpoint();
      auto *FalseVal =
          ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0);
      return FalseVal;
    };

    constexpr const int InitUseStateMachineArgNo = 2;
    A.registerSimplificationCallback(
        IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
        StateMachineSimplifyCB);
  }

  /// Modify the IR based on the KernelInfoState as the fixpoint iteration is
  /// finished now.
  ChangeStatus manifest(Attributor &A) override {
    // If we are not looking at a kernel with __kmpc_target_init and
    // __kmpc_target_deinit call we cannot actually manifest the information.
    if (!KernelInitCB || !KernelDeinitCB)
      return ChangeStatus::UNCHANGED;

    buildCustomStateMachine(A);

    return ChangeStatus::CHANGED;
  }

  ChangeStatus buildCustomStateMachine(Attributor &A) {
    assert(ReachedKnownParallelRegions.isValidState() &&
           "Custom state machine with invalid parallel region states?");

    const int InitIsSPMDArgNo = 1;
    const int InitUseStateMachineArgNo = 2;

    // Check if the current configuration is non-SPMD and generic state
    // machine. If we already have SPMD mode or a custom state machine we do
    // not need to go any further. If it is anything but a constant something
    // is weird and we give up.
    ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
        KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
    ConstantInt *IsSPMD =
        dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo));

    // If we are stuck with generic mode, try to create a custom device (=GPU)
    // state machine which is specialized for the parallel regions that are
    // reachable by the kernel.
    if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD ||
        !IsSPMD->isZero())
      return ChangeStatus::UNCHANGED;

    // First, indicate we use a custom state machine now.
    auto &Ctx = getAnchorValue().getContext();
    auto *FalseVal = ConstantInt::getBool(Ctx, 0);
    A.changeUseAfterManifest(
        KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);

    // If we don't actually need a state machine we are done here. This can
    // happen if there simply are no parallel regions. In the resulting kernel
    // all worker threads will simply exit right away, leaving the main thread
    // to do the work alone.
    if (ReachedKnownParallelRegions.empty() &&
        ReachedUnknownParallelRegions.empty()) {
      ++NumOpenMPTargetRegionKernelsWithoutStateMachine;

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Generic-mode kernel is executed without state machine "
                     "(good)";
      };
      A.emitRemark<OptimizationRemark>(
          KernelInitCB, "OpenMPKernelWithoutStateMachine", Remark);

      return ChangeStatus::CHANGED;
    }

    // Keep track in the statistics of our new shiny custom state machine.
    if (ReachedUnknownParallelRegions.empty()) {
      ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Generic-mode kernel is executed with a customized state "
                     "machine ["
                  << ore::NV("ParallelRegions",
                             ReachedKnownParallelRegions.size())
                  << " known parallel regions] (good).";
      };
      A.emitRemark<OptimizationRemark>(
          KernelInitCB, "OpenMPKernelWithCustomizedStateMachine", Remark);
    } else {
      ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Generic-mode kernel is executed with a customized state "
                     "machine that requires a fallback ["
                  << ore::NV("ParallelRegions",
                             ReachedKnownParallelRegions.size())
                  << " known parallel regions, "
                  << ore::NV("UnknownParallelRegions",
                             ReachedUnknownParallelRegions.size())
                  << " unknown parallel regions] (bad).";
      };
      A.emitRemark<OptimizationRemark>(
          KernelInitCB, "OpenMPKernelWithCustomizedStateMachineAndFallback",
          Remark);

      // Tell the user why we ended up with a fallback.
      for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
        if (!UnknownParallelRegionCB)
          continue;
        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
          return ORA
                 << "State machine fallback caused by this call. If it is a "
                    "false positive, use "
                    "`__attribute__((assume(\"omp_no_openmp\"))` "
                    "(or \"omp_no_parallelism\").";
        };
        A.emitRemark<OptimizationRemarkAnalysis>(
            UnknownParallelRegionCB,
            "OpenMPKernelWithCustomizedStateMachineAndFallback", Remark);
      }
    }

    // Create all the blocks:
    //
    //                       InitCB = __kmpc_target_init(...)
    //                       bool IsWorker = InitCB >= 0;
    //                       if (IsWorker) {
    // SMBeginBB:               __kmpc_barrier_simple_spmd(...);
    //                          void *WorkFn;
    //                          bool Active = __kmpc_kernel_parallel(&WorkFn);
    //                          if (!WorkFn) return;
    // SMIsActiveCheckBB:       if (Active) {
    // SMIfCascadeCurrentBB:      if      (WorkFn == <ParFn0>)
    //                              ParFn0(...);
    // SMIfCascadeCurrentBB:      else if (WorkFn == <ParFn1>)
    //                              ParFn1(...);
    //                            ...
    // SMIfCascadeCurrentBB:      else
    //                              ((WorkFnTy*)WorkFn)(...);
    // SMEndParallelBB:           __kmpc_kernel_end_parallel(...);
    //                          }
    // SMDoneBB:                __kmpc_barrier_simple_spmd(...);
    //                          goto SMBeginBB;
    //                       }
    // UserCodeEntryBB:      // user code
    //                       __kmpc_target_deinit(...)
    //
    Function *Kernel = getAssociatedFunction();
    assert(Kernel && "Expected an associated function!");

    BasicBlock *InitBB = KernelInitCB->getParent();
    BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
        KernelInitCB->getNextNode(), "thread.user_code.check");
    BasicBlock *StateMachineBeginBB = BasicBlock::Create(
        Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
        Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(
        Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineIfCascadeCurrentBB =
        BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
                           Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineEndParallelBB =
        BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end",
                           Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(
        Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);

    const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
    ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);

    InitBB->getTerminator()->eraseFromParent();
    Instruction *IsWorker =
        ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
                         ConstantInt::get(KernelInitCB->getType(), -1),
                         "thread.is_worker", InitBB);
    IsWorker->setDebugLoc(DLoc);
    BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB);

    // Create local storage for the work function pointer.
    Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
    AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "worker.work_fn.addr",
                                          &Kernel->getEntryBlock().front());
    WorkFnAI->setDebugLoc(DLoc);

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    OMPInfoCache.OMPBuilder.updateToLocation(
        OpenMPIRBuilder::LocationDescription(
            IRBuilder<>::InsertPoint(StateMachineBeginBB,
                                     StateMachineBeginBB->end()),
            DLoc));

    Value *Ident = KernelInitCB->getArgOperand(0);
    Value *GTid = KernelInitCB;

    Module &M = *Kernel->getParent();
    FunctionCallee BarrierFn =
        OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
            M, OMPRTL___kmpc_barrier_simple_spmd);
    CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB)
        ->setDebugLoc(DLoc);

    FunctionCallee KernelParallelFn =
        OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
            M, OMPRTL___kmpc_kernel_parallel);
    Instruction *IsActiveWorker = CallInst::Create(
        KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
    IsActiveWorker->setDebugLoc(DLoc);
    Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
                                       StateMachineBeginBB);
    WorkFn->setDebugLoc(DLoc);

    FunctionType *ParallelRegionFnTy = FunctionType::get(
        Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
        false);
    Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
        WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast",
        StateMachineBeginBB);

    Instruction *IsDone =
        ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn,
                         Constant::getNullValue(VoidPtrTy), "worker.is_done",
                         StateMachineBeginBB);
    IsDone->setDebugLoc(DLoc);
    BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
                       IsDone, StateMachineBeginBB)
        ->setDebugLoc(DLoc);

    BranchInst::Create(StateMachineIfCascadeCurrentBB,
                       StateMachineDoneBarrierBB, IsActiveWorker,
                       StateMachineIsActiveCheckBB)
        ->setDebugLoc(DLoc);

    Value *ZeroArg =
        Constant::getNullValue(ParallelRegionFnTy->getParamType(0));

    // Now that we have most of the CFG skeleton it is time for the if-cascade
    // that checks the function pointer we got from the runtime against the
    // parallel regions we expect, if there are any.
    for (int i = 0, e = ReachedKnownParallelRegions.size(); i < e; ++i) {
      auto *ParallelRegion = ReachedKnownParallelRegions[i];
      BasicBlock *PRExecuteBB = BasicBlock::Create(
          Ctx, "worker_state_machine.parallel_region.execute", Kernel,
          StateMachineEndParallelBB);
      CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)
          ->setDebugLoc(DLoc);
      BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB)
          ->setDebugLoc(DLoc);

      BasicBlock *PRNextBB =
          BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
                             Kernel, StateMachineEndParallelBB);

      // Check if we need to compare the pointer at all or if we can just
      // call the parallel region function.
      Value *IsPR;
      if (i + 1 < e || !ReachedUnknownParallelRegions.empty()) {
        Instruction *CmpI = ICmpInst::Create(
            ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion,
            "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
        CmpI->setDebugLoc(DLoc);
        IsPR = CmpI;
      } else {
        IsPR = ConstantInt::getTrue(Ctx);
      }

      BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,
                         StateMachineIfCascadeCurrentBB)
          ->setDebugLoc(DLoc);
      StateMachineIfCascadeCurrentBB = PRNextBB;
    }

    // At the end of the if-cascade we place the indirect function pointer call
    // in case we might need it, that is if there can be parallel regions we
    // have not handled in the if-cascade above.
    if (!ReachedUnknownParallelRegions.empty()) {
      StateMachineIfCascadeCurrentBB->setName(
          "worker_state_machine.parallel_region.fallback.execute");
      CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "",
                       StateMachineIfCascadeCurrentBB)
          ->setDebugLoc(DLoc);
    }
    BranchInst::Create(StateMachineEndParallelBB,
                       StateMachineIfCascadeCurrentBB)
        ->setDebugLoc(DLoc);

    CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
                         M, OMPRTL___kmpc_kernel_end_parallel),
                     {}, "", StateMachineEndParallelBB)
        ->setDebugLoc(DLoc);
    BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
        ->setDebugLoc(DLoc);

    CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)
        ->setDebugLoc(DLoc);
    BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)
        ->setDebugLoc(DLoc);

    return ChangeStatus::CHANGED;
  }

  /// Fixpoint iteration update function. Will be called every time a
  /// dependence changed its state (and in the beginning).
  ChangeStatus updateImpl(Attributor &A) override {
    KernelInfoState StateBefore = getState();

    // Callback to check a call instruction.
    auto CheckCallInst = [&](Instruction &I) {
      auto &CB = cast<CallBase>(I);
      auto &CBAA = A.getAAFor<AAKernelInfo>(
          *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
      if (CBAA.getState().isValidState())
        getState() ^= CBAA.getState();
      return true;
    };

    if (!A.checkForAllCallLikeInstructions(CheckCallInst, *this))
      return indicatePessimisticFixpoint();

    return StateBefore == getState() ? ChangeStatus::UNCHANGED
                                     : ChangeStatus::CHANGED;
  }
};

/// The call site kernel info abstract attribute, basically, what can we say
/// about a call site with regards to the KernelInfoState. For now this simply
/// forwards the information from the callee.
struct AAKernelInfoCallSite : AAKernelInfo {
  AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
      : AAKernelInfo(IRP, A) {}

  /// See AbstractAttribute::initialize(...).
  void initialize(Attributor &A) override {
    AAKernelInfo::initialize(A);

    CallBase &CB = cast<CallBase>(getAssociatedValue());
    Function *Callee = getAssociatedFunction();

    // Helper to lookup an assumption string.
    auto HasAssumption = [](Function *Fn, StringRef AssumptionStr) {
      return Fn && hasAssumption(*Fn, AssumptionStr);
    };

    // First weed out calls we do not care about, that is readonly/readnone
    // calls, intrinsics, and "no_openmp" calls. Neither of these can reach a
    // parallel region or anything else we are looking for.
    if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) {
      indicateOptimisticFixpoint();
      return;
    }

    // Next we check if we know the callee. If it is a known OpenMP function
    // we will handle them explicitly in the switch below. If it is not, we
    // will use an AAKernelInfo object on the callee to gather information and
    // merge that into the current state. The latter happens in the updateImpl.
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
    if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
      // Unknown caller or declarations are not analyzable, we give up.
      if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {

        // Unknown callees might contain parallel regions, except if they have
        // an appropriate assumption attached.
        if (!(HasAssumption(Callee, "omp_no_openmp") ||
              HasAssumption(Callee, "omp_no_parallelism")))
          ReachedUnknownParallelRegions.insert(&CB);

        // We have updated the state for this unknown call properly, there
        // won't be any change so we indicate a fixpoint.
        indicateOptimisticFixpoint();
      }
      // If the callee is known and can be used in IPO, we will update the
      // state based on the callee state in updateImpl.
      return;
    }

    const unsigned int WrapperFunctionArgNo = 6;
    RuntimeFunction RF = It->getSecond();
    switch (RF) {
    case OMPRTL___kmpc_target_init:
      KernelInitCB = &CB;
      break;
    case OMPRTL___kmpc_target_deinit:
      KernelDeinitCB = &CB;
      break;
    case OMPRTL___kmpc_parallel_51:
      if (auto *ParallelRegion = dyn_cast<Function>(
              CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
        ReachedKnownParallelRegions.insert(ParallelRegion);
        break;
      }
      // The condition above should usually get the parallel region function
      // pointer and record it. In the off chance it doesn't we assume the
      // worst.
      ReachedUnknownParallelRegions.insert(&CB);
      break;
    case OMPRTL___kmpc_omp_task:
      // We do not look into tasks right now, just give up.
      ReachedUnknownParallelRegions.insert(&CB);
      break;
    default:
      break;
    }
    // All other OpenMP runtime calls will not reach parallel regions so they
    // can be safely ignored for now. Since it is a known OpenMP runtime call
    // we have now modeled all effects and there is no need for any update.
    indicateOptimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Once we have call site specific value information we can provide
    //       call site specific liveness information and then it makes
    //       sense to specialize attributes for call sites arguments instead of
    //       redirecting requests to the callee argument.
    Function *F = getAssociatedFunction();
    const IRPosition &FnPos = IRPosition::function(*F);
    auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
    if (getState() == FnAA.getState())
      return ChangeStatus::UNCHANGED;
    getState() = FnAA.getState();
    return ChangeStatus::CHANGED;
  }
};

} // namespace

void OpenMPOpt::registerAAs(bool IsModulePass) {
  if (SCC.empty())
    return;
  if (IsModulePass) {
    // Ensure we create the AAKernelInfo AAs first and without triggering an
    // update. This will make sure we register all value simplification
    // callbacks before any other AA has the chance to create an
    // AAValueSimplify or similar.
    for (Function *Kernel : OMPInfoCache.Kernels)
      A.getOrCreateAAFor<AAKernelInfo>(
          IRPosition::function(*Kernel), /* QueryingAA */ nullptr,
          DepClassTy::NONE, /* ForceUpdate */ false,
          /* UpdateAfterInit */ false);
  }

  // Create CallSite AA for all Getters.
  for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
    auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];

    auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];

    auto CreateAA = [&](Use &U, Function &Caller) {
      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
      if (!CI)
        return false;

      auto &CB = cast<CallBase>(*CI);

      IRPosition CBPos = IRPosition::callsite_function(CB);
      A.getOrCreateAAFor<AAICVTracker>(CBPos);
      return false;
    };

    GetterRFI.foreachUse(SCC, CreateAA);
  }
  auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
  auto CreateAA = [&](Use &U, Function &F) {
    A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
    return false;
  };
  GlobalizationRFI.foreachUse(SCC, CreateAA);

  // Create an ExecutionDomain AA for every function and a HeapToStack AA for
  // every function if there is a device kernel.
  for (auto *F : SCC) {
    if (!F->isDeclaration())
      A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
    if (isOpenMPDevice(M))
      A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
  }
}

const char AAICVTracker::ID = 0;
const char AAKernelInfo::ID = 0;
const char AAExecutionDomain::ID = 0;
const char AAHeapToShared::ID = 0;

@@ -2615,6 +3310,28 @@ AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
  return *AA;
}

AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {
  AAKernelInfo *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_RETURNED:
  case IRPosition::IRP_CALL_SITE_RETURNED:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
    llvm_unreachable("KernelInfo can only be created for function position!");
  case IRPosition::IRP_CALL_SITE:
    AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
    break;
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
    break;
  }

  return *AA;
}

PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
  if (!containsOpenMP(M))
    return PreservedAnalyses::all();

File diff suppressed because it is too large
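The new test below drives the remarks through opt's openmp-opt pass. As a
hedged C illustration (not part of the test): the two unknown() calls are what
force the fallback in test_fallback, and annotating the declaration with an
assumption, e.g.

    __attribute__((assume("omp_no_openmp")))
    void unknown(void); /* promises the callee contains no OpenMP */

would let the analysis drop the indirect-call fallback, as the remark text
suggests.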
@ -0,0 +1,224 @@
|
|||
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
|
||||
target triple = "nvptx64"
|
||||
|
||||
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback [1 known parallel regions, 2 unkown parallel regions] (bad)
|
||||
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:13:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp"))` (or "omp_no_parallelism")
|
||||
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:15:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp"))` (or "omp_no_parallelism")
|
||||
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:20:1: Generic-mode kernel is executed with a customized state machine [1 known parallel regions] (good)
|
||||
|
||||
;; void unknown(void);
|
||||
;; void known(void) {
|
||||
;; #pragma omp parallel
|
||||
;; {
|
||||
;; unknown();
|
||||
;; }
|
||||
;; }
|
||||
;;
|
||||
;; void test_fallback(void) {
|
||||
;; #pragma omp target teams
|
||||
;; {
|
||||
;; unknown();
|
||||
;; known();
|
||||
;; unknown();
|
||||
;; }
|
||||
;; }
|
||||
;;
|
||||
;; void test_no_fallback(void) {
|
||||
;; #pragma omp target teams
|
||||
;; {
|
||||
;; known();
|
||||
;; known();
|
||||
;; known();
|
||||
;; }
|
||||
;; }
|
||||
|
||||
%struct.ident_t = type { i32, i32, i32, i32, i8* }
|
||||
|
||||
@0 = private unnamed_addr constant [113 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1
|
||||
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @0, i32 0, i32 0) }, align 8
|
||||
@2 = private unnamed_addr constant [82 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;test_fallback;11;1;;\00", align 1
|
||||
@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([82 x i8], [82 x i8]* @2, i32 0, i32 0) }, align 8
|
||||
@4 = private unnamed_addr constant [114 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;25;;\00", align 1
|
||||
@5 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([114 x i8], [114 x i8]* @4, i32 0, i32 0) }, align 8
|
||||
@__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode = weak constant i8 1
|
||||
@6 = private unnamed_addr constant [116 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;1;;\00", align 1
|
||||
@7 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([116 x i8], [116 x i8]* @6, i32 0, i32 0) }, align 8
|
||||
@8 = private unnamed_addr constant [85 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;test_no_fallback;20;1;;\00", align 1
|
||||
@9 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([85 x i8], [85 x i8]* @8, i32 0, i32 0) }, align 8
|
||||
@10 = private unnamed_addr constant [117 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;25;;\00", align 1
|
||||
@11 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([117 x i8], [117 x i8]* @10, i32 0, i32 0) }, align 8
|
||||
@__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1
|
||||
@12 = private unnamed_addr constant [73 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;known;4;1;;\00", align 1
|
||||
@13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([73 x i8], [73 x i8]* @12, i32 0, i32 0) }, align 8
|
||||
@llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode, i8* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode], section "llvm.metadata"
|
||||
|
||||
; Function Attrs: convergent norecurse nounwind
|
||||
define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11() local_unnamed_addr #0 !dbg !15 {
|
||||
entry:
|
||||
%captured_vars_addrs.i.i = alloca [0 x i8*], align 8
|
||||
%0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) #3, !dbg !18
|
||||
%exec_user_code = icmp eq i32 %0, -1, !dbg !18
|
||||
br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !18
|
||||
|
||||
common.ret: ; preds = %entry, %user_code.entry
|
||||
ret void, !dbg !19
|
||||
|
||||
user_code.entry: ; preds = %entry
|
||||
%1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @3) #3
|
||||
call void @unknown() #6, !dbg !20
|
||||
%2 = bitcast [0 x i8*]* %captured_vars_addrs.i.i to i8*
|
||||
call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
|
||||
%3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
|
||||
%4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i.i, i64 0, i64 0, !dbg !23
|
||||
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !23
|
||||
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !26
|
||||
call void @unknown() #6, !dbg !27
|
||||
call void @__kmpc_target_deinit(%struct.ident_t* nonnull @5, i1 false, i1 true) #3, !dbg !28
|
||||
br label %common.ret
|
||||
}
|
||||
|
||||
declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) local_unnamed_addr
|
||||
|
||||
; Function Attrs: convergent
|
||||
declare void @unknown() local_unnamed_addr #1
|
||||
|
||||
; Function Attrs: nounwind
|
||||
define hidden void @known() local_unnamed_addr #2 !dbg !29 {
|
||||
entry:
|
||||
%captured_vars_addrs = alloca [0 x i8*], align 8
|
||||
%0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @13)
|
||||
%1 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs, i64 0, i64 0, !dbg !30
|
||||
call void @__kmpc_parallel_51(%struct.ident_t* nonnull @13, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** nonnull %1, i64 0) #3, !dbg !30
|
||||
ret void, !dbg !31
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #3
|
||||
|
||||
declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) local_unnamed_addr
|
||||
|
||||
; Function Attrs: norecurse nounwind
|
||||
define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20() local_unnamed_addr #4 !dbg !32 {
|
||||
entry:
|
||||
%captured_vars_addrs.i2.i = alloca [0 x i8*], align 8
|
||||
%0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @7, i1 false, i1 true, i1 true) #3, !dbg !33
|
||||
%exec_user_code = icmp eq i32 %0, -1, !dbg !33
|
||||
br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !33
|
||||
|
||||
common.ret: ; preds = %entry, %user_code.entry
|
||||
ret void, !dbg !34
|
||||
|
||||
user_code.entry: ; preds = %entry
|
||||
%1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @9) #3
|
||||
%2 = bitcast [0 x i8*]* %captured_vars_addrs.i2.i to i8*
|
||||
call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
|
||||
%3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
|
||||
%4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i2.i, i64 0, i64 0, !dbg !35
|
||||
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !35
|
||||
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !39
|
||||
call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
|
||||
%5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
|
||||
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %5, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !40
|
||||
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !42
|
||||
call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
|
||||
%6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
|
||||
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %6, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !43
|
||||
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45
|
||||
call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i1 false, i1 true) #3, !dbg !46
|
||||
br label %common.ret
|
||||
}
|
||||
|
||||
; Function Attrs: convergent norecurse nounwind
|
||||
define internal void @__omp_outlined__2(i32* noalias nocapture nofree readnone %.global_tid., i32* noalias nocapture nofree readnone %.bound_tid.) #0 !dbg !47 {
|
||||
entry:
|
||||
call void @unknown() #6, !dbg !48
|
||||
ret void, !dbg !49
|
||||
}
|
||||
|
||||
; Function Attrs: convergent norecurse nounwind
|
||||
define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #0 !dbg !50 {
|
||||
entry:
|
||||
%global_args = alloca i8**, align 8
|
||||
call void @__kmpc_get_shared_variables(i8*** nonnull %global_args) #3, !dbg !51
|
||||
call void @unknown() #6, !dbg !52
|
||||
ret void, !dbg !51
|
||||
}
|
||||
|
||||
declare void @__kmpc_get_shared_variables(i8***) local_unnamed_addr
|
||||
|
||||
declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) local_unnamed_addr
|
||||
|
||||
; Function Attrs: argmemonly nofree nosync nounwind willreturn
|
||||
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5
|
||||
|
||||
; Function Attrs: argmemonly nofree nosync nounwind willreturn
|
||||
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5
|
||||
|
||||
attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
|
||||
attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
|
||||
attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
|
||||
attributes #3 = { nounwind }
|
||||
attributes #4 = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
|
||||
attributes #5 = { argmemonly nofree nosync nounwind willreturn }
|
||||
attributes #6 = { convergent nounwind }
|
||||
|
||||
!llvm.dbg.cu = !{!0}
|
||||
!omp_offload.info = !{!3, !4}
|
||||
!nvvm.annotations = !{!5, !6}
|
||||
!llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13}
|
||||
!llvm.ident = !{!14}
|
||||
|
||||
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
|
||||
!1 = !DIFile(filename: "custom_state_machines_remarks.c", directory: "/data/src/llvm-project")
|
||||
!2 = !{}
|
||||
!3 = !{i32 0, i32 42, i32 14159165, !"test_no_fallback", i32 20, i32 1}
|
||||
!4 = !{i32 0, i32 42, i32 14159165, !"test_fallback", i32 11, i32 0}
|
||||
!5 = !{void ()* @__omp_offloading_2a_d80d3d_test_fallback_l11, !"kernel", i32 1}
|
||||
!6 = !{void ()* @__omp_offloading_2a_d80d3d_test_no_fallback_l20, !"kernel", i32 1}
|
||||
!7 = !{i32 7, !"Dwarf Version", i32 2}
|
||||
!8 = !{i32 2, !"Debug Info Version", i32 3}
|
||||
!9 = !{i32 1, !"wchar_size", i32 4}
|
||||
!10 = !{i32 7, !"openmp", i32 50}
|
||||
!11 = !{i32 7, !"openmp-device", i32 50}
|
||||
!12 = !{i32 7, !"PIC Level", i32 2}
|
||||
!13 = !{i32 7, !"frame-pointer", i32 2}
|
||||
!14 = !{!"clang version 13.0.0"}
|
||||
!15 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_fallback_l11", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
|
||||
!16 = !DIFile(filename: "llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c", directory: "/data/src/llvm-project")
|
||||
!17 = !DISubroutineType(types: !2)
|
||||
!18 = !DILocation(line: 11, column: 1, scope: !15)
|
||||
!19 = !DILocation(line: 0, scope: !15)
!20 = !DILocation(line: 13, column: 5, scope: !21, inlinedAt: !22)
!21 = distinct !DISubprogram(name: "__omp_outlined__", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!22 = distinct !DILocation(line: 11, column: 1, scope: !15)
!23 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !25)
!24 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!25 = distinct !DILocation(line: 14, column: 5, scope: !21, inlinedAt: !22)
!26 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !25)
!27 = !DILocation(line: 15, column: 5, scope: !21, inlinedAt: !22)
!28 = !DILocation(line: 11, column: 25, scope: !15)
!29 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!30 = !DILocation(line: 4, column: 1, scope: !29)
!31 = !DILocation(line: 8, column: 1, scope: !29)
!32 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_no_fallback_l20", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!33 = !DILocation(line: 20, column: 1, scope: !32)
!34 = !DILocation(line: 0, scope: !32)
!35 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !36)
!36 = distinct !DILocation(line: 22, column: 5, scope: !37, inlinedAt: !38)
!37 = distinct !DISubprogram(name: "__omp_outlined__1", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!38 = distinct !DILocation(line: 20, column: 1, scope: !32)
!39 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !36)
!40 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !41)
!41 = distinct !DILocation(line: 23, column: 5, scope: !37, inlinedAt: !38)
!42 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !41)
!43 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !44)
!44 = distinct !DILocation(line: 24, column: 5, scope: !37, inlinedAt: !38)
!45 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !44)
!46 = !DILocation(line: 20, column: 25, scope: !32)
!47 = distinct !DISubprogram(name: "__omp_outlined__2", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!48 = !DILocation(line: 6, column: 5, scope: !47)
!49 = !DILocation(line: 7, column: 3, scope: !47)
!50 = distinct !DISubprogram(linkageName: "__omp_outlined__2_wrapper", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagArtificial, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!51 = !DILocation(line: 4, column: 1, scope: !50)
!52 = !DILocation(line: 6, column: 5, scope: !47, inlinedAt: !53)
!53 = distinct !DILocation(line: 4, column: 1, scope: !50)
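The inlinedAt chains above encode calls to known() from the outlined bodies of two kernels, one of them named __omp_offloading_2a_d80d3d_test_no_fallback_l20. As a hedged reconstruction (the C test source is not part of this diff; names and line numbers are inferred from the DISubprogram and DILocation entries), the shape is roughly:

/* Hypothetical reconstruction from the debug info above, not the actual test file. */
void known(void) {            /* DISubprogram "known", line 3 */
#pragma omp parallel          /* a parallel region the pass can enumerate */
  {
  }
}                             /* !26/!31: line 8 */

void test_no_fallback(void) { /* kernel outlined at line 20 */
#pragma omp target teams
  {                           /* "__omp_outlined__1" */
    known();                  /* !36: line 22, column 5 */
    known();                  /* !41: line 23, column 5 */
    known();                  /* !44: line 24, column 5 */
  }
}

Since every parallel region reachable from this kernel is visible to the analysis, the kernel's name suggests its rewritten state machine needs no indirect-call fallback.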
@@ -7,15 +7,19 @@ target triple = "nvptx64"
; CHECK: remark: globalization_remarks.c:5:7: Could not move globalized variable to the stack. Variable is potentially captured.
; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.

%struct.ident_t = type { i32, i32, i32, i32, i8* }

@S = external local_unnamed_addr global i8*

define void @foo() {
entry:
  %c = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true)
  %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !10
  %x_on_stack = bitcast i8* %0 to i32*
  %1 = bitcast i32* %x_on_stack to i8*
  call void @share(i8* %1)
  call void @__kmpc_free_shared(i8* %0)
  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true)
  ret void
}
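Both remarks fire on the same allocation: the 4-byte buffer from @__kmpc_alloc_shared escapes into @share, so it cannot be demoted to a stack alloca. A minimal C sketch of the pattern, assuming a source along these lines (the actual globalization_remarks.c is not shown in this diff):

extern void share(void *p); /* publishes its argument, so the pointee escapes */

void foo(void) {
  int x;     /* the variable at globalization_remarks.c:5:7 in the remarks */
  share(&x); /* escaping use: the runtime keeps x in globalized shared memory */
}

Inside a target region this materializes as the paired __kmpc_alloc_shared/__kmpc_free_shared calls seen in the IR above.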
@@ -29,6 +33,8 @@ declare i8* @__kmpc_alloc_shared(i64)

declare void @__kmpc_free_shared(i8*)

declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
@@ -7,25 +7,34 @@ target triple = "nvptx64"
; CHECK-REMARKS: remark: remove_globalization.c:4:2: Could not move globalized variable to the stack. Variable is potentially captured. Mark as noescape to override.
; CHECK-REMARKS: remark: remove_globalization.c:2:2: Moving globalized variable to the stack.
; CHECK-REMARKS: remark: remove_globalization.c:6:2: Moving globalized variable to the stack.
; CHECK-REMARKS: remark: remove_globalization.c:4:2: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.

@S = external local_unnamed_addr global i8*

%struct.ident_t = type { i32, i32, i32, i32, i8* }

declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)

define void @kernel() {
; CHECK-LABEL: define {{[^@]+}}@kernel() {
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: call void @bar()
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i1 false, i1 false, i1 true)
; CHECK-NEXT: call void @foo() #[[ATTR0:[0-9]+]]
; CHECK-NEXT: call void @bar() #[[ATTR0]]
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i1 false, i1 true)
; CHECK-NEXT: ret void
;
entry:
  %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i1 false, i1 true, i1 true)
  call void @foo()
  call void @bar()
  call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i1 false, i1 true)
  ret void
}
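The notable CHECK change in @kernel is the third i1 argument of @__kmpc_target_init flipping from true to false: once the pass proves no parallel region is reachable from this kernel, the generic state machine can be disabled at the call site. A C sketch of that entry point follows; the parameter names are assumptions inferred from the flag that changes above, and only the argument order comes from the IR:

#include <stdbool.h>

/* Assumed parameter names for the runtime entry used by these kernels. */
int __kmpc_target_init(void *Ident, bool IsSPMD,
                       bool UseGenericStateMachine, bool RequiresFullRuntime);

// Before the pass (plain IR below the CHECK lines):
//   %0 = call i32 @__kmpc_target_init(..., i1 false, i1 true,  i1 true)
// After it, as the CHECK-NEXT line expects:
//   %0 = call i32 @__kmpc_target_init(..., i1 false, i1 false, i1 true)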
define internal void @foo() {
; CHECK-LABEL: define {{[^@]+}}@foo
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1
; CHECK-NEXT: ret void

@@ -41,8 +50,8 @@ define internal void @bar() {
; CHECK-LABEL: define {{[^@]+}}@bar
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR0]], !dbg [[DBG6:![0-9]+]]
; CHECK-NEXT: call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR0]], !dbg [[DBG8:![0-9]+]]
; CHECK-NEXT: call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[TMP0]]) #[[ATTR0]]
; CHECK-NEXT: ret void
;

@@ -54,13 +63,18 @@ entry:
}

define internal void @use(i8* %x) {
; CHECK-LABEL: define {{[^@]+}}@use
; CHECK-SAME: (i8* noalias nocapture nofree readnone [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
  ret void
}

define internal void @share(i8* %x) {
; CHECK-LABEL: define {{[^@]+}}@share
; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: store i8* [[X]], i8** @S, align 8
; CHECK-NEXT: ret void

@@ -71,6 +85,12 @@ entry:
}

define void @unused() {
; CHECK-LABEL: define {{[^@]+}}@unused() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1
; CHECK-NEXT: call void @use(i8* noalias readnone undef)
; CHECK-NEXT: ret void
;
entry:
  %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !14
  call void @use(i8* %0)

@@ -20,18 +20,22 @@ target triple = "nvptx64"
; CHECK: call void @__kmpc_free_shared({{.*}})
define dso_local void @foo() {
entry:
  %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
  %x = call i8* @__kmpc_alloc_shared(i64 4)
  %x_on_stack = bitcast i8* %x to i32*
  %0 = bitcast i32* %x_on_stack to i8*
  call void @use(i8* %0)
  call void @__kmpc_free_shared(i8* %x)
  call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
  ret void
}

define void @bar() {
  %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
  call void @baz()
  call void @qux()
  call void @negative_qux_spmd()
  call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
  ret void
}
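In @foo above, the 4-byte __kmpc_alloc_shared allocation escapes into @use and is released by the matching __kmpc_free_shared, and the CHECK at the top of this hunk only requires that the free survive the pass. A small C model of that pairing contract; foo_sketch is hypothetical, and the prototypes mirror the declare lines in these tests (i64 rendered as unsigned long on a 64-bit target):

extern void *__kmpc_alloc_shared(unsigned long Size);
extern void __kmpc_free_shared(void *Ptr);
extern void use(void *p);

void foo_sketch(void) {
  void *x = __kmpc_alloc_shared(4); /* globalized backing store for a 4-byte local */
  use(x);                           /* potential escape: blocks the alloca rewrite */
  __kmpc_free_shared(x);            /* must pair with the allocation at region end */
}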
@@ -104,6 +108,8 @@ declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()

declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)

declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
!nvvm.annotations = !{!7, !8}
@@ -8,25 +8,36 @@
@0 = private unnamed_addr constant [1 x i8] c"\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, align 8

; CHECK-NOT: [openmp-opt] Basic block @kernel entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @kernel if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @kernel if.else is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @kernel if.end is executed by a single thread.
define void @kernel() {
  call void @__kmpc_kernel_prepare_parallel(i8* null)
  %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 false)
  %cmp = icmp eq i32 %call, -1
  br i1 %cmp, label %if.then, label %if.else
if.then:
  call void @nvptx()
  call void @amdgcn()
  br label %if.end
if.else:
  br label %if.end
if.end:
  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true)
  ret void
}
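@kernel guards its body with the result of @__kmpc_target_init, and the CHECK lines accept the single-thread remark only for if.then. That suggests the -1 return value marks the one thread that proceeds to user code in a generic-mode kernel; a hedged C sketch of that guard (kernel_sketch is hypothetical, and the prototype repeats the parameter-name assumption made earlier):

#include <stdbool.h>

extern int __kmpc_target_init(void *Ident, bool IsSPMD,
                              bool UseGenericStateMachine, bool RequiresFullRuntime);

void kernel_sketch(void *ident) {
  /* Only the thread that gets -1 reaches the guarded block, which is why the
     openmp-opt analysis reports if.then as executed by a single thread. */
  if (__kmpc_target_init(ident, false, false, false) == -1) {
    /* single-threaded user code */
  }
}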
; REMARKS: remark: single_threaded_execution.c:1:0: Could not internalize function. Some optimizations may not be possible.
; REMARKS-NOT: remark: single_threaded_execution.c:1:0: Could not internalize function. Some optimizations may not be possible.

; CHECK-NOT: [openmp-opt] Basic block @nvptx entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @nvptx if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @nvptx if.end is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @nvptx entry is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @nvptx if.then is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @nvptx if.end is executed by a single thread.
; Function Attrs: noinline
define internal void @nvptx() {
entry:
  %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 false)
  %cmp = icmp eq i32 %call, -1
  br i1 %cmp, label %if.then, label %if.end
  br i1 true, label %if.then, label %if.end

if.then:
  call void @foo()

@@ -39,15 +50,13 @@ if.end:
  ret void
}

; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @amdgcn entry is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread.
; Function Attrs: noinline
define internal void @amdgcn() {
entry:
  %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true)
  %cmp = icmp eq i32 %call, -1
  br i1 %cmp, label %if.then, label %if.end
  br i1 false, label %if.then, label %if.end

if.then:
  call void @foo()

@@ -95,6 +104,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
declare void @__kmpc_kernel_prepare_parallel(i8*)

declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)

attributes #0 = { cold noinline }