[OpenMP] Create custom state machines for generic target regions

In the spirit of TRegions [0], this patch creates a custom state
machine for a generic target region based on the potentially called
parallel regions.

The code analysis is done interprocedurally via an abstract attribute
(AAKernelInfo). All outermost parallel regions are collected, and we check
whether there might be unknown outermost parallel regions that would require
an indirect-call fallback. Further AAKernelInfo extensions are expected.

[0] https://link.springer.com/chapter/10.1007/978-3-030-28596-8_11

Differential Revision: https://reviews.llvm.org/D101977
Johannes Doerfert 2021-05-20 00:37:29 -05:00
parent 1d5711c3ee
commit f0628c6ff7
8 changed files with 2988 additions and 69 deletions
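For orientation, the worker loop such a custom state machine implements can be sketched in C-like pseudocode. This is a paraphrase of the CFG comment inside the patch, not literal runtime code; ParFn0/ParFn1 stand in for the outlined parallel regions the analysis collects:

  int TId = __kmpc_target_init(...);    // returns -1 for the main thread
  if (TId != -1) {                      // worker threads enter the loop
    while (true) {
      __kmpc_barrier_simple_spmd(...);  // wait until work is published
      void *WorkFn;
      bool Active = __kmpc_kernel_parallel(&WorkFn);
      if (!WorkFn)                      // kernel is shutting down
        return;
      if (Active) {
        if (WorkFn == ParFn0)           // known regions become direct,
          ParFn0(...);                  // inlinable calls
        else if (WorkFn == ParFn1)
          ParFn1(...);
        else                            // unknown regions: indirect fallback
          ((WorkFnTy *)WorkFn)(...);
        __kmpc_kernel_end_parallel(...);
      }
      __kmpc_barrier_simple_spmd(...);
    }
  }
  // Main thread: run the user code, then call __kmpc_target_deinit(...).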


@@ -610,8 +610,20 @@ void IRPosition::verify() {
Optional<Constant *>
Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA,
bool &UsedAssumedInformation) {
// First check all callbacks provided by outside AAs. If any of them returns
// a non-null value that is different from the associated value, or None, we
// assume it's simplified.
IRPosition IRP = IRPosition::value(V, AA.getCallBaseContext());
for (auto &CB : SimplificationCallbacks[IRP]) {
Optional<Value *> SimplifiedV = CB(IRP, &AA, UsedAssumedInformation);
if (!SimplifiedV.hasValue())
return llvm::None;
if (*SimplifiedV && *SimplifiedV != &IRP.getAssociatedValue() &&
isa<Constant>(*SimplifiedV))
return cast<Constant>(*SimplifiedV);
}
const auto &ValueSimplifyAA =
getAAFor<AAValueSimplify>(AA, IRP, DepClassTy::NONE);
Optional<Value *> SimplifiedV =
ValueSimplifyAA.getAssumedSimplifiedValue(*this);
bool IsKnown = ValueSimplifyAA.isAtFixpoint();
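To see how this new callback path is fed, here is a minimal sketch, using only the Attributor API that appears later in this patch, of an outside AA publishing a simplification callback that the lookup above consults (`CSArgPos` is a hypothetical call-site-argument position):

  // Sketch only: announce that the argument at CSArgPos may simplify to
  // `false` while the fixpoint iteration is still in flight.
  Attributor::SimplifictionCallbackTy CB =
      [&](const IRPosition &IRP, const AbstractAttribute *AA,
          bool &UsedAssumedInformation) -> Optional<Value *> {
    UsedAssumedInformation = true; // assumed, not yet known
    return ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0);
  };
  A.registerSimplificationCallback(CSArgPos, CB);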


@@ -25,6 +25,9 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
@@ -70,6 +73,15 @@ STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
"Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
"Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
"Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machine");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
"Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines with fallback");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
"Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode with customized state machines without fallback");
STATISTIC(
NumOpenMPParallelRegionsReplacedInGPUStateMachine,
"Number of OpenMP parallel regions replaced with ID in GPU state machines");
@@ -228,6 +240,11 @@ struct OMPInformationCache : public InformationCache {
/// Map from functions to all uses of this runtime function contained in
/// them.
DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;

public:
/// Iterators for the uses of this runtime function.
decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
decltype(UsesMap)::iterator end() { return UsesMap.end(); }
};
/// An OpenMP-IR-Builder instance
@@ -238,6 +255,9 @@ struct OMPInformationCache : public InformationCache {
RuntimeFunction::OMPRTL___last>
RFIs;

/// Map from function declarations/definitions to their runtime enum type.
DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;

/// Map from ICV kind to the ICV description.
EnumeratedArray<InternalControlVarInfo, InternalControlVar,
InternalControlVar::ICV___last>
@@ -380,6 +400,7 @@ struct OMPInformationCache : public InformationCache {
SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
Function *F = M.getFunction(_Name); \
if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
RuntimeFunctionIDMap[F] = _Enum; \
auto &RFI = RFIs[_Enum]; \
RFI.Kind = _Enum; \
RFI.Name = _Name; \
@@ -408,6 +429,141 @@ struct OMPInformationCache : public InformationCache {
SmallPtrSetImpl<Kernel> &Kernels;
};
template <typename Ty, bool InsertInvalidates = true>
struct BooleanStateWithPtrSetVector : public BooleanState {
bool contains(Ty *Elem) const { return Set.contains(Elem); }
bool insert(Ty *Elem) {
if (InsertInvalidates)
BooleanState::indicatePessimisticFixpoint();
return Set.insert(Elem);
}
Ty *operator[](int Idx) const { return Set[Idx]; }
bool operator==(const BooleanStateWithPtrSetVector &RHS) const {
return BooleanState::operator==(RHS) && Set == RHS.Set;
}
bool operator!=(const BooleanStateWithPtrSetVector &RHS) const {
return !(*this == RHS);
}
bool empty() const { return Set.empty(); }
size_t size() const { return Set.size(); }
/// "Clamp" this state with \p RHS.
BooleanStateWithPtrSetVector &
operator^=(const BooleanStateWithPtrSetVector &RHS) {
BooleanState::operator^=(RHS);
Set.insert(RHS.Set.begin(), RHS.Set.end());
return *this;
}
private:
/// A set to keep track of elements.
SetVector<Ty *> Set;
public:
typename decltype(Set)::iterator begin() { return Set.begin(); }
typename decltype(Set)::iterator end() { return Set.end(); }
typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
typename decltype(Set)::const_iterator end() const { return Set.end(); }
};
struct KernelInfoState : AbstractState {
/// Flag to track if we reached a fixpoint.
bool IsAtFixpoint = false;
/// The parallel regions (identified by the outlined parallel functions) that
/// can be reached from the associated function.
BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
ReachedKnownParallelRegions;
/// State to track what parallel region we might reach.
BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
/// The __kmpc_target_init call in this kernel, if any. If we find more than
/// one we abort as the kernel is malformed.
CallBase *KernelInitCB = nullptr;
/// The __kmpc_target_deinit call in this kernel, if any. If we find more than
/// one we abort as the kernel is malformed.
CallBase *KernelDeinitCB = nullptr;
/// Abstract State interface
///{
KernelInfoState() {}
KernelInfoState(bool BestState) {
if (!BestState)
indicatePessimisticFixpoint();
}
/// See AbstractState::isValidState(...)
bool isValidState() const override { return true; }
/// See AbstractState::isAtFixpoint(...)
bool isAtFixpoint() const override { return IsAtFixpoint; }
/// See AbstractState::indicatePessimisticFixpoint(...)
ChangeStatus indicatePessimisticFixpoint() override {
IsAtFixpoint = true;
ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
return ChangeStatus::CHANGED;
}
/// See AbstractState::indicateOptimisticFixpoint(...)
ChangeStatus indicateOptimisticFixpoint() override {
IsAtFixpoint = true;
return ChangeStatus::UNCHANGED;
}
/// Return the assumed state
KernelInfoState &getAssumed() { return *this; }
const KernelInfoState &getAssumed() const { return *this; }
bool operator==(const KernelInfoState &RHS) const {
if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
return false;
if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
return false;
return true;
}
/// Return empty set as the best state of potential values.
static KernelInfoState getBestState() { return KernelInfoState(true); }
static KernelInfoState getBestState(KernelInfoState &KIS) {
return getBestState();
}
/// Return full set as the worst state of potential values.
static KernelInfoState getWorstState() { return KernelInfoState(false); }
/// "Clamp" this state with \p KIS.
KernelInfoState operator^=(const KernelInfoState &KIS) {
// Do not merge two different _init and _deinit call sites.
if (KIS.KernelInitCB) {
if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
indicatePessimisticFixpoint();
KernelInitCB = KIS.KernelInitCB;
}
if (KIS.KernelDeinitCB) {
if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
indicatePessimisticFixpoint();
KernelDeinitCB = KIS.KernelDeinitCB;
}
ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
return *this;
}
KernelInfoState operator&=(const KernelInfoState &KIS) {
return (*this ^= KIS);
}
///}
};
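As a quick illustration of the clamp semantics above (a hypothetical walk-through, not part of the patch):

  // Sketch: merging a callee's KernelInfoState into the caller's state.
  KernelInfoState Caller = KernelInfoState::getBestState();
  KernelInfoState Callee = KernelInfoState::getBestState();
  // Suppose Callee recorded one known outlined parallel function and one
  // unknown call site. After the clamp below:
  // - ReachedKnownParallelRegions is the union of both sets; inserting into
  //   it never invalidates the state (InsertInvalidates == false).
  // - ReachedUnknownParallelRegions is the union too, but any insertion into
  //   it pessimistically invalidates the boolean state.
  // - Had the two states carried different __kmpc_target_init/_deinit call
  //   sites, the merge would force a pessimistic fixpoint.
  Caller ^= Callee;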
/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {
@@ -522,7 +678,7 @@ struct OpenMPOpt {
<< OMPInfoCache.ModuleSlice.size() << " functions\n");

if (IsModulePass) {
Changed |= runAttributor(IsModulePass);

// Recollect uses, in case Attributor deleted any.
OMPInfoCache.recollectUses();
@@ -535,14 +691,14 @@
if (PrintOpenMPKernels)
printKernels();

Changed |= runAttributor(IsModulePass);

// Recollect uses, in case Attributor deleted any.
OMPInfoCache.recollectUses();

Changed |= deleteParallelRegions();
Changed |= rewriteDeviceCodeStateMachine();
if (HideMemoryTransferLatency)
Changed |= hideMemTransfersLatency();
Changed |= deduplicateRuntimeCalls();
@@ -1573,11 +1729,11 @@ private:
Attributor &A;

/// Helper function to run Attributor on SCC.
bool runAttributor(bool IsModulePass) {
if (SCC.empty())
return false;

registerAAs(IsModulePass);

ChangeStatus Changed = A.run();
@@ -1589,46 +1745,7 @@ private:
/// Populate the Attributor with abstract attribute opportunities in the
/// function.
void registerAAs(bool IsModulePass);
};
Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
@@ -1766,7 +1883,7 @@ bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
// TODO: Checking the number of uses is not a necessary restriction and
// should be lifted.
if (UnknownUse || NumDirectCalls != 1 ||
ToBeReplacedStateMachineUses.size() > 2) {
{
auto Remark = [&](OptimizationRemarkAnalysis ORA) {
return ORA << "Parallel region is used in "
@@ -2541,9 +2658,587 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
SmallPtrSet<CallBase *, 4> MallocCalls;
};
struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
/// Statistics are tracked as part of manifest for now.
void trackStatistics() const override {}
/// See AbstractAttribute::getAsStr()
const std::string getAsStr() const override {
if (!isValidState())
return "<invalid>";
return std::string(" #PRs: ") +
std::to_string(ReachedKnownParallelRegions.size()) +
", #Unknown PRs: " +
std::to_string(ReachedUnknownParallelRegions.size());
}
/// Create an abstract attribute view for the position \p IRP.
static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
/// See AbstractAttribute::getName()
const std::string getName() const override { return "AAKernelInfo"; }
/// See AbstractAttribute::getIdAddr()
const char *getIdAddr() const override { return &ID; }
/// This function should return true if the type of the \p AA is AAKernelInfo
static bool classof(const AbstractAttribute *AA) {
return (AA->getIdAddr() == &ID);
}
static const char ID;
};
/// The function kernel info abstract attribute, basically, what can we say
/// about a function with regards to the KernelInfoState.
struct AAKernelInfoFunction : AAKernelInfo {
AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
: AAKernelInfo(IRP, A) {}
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
// This is a high-level transform that might change the constant arguments
// of the init and deinit calls. We need to tell the Attributor about this
// to avoid other parts using the current constant value for simplification.
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
Function *Fn = getAnchorScope();
if (!OMPInfoCache.Kernels.count(Fn))
return;
OMPInformationCache::RuntimeFunctionInfo &InitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
// For kernels we perform more initialization work, first we find the init
// and deinit calls.
auto StoreCallBase = [](Use &U,
OMPInformationCache::RuntimeFunctionInfo &RFI,
CallBase *&Storage) {
CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
assert(CB &&
"Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
assert(!Storage &&
"Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
Storage = CB;
return false;
};
InitRFI.foreachUse(
[&](Use &U, Function &) {
StoreCallBase(U, InitRFI, KernelInitCB);
return false;
},
Fn);
DeinitRFI.foreachUse(
[&](Use &U, Function &) {
StoreCallBase(U, DeinitRFI, KernelDeinitCB);
return false;
},
Fn);
assert((KernelInitCB && KernelDeinitCB) &&
"Kernel without __kmpc_target_init or __kmpc_target_deinit!");
// For kernels we need to register a simplification callback so that the
// Attributor knows the constant arguments to __kmpc_target_init and
// __kmpc_target_deinit might actually change.
Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
[&](const IRPosition &IRP, const AbstractAttribute *AA,
bool &UsedAssumedInformation) -> Optional<Value *> {
// IRP represents the "use generic state machine" argument of an
// __kmpc_target_init call. We will answer this one with the internal
// state. As long as we are not in an invalid state, we will create a
// custom state machine so the value should be a `i1 false`. If we are
// in an invalid state, we won't change the value that is in the IR.
if (!isValidState())
return nullptr;
if (AA)
A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
UsedAssumedInformation = !isAtFixpoint();
auto *FalseVal =
ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0);
return FalseVal;
};
constexpr const int InitUseStateMachineArgNo = 2;
A.registerSimplificationCallback(
IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
StateMachineSimplifyCB);
}
/// Modify the IR based on the KernelInfoState as the fixpoint iteration is
/// finished now.
ChangeStatus manifest(Attributor &A) override {
// If we are not looking at a kernel with __kmpc_target_init and
// __kmpc_target_deinit call we cannot actually manifest the information.
if (!KernelInitCB || !KernelDeinitCB)
return ChangeStatus::UNCHANGED;
buildCustomStateMachine(A);
return ChangeStatus::CHANGED;
}
ChangeStatus buildCustomStateMachine(Attributor &A) {
assert(ReachedKnownParallelRegions.isValidState() &&
"Custom state machine with invalid parallel region states?");
const int InitIsSPMDArgNo = 1;
const int InitUseStateMachineArgNo = 2;
// Check if the current configuration is non-SPMD and uses the generic state
// machine. If we already have SPMD mode or a custom state machine we do not
// need to go any further. If either argument is anything but a constant,
// something is weird and we give up.
ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
ConstantInt *IsSPMD =
dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo));
// If we are stuck with generic mode, try to create a custom device (=GPU)
// state machine which is specialized for the parallel regions that are
// reachable by the kernel.
if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD ||
!IsSPMD->isZero())
return ChangeStatus::UNCHANGED;
// First, indicate we use a custom state machine now.
auto &Ctx = getAnchorValue().getContext();
auto *FalseVal = ConstantInt::getBool(Ctx, 0);
A.changeUseAfterManifest(
KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
// If we don't actually need a state machine we are done here. This can
// happen if there simply are no parallel regions. In the resulting kernel
// all worker threads will simply exit right away, leaving the main thread
// to do the work alone.
if (ReachedKnownParallelRegions.empty() &&
ReachedUnknownParallelRegions.empty()) {
++NumOpenMPTargetRegionKernelsWithoutStateMachine;
auto Remark = [&](OptimizationRemark OR) {
return OR << "Generic-mode kernel is executed without state machine "
"(good)";
};
A.emitRemark<OptimizationRemark>(
KernelInitCB, "OpenMPKernelWithoutStateMachine", Remark);
return ChangeStatus::CHANGED;
}
// Keep track in the statistics of our new shiny custom state machine.
if (ReachedUnknownParallelRegions.empty()) {
++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
auto Remark = [&](OptimizationRemark OR) {
return OR << "Generic-mode kernel is executed with a customized state "
"machine ["
<< ore::NV("ParallelRegions",
ReachedKnownParallelRegions.size())
<< " known parallel regions] (good).";
};
A.emitRemark<OptimizationRemark>(
KernelInitCB, "OpenMPKernelWithCustomizedStateMachine", Remark);
} else {
++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
auto Remark = [&](OptimizationRemark OR) {
return OR << "Generic-mode kernel is executed with a customized state "
"machine that requires a fallback ["
<< ore::NV("ParallelRegions",
ReachedKnownParallelRegions.size())
<< " known parallel regions, "
<< ore::NV("UnknownParallelRegions",
ReachedUnknownParallelRegions.size())
<< " unkown parallel regions] (bad).";
};
A.emitRemark<OptimizationRemark>(
KernelInitCB, "OpenMPKernelWithCustomizedStateMachineAndFallback",
Remark);
// Tell the user why we ended up with a fallback.
for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
if (!UnknownParallelRegionCB)
continue;
auto Remark = [&](OptimizationRemarkAnalysis ORA) {
return ORA
<< "State machine fallback caused by this call. If it is a "
"false positive, use "
"`__attribute__((assume(\"omp_no_openmp\"))` "
"(or \"omp_no_parallelism\").";
};
A.emitRemark<OptimizationRemarkAnalysis>(
UnknownParallelRegionCB,
"OpenMPKernelWithCustomizedStateMachineAndFallback", Remark);
}
}
// Create all the blocks:
//
// InitCB = __kmpc_target_init(...)
// bool IsWorker = InitCB >= 0;
// if (IsWorker) {
// SMBeginBB: __kmpc_barrier_simple_spmd(...);
// void *WorkFn;
// bool Active = __kmpc_kernel_parallel(&WorkFn);
// if (!WorkFn) return;
// SMIsActiveCheckBB: if (Active) {
// SMIfCascadeCurrentBB: if (WorkFn == <ParFn0>)
// ParFn0(...);
// SMIfCascadeCurrentBB: else if (WorkFn == <ParFn1>)
// ParFn1(...);
// ...
// SMIfCascadeCurrentBB: else
// ((WorkFnTy*)WorkFn)(...);
// SMEndParallelBB: __kmpc_kernel_end_parallel(...);
// }
// SMDoneBB: __kmpc_barrier_simple_spmd(...);
// goto SMBeginBB;
// }
// UserCodeEntryBB: // user code
// __kmpc_target_deinit(...)
//
Function *Kernel = getAssociatedFunction();
assert(Kernel && "Expected an associated function!");
BasicBlock *InitBB = KernelInitCB->getParent();
BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
KernelInitCB->getNextNode(), "thread.user_code.check");
BasicBlock *StateMachineBeginBB = BasicBlock::Create(
Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(
Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
BasicBlock *StateMachineIfCascadeCurrentBB =
BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
Kernel, UserCodeEntryBB);
BasicBlock *StateMachineEndParallelBB =
BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end",
Kernel, UserCodeEntryBB);
BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(
Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);
InitBB->getTerminator()->eraseFromParent();
Instruction *IsWorker =
ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
ConstantInt::get(KernelInitCB->getType(), -1),
"thread.is_worker", InitBB);
IsWorker->setDebugLoc(DLoc);
BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB);
// Create local storage for the work function pointer.
Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "worker.work_fn.addr",
&Kernel->getEntryBlock().front());
WorkFnAI->setDebugLoc(DLoc);
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
OMPInfoCache.OMPBuilder.updateToLocation(
OpenMPIRBuilder::LocationDescription(
IRBuilder<>::InsertPoint(StateMachineBeginBB,
StateMachineBeginBB->end()),
DLoc));
Value *Ident = KernelInitCB->getArgOperand(0);
Value *GTid = KernelInitCB;
Module &M = *Kernel->getParent();
FunctionCallee BarrierFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_barrier_simple_spmd);
CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB)
->setDebugLoc(DLoc);
FunctionCallee KernelParallelFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_kernel_parallel);
Instruction *IsActiveWorker = CallInst::Create(
KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
IsActiveWorker->setDebugLoc(DLoc);
Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
StateMachineBeginBB);
WorkFn->setDebugLoc(DLoc);
FunctionType *ParallelRegionFnTy = FunctionType::get(
Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
false);
Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast",
StateMachineBeginBB);
Instruction *IsDone =
ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn,
Constant::getNullValue(VoidPtrTy), "worker.is_done",
StateMachineBeginBB);
IsDone->setDebugLoc(DLoc);
BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
IsDone, StateMachineBeginBB)
->setDebugLoc(DLoc);
BranchInst::Create(StateMachineIfCascadeCurrentBB,
StateMachineDoneBarrierBB, IsActiveWorker,
StateMachineIsActiveCheckBB)
->setDebugLoc(DLoc);
Value *ZeroArg =
Constant::getNullValue(ParallelRegionFnTy->getParamType(0));
// Now that we have most of the CFG skeleton it is time for the if-cascade
// that checks the function pointer we got from the runtime against the
// parallel regions we expect, if there are any.
for (int i = 0, e = ReachedKnownParallelRegions.size(); i < e; ++i) {
auto *ParallelRegion = ReachedKnownParallelRegions[i];
BasicBlock *PRExecuteBB = BasicBlock::Create(
Ctx, "worker_state_machine.parallel_region.execute", Kernel,
StateMachineEndParallelBB);
CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)
->setDebugLoc(DLoc);
BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB)
->setDebugLoc(DLoc);
BasicBlock *PRNextBB =
BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
Kernel, StateMachineEndParallelBB);
// Check if we need to compare the pointer at all or if we can just
// call the parallel region function.
Value *IsPR;
if (i + 1 < e || !ReachedUnknownParallelRegions.empty()) {
Instruction *CmpI = ICmpInst::Create(
ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion,
"worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
CmpI->setDebugLoc(DLoc);
IsPR = CmpI;
} else {
IsPR = ConstantInt::getTrue(Ctx);
}
BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,
StateMachineIfCascadeCurrentBB)
->setDebugLoc(DLoc);
StateMachineIfCascadeCurrentBB = PRNextBB;
}
// At the end of the if-cascade we place the indirect function pointer call
// in case we might need it, that is if there can be parallel regions we
// have not handled in the if-cascade above.
if (!ReachedUnknownParallelRegions.empty()) {
StateMachineIfCascadeCurrentBB->setName(
"worker_state_machine.parallel_region.fallback.execute");
CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "",
StateMachineIfCascadeCurrentBB)
->setDebugLoc(DLoc);
}
BranchInst::Create(StateMachineEndParallelBB,
StateMachineIfCascadeCurrentBB)
->setDebugLoc(DLoc);
CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_kernel_end_parallel),
{}, "", StateMachineEndParallelBB)
->setDebugLoc(DLoc);
BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
->setDebugLoc(DLoc);
CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)
->setDebugLoc(DLoc);
BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)
->setDebugLoc(DLoc);
return ChangeStatus::CHANGED;
}
/// Fixpoint iteration update function. Will be called every time a dependence
/// changed its state (and in the beginning).
ChangeStatus updateImpl(Attributor &A) override {
KernelInfoState StateBefore = getState();
// Callback to check a call instruction.
auto CheckCallInst = [&](Instruction &I) {
auto &CB = cast<CallBase>(I);
auto &CBAA = A.getAAFor<AAKernelInfo>(
*this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
if (CBAA.getState().isValidState())
getState() ^= CBAA.getState();
return true;
};
if (!A.checkForAllCallLikeInstructions(CheckCallInst, *this))
return indicatePessimisticFixpoint();
return StateBefore == getState() ? ChangeStatus::UNCHANGED
: ChangeStatus::CHANGED;
}
};
/// The call site kernel info abstract attribute, basically, what can we say
/// about a call site with regards to the KernelInfoState. For now this simply
/// forwards the information from the callee.
struct AAKernelInfoCallSite : AAKernelInfo {
AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
: AAKernelInfo(IRP, A) {}
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
AAKernelInfo::initialize(A);
CallBase &CB = cast<CallBase>(getAssociatedValue());
Function *Callee = getAssociatedFunction();
// Helper to lookup an assumption string.
auto HasAssumption = [](Function *Fn, StringRef AssumptionStr) {
return Fn && hasAssumption(*Fn, AssumptionStr);
};
// First weed out calls we do not care about, that is readonly/readnone
// calls, intrinsics, and "no_openmp" calls. None of these can reach a
// parallel region or anything else we are looking for.
if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) {
indicateOptimisticFixpoint();
return;
}
// Next we check if we know the callee. If it is a known OpenMP function
// we will handle them explicitly in the switch below. If it is not, we
// will use an AAKernelInfo object on the callee to gather information and
// merge that into the current state. The latter happens in the updateImpl.
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
// Unknown callees or declarations are not analyzable, so we give up.
if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {
// Unknown callees might contain parallel regions, except if they have
// an appropriate assumption attached.
if (!(HasAssumption(Callee, "omp_no_openmp") ||
HasAssumption(Callee, "omp_no_parallelism")))
ReachedUnknownParallelRegions.insert(&CB);
// We have updated the state for this unknown call properly, there won't
// be any change so we indicate a fixpoint.
indicateOptimisticFixpoint();
}
// If the callee is known and can be used in IPO, we will update the state
// based on the callee state in updateImpl.
return;
}
const unsigned int WrapperFunctionArgNo = 6;
RuntimeFunction RF = It->getSecond();
switch (RF) {
case OMPRTL___kmpc_target_init:
KernelInitCB = &CB;
break;
case OMPRTL___kmpc_target_deinit:
KernelDeinitCB = &CB;
break;
case OMPRTL___kmpc_parallel_51:
if (auto *ParallelRegion = dyn_cast<Function>(
CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
ReachedKnownParallelRegions.insert(ParallelRegion);
break;
}
// The condition above should usually get the parallel region function
// pointer and record it. In the off chance it doesn't we assume the
// worst.
ReachedUnknownParallelRegions.insert(&CB);
break;
case OMPRTL___kmpc_omp_task:
// We do not look into tasks right now, just give up.
ReachedUnknownParallelRegions.insert(&CB);
break;
default:
break;
}
// All other OpenMP runtime calls will not reach parallel regions so they
// can be safely ignored for now. Since it is a known OpenMP runtime call we
// have now modeled all effects and there is no need for any update.
indicateOptimisticFixpoint();
}
ChangeStatus updateImpl(Attributor &A) override {
// TODO: Once we have call site specific value information we can provide
// call site specific liveness information and then it makes
// sense to specialize attributes for call sites arguments instead of
// redirecting requests to the callee argument.
Function *F = getAssociatedFunction();
const IRPosition &FnPos = IRPosition::function(*F);
auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
if (getState() == FnAA.getState())
return ChangeStatus::UNCHANGED;
getState() = FnAA.getState();
return ChangeStatus::CHANGED;
}
};
} // namespace
void OpenMPOpt::registerAAs(bool IsModulePass) {
if (SCC.empty())
return;
if (IsModulePass) {
// Ensure we create the AAKernelInfo AAs first and without triggering an
// update. This will make sure we register all value simplification
// callbacks before any other AA has the chance to create an AAValueSimplify
// or similar.
for (Function *Kernel : OMPInfoCache.Kernels)
A.getOrCreateAAFor<AAKernelInfo>(
IRPosition::function(*Kernel), /* QueryingAA */ nullptr,
DepClassTy::NONE, /* ForceUpdate */ false,
/* UpdateAfterInit */ false);
}
// Create CallSite AA for all Getters.
for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
auto CreateAA = [&](Use &U, Function &Caller) {
CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
if (!CI)
return false;
auto &CB = cast<CallBase>(*CI);
IRPosition CBPos = IRPosition::callsite_function(CB);
A.getOrCreateAAFor<AAICVTracker>(CBPos);
return false;
};
GetterRFI.foreachUse(SCC, CreateAA);
}
auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
auto CreateAA = [&](Use &U, Function &F) {
A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
return false;
};
GlobalizationRFI.foreachUse(SCC, CreateAA);
// Create an ExecutionDomain AA for every function and a HeapToStack AA for
// every function if there is a device kernel.
for (auto *F : SCC) {
if (!F->isDeclaration())
A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
if (isOpenMPDevice(M))
A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
}
}
const char AAICVTracker::ID = 0;
const char AAKernelInfo::ID = 0;
const char AAExecutionDomain::ID = 0;
const char AAHeapToShared::ID = 0;
@@ -2615,6 +3310,28 @@ AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
return *AA;
}
AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
Attributor &A) {
AAKernelInfo *AA = nullptr;
switch (IRP.getPositionKind()) {
case IRPosition::IRP_INVALID:
case IRPosition::IRP_FLOAT:
case IRPosition::IRP_ARGUMENT:
case IRPosition::IRP_RETURNED:
case IRPosition::IRP_CALL_SITE_RETURNED:
case IRPosition::IRP_CALL_SITE_ARGUMENT:
llvm_unreachable("KernelInfo can only be created for function position!");
case IRPosition::IRP_CALL_SITE:
AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
break;
case IRPosition::IRP_FUNCTION:
AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
break;
}
return *AA;
}
PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
if (!containsOpenMP(M))
return PreservedAnalyses::all();
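The fallback remarks emitted above point users at Clang's string assumptions; a minimal sketch of how a false positive would be silenced at the source level, assuming the attribute spelling referenced in the remark:

  // Declares that calls to `unknown` neither contain nor reach OpenMP
  // parallelism, so the kernel needs no indirect-call fallback for them.
  __attribute__((assume("omp_no_openmp")))
  void unknown(void);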

File diff suppressed because it is too large.


@@ -0,0 +1,224 @@
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
target triple = "nvptx64"
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback [1 known parallel regions, 2 unknown parallel regions] (bad)
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:13:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp")))` (or "omp_no_parallelism")
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:15:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp")))` (or "omp_no_parallelism")
; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:20:1: Generic-mode kernel is executed with a customized state machine [1 known parallel regions] (good)
;; void unknown(void);
;; void known(void) {
;; #pragma omp parallel
;; {
;; unknown();
;; }
;; }
;;
;; void test_fallback(void) {
;; #pragma omp target teams
;; {
;; unknown();
;; known();
;; unknown();
;; }
;; }
;;
;; void test_no_fallback(void) {
;; #pragma omp target teams
;; {
;; known();
;; known();
;; known();
;; }
;; }
%struct.ident_t = type { i32, i32, i32, i32, i8* }
@0 = private unnamed_addr constant [113 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([113 x i8], [113 x i8]* @0, i32 0, i32 0) }, align 8
@2 = private unnamed_addr constant [82 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;test_fallback;11;1;;\00", align 1
@3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([82 x i8], [82 x i8]* @2, i32 0, i32 0) }, align 8
@4 = private unnamed_addr constant [114 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;25;;\00", align 1
@5 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([114 x i8], [114 x i8]* @4, i32 0, i32 0) }, align 8
@__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode = weak constant i8 1
@6 = private unnamed_addr constant [116 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;1;;\00", align 1
@7 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([116 x i8], [116 x i8]* @6, i32 0, i32 0) }, align 8
@8 = private unnamed_addr constant [85 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;test_no_fallback;20;1;;\00", align 1
@9 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([85 x i8], [85 x i8]* @8, i32 0, i32 0) }, align 8
@10 = private unnamed_addr constant [117 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;25;;\00", align 1
@11 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([117 x i8], [117 x i8]* @10, i32 0, i32 0) }, align 8
@__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1
@12 = private unnamed_addr constant [73 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;known;4;1;;\00", align 1
@13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([73 x i8], [73 x i8]* @12, i32 0, i32 0) }, align 8
@llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode, i8* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode], section "llvm.metadata"
; Function Attrs: convergent norecurse nounwind
define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11() local_unnamed_addr #0 !dbg !15 {
entry:
%captured_vars_addrs.i.i = alloca [0 x i8*], align 8
%0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) #3, !dbg !18
%exec_user_code = icmp eq i32 %0, -1, !dbg !18
br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !18
common.ret: ; preds = %entry, %user_code.entry
ret void, !dbg !19
user_code.entry: ; preds = %entry
%1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @3) #3
call void @unknown() #6, !dbg !20
%2 = bitcast [0 x i8*]* %captured_vars_addrs.i.i to i8*
call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
%3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
%4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i.i, i64 0, i64 0, !dbg !23
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !23
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !26
call void @unknown() #6, !dbg !27
call void @__kmpc_target_deinit(%struct.ident_t* nonnull @5, i1 false, i1 true) #3, !dbg !28
br label %common.ret
}
declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) local_unnamed_addr
; Function Attrs: convergent
declare void @unknown() local_unnamed_addr #1
; Function Attrs: nounwind
define hidden void @known() local_unnamed_addr #2 !dbg !29 {
entry:
%captured_vars_addrs = alloca [0 x i8*], align 8
%0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @13)
%1 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs, i64 0, i64 0, !dbg !30
call void @__kmpc_parallel_51(%struct.ident_t* nonnull @13, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** nonnull %1, i64 0) #3, !dbg !30
ret void, !dbg !31
}
; Function Attrs: nounwind
declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #3
declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) local_unnamed_addr
; Function Attrs: norecurse nounwind
define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20() local_unnamed_addr #4 !dbg !32 {
entry:
%captured_vars_addrs.i2.i = alloca [0 x i8*], align 8
%0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @7, i1 false, i1 true, i1 true) #3, !dbg !33
%exec_user_code = icmp eq i32 %0, -1, !dbg !33
br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !33
common.ret: ; preds = %entry, %user_code.entry
ret void, !dbg !34
user_code.entry: ; preds = %entry
%1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @9) #3
%2 = bitcast [0 x i8*]* %captured_vars_addrs.i2.i to i8*
call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
%3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
%4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i2.i, i64 0, i64 0, !dbg !35
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !35
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !39
call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
%5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %5, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !40
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !42
call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
%6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %6, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !43
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45
call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i1 false, i1 true) #3, !dbg !46
br label %common.ret
}
; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__2(i32* noalias nocapture nofree readnone %.global_tid., i32* noalias nocapture nofree readnone %.bound_tid.) #0 !dbg !47 {
entry:
call void @unknown() #6, !dbg !48
ret void, !dbg !49
}
; Function Attrs: convergent norecurse nounwind
define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #0 !dbg !50 {
entry:
%global_args = alloca i8**, align 8
call void @__kmpc_get_shared_variables(i8*** nonnull %global_args) #3, !dbg !51
call void @unknown() #6, !dbg !52
ret void, !dbg !51
}
declare void @__kmpc_get_shared_variables(i8***) local_unnamed_addr
declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) local_unnamed_addr
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5
attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #3 = { nounwind }
attributes #4 = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #5 = { argmemonly nofree nosync nounwind willreturn }
attributes #6 = { convergent nounwind }
!llvm.dbg.cu = !{!0}
!omp_offload.info = !{!3, !4}
!nvvm.annotations = !{!5, !6}
!llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13}
!llvm.ident = !{!14}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "custom_state_machines_remarks.c", directory: "/data/src/llvm-project")
!2 = !{}
!3 = !{i32 0, i32 42, i32 14159165, !"test_no_fallback", i32 20, i32 1}
!4 = !{i32 0, i32 42, i32 14159165, !"test_fallback", i32 11, i32 0}
!5 = !{void ()* @__omp_offloading_2a_d80d3d_test_fallback_l11, !"kernel", i32 1}
!6 = !{void ()* @__omp_offloading_2a_d80d3d_test_no_fallback_l20, !"kernel", i32 1}
!7 = !{i32 7, !"Dwarf Version", i32 2}
!8 = !{i32 2, !"Debug Info Version", i32 3}
!9 = !{i32 1, !"wchar_size", i32 4}
!10 = !{i32 7, !"openmp", i32 50}
!11 = !{i32 7, !"openmp-device", i32 50}
!12 = !{i32 7, !"PIC Level", i32 2}
!13 = !{i32 7, !"frame-pointer", i32 2}
!14 = !{!"clang version 13.0.0"}
!15 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_fallback_l11", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!16 = !DIFile(filename: "llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c", directory: "/data/src/llvm-project")
!17 = !DISubroutineType(types: !2)
!18 = !DILocation(line: 11, column: 1, scope: !15)
!19 = !DILocation(line: 0, scope: !15)
!20 = !DILocation(line: 13, column: 5, scope: !21, inlinedAt: !22)
!21 = distinct !DISubprogram(name: "__omp_outlined__", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!22 = distinct !DILocation(line: 11, column: 1, scope: !15)
!23 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !25)
!24 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!25 = distinct !DILocation(line: 14, column: 5, scope: !21, inlinedAt: !22)
!26 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !25)
!27 = !DILocation(line: 15, column: 5, scope: !21, inlinedAt: !22)
!28 = !DILocation(line: 11, column: 25, scope: !15)
!29 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!30 = !DILocation(line: 4, column: 1, scope: !29)
!31 = !DILocation(line: 8, column: 1, scope: !29)
!32 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_no_fallback_l20", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!33 = !DILocation(line: 20, column: 1, scope: !32)
!34 = !DILocation(line: 0, scope: !32)
!35 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !36)
!36 = distinct !DILocation(line: 22, column: 5, scope: !37, inlinedAt: !38)
!37 = distinct !DISubprogram(name: "__omp_outlined__1", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!38 = distinct !DILocation(line: 20, column: 1, scope: !32)
!39 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !36)
!40 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !41)
!41 = distinct !DILocation(line: 23, column: 5, scope: !37, inlinedAt: !38)
!42 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !41)
!43 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !44)
!44 = distinct !DILocation(line: 24, column: 5, scope: !37, inlinedAt: !38)
!45 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !44)
!46 = !DILocation(line: 20, column: 25, scope: !32)
!47 = distinct !DISubprogram(name: "__omp_outlined__2", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!48 = !DILocation(line: 6, column: 5, scope: !47)
!49 = !DILocation(line: 7, column: 3, scope: !47)
!50 = distinct !DISubprogram(linkageName: "__omp_outlined__2_wrapper", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagArtificial, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!51 = !DILocation(line: 4, column: 1, scope: !50)
!52 = !DILocation(line: 6, column: 5, scope: !47, inlinedAt: !53)
!53 = distinct !DILocation(line: 4, column: 1, scope: !50)


@@ -7,15 +7,19 @@ target triple = "nvptx64"
; CHECK: remark: globalization_remarks.c:5:7: Could not move globalized variable to the stack. Variable is potentially captured.
; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
%struct.ident_t = type { i32, i32, i32, i32, i8* }
@S = external local_unnamed_addr global i8*

define void @foo() {
entry:
%c = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true)
%0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !10
%x_on_stack = bitcast i8* %0 to i32*
%1 = bitcast i32* %x_on_stack to i8*
call void @share(i8* %1)
call void @__kmpc_free_shared(i8* %0)
call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true)
ret void
}
@@ -29,6 +33,8 @@ declare i8* @__kmpc_alloc_shared(i64)
declare void @__kmpc_free_shared(i8*)
declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}


@@ -7,25 +7,34 @@ target triple = "nvptx64"
; CHECK-REMARKS: remark: remove_globalization.c:4:2: Could not move globalized variable to the stack. Variable is potentially captured. Mark as noescape to override.
; CHECK-REMARKS: remark: remove_globalization.c:2:2: Moving globalized variable to the stack.
; CHECK-REMARKS: remark: remove_globalization.c:6:2: Moving globalized variable to the stack.
; CHECK-REMARKS: remark: remove_globalization.c:4:2: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
@S = external local_unnamed_addr global i8*
%struct.ident_t = type { i32, i32, i32, i32, i8* }
declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
define void @kernel() {
; CHECK-LABEL: define {{[^@]+}}@kernel() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i1 false, i1 false, i1 true)
; CHECK-NEXT: call void @foo() #[[ATTR0:[0-9]+]]
; CHECK-NEXT: call void @bar() #[[ATTR0]]
; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i1 false, i1 true)
; CHECK-NEXT: ret void
;
entry:
%0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull null, i1 false, i1 true, i1 true)
call void @foo()
call void @bar()
call void @__kmpc_target_deinit(%struct.ident_t* nonnull null, i1 false, i1 true)
ret void
}
define internal void @foo() {
; CHECK-LABEL: define {{[^@]+}}@foo
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1
; CHECK-NEXT: ret void
@@ -41,8 +50,8 @@ define internal void @bar() {
; CHECK-LABEL: define {{[^@]+}}@bar
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR0]], !dbg [[DBG8:![0-9]+]]
; CHECK-NEXT: call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: call void @__kmpc_free_shared(i8* [[TMP0]]) #[[ATTR0]]
; CHECK-NEXT: ret void
;
@@ -54,13 +63,18 @@ entry:
}
define internal void @use(i8* %x) {
; CHECK-LABEL: define {{[^@]+}}@use
; CHECK-SAME: (i8* noalias nocapture nofree readnone [[X:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
ret void
}
define internal void @share(i8* %x) {
; CHECK-LABEL: define {{[^@]+}}@share
; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: store i8* [[X]], i8** @S, align 8
; CHECK-NEXT: ret void
@@ -71,6 +85,12 @@ entry:
}
define void @unused() {
; CHECK-LABEL: define {{[^@]+}}@unused() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 1
; CHECK-NEXT: call void @use(i8* noalias readnone undef)
; CHECK-NEXT: ret void
;
entry:
%0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !14
call void @use(i8* %0)


@@ -20,18 +20,22 @@ target triple = "nvptx64"
; CHECK: call void @__kmpc_free_shared({{.*}})

define dso_local void @foo() {
entry:
%c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
%x = call i8* @__kmpc_alloc_shared(i64 4)
%x_on_stack = bitcast i8* %x to i32*
%0 = bitcast i32* %x_on_stack to i8*
call void @use(i8* %0)
call void @__kmpc_free_shared(i8* %x)
call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
ret void
}
define void @bar() {
%c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
call void @baz()
call void @qux()
call void @negative_qux_spmd()
call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
ret void
}
@@ -104,6 +108,8 @@ declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5, !6}
!nvvm.annotations = !{!7, !8}


@@ -8,25 +8,36 @@
@0 = private unnamed_addr constant [1 x i8] c"\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @0, i32 0, i32 0) }, align 8
; CHECK-NOT: [openmp-opt] Basic block @kernel entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @kernel if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @kernel if.else is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @kernel if.end is executed by a single thread.
define void @kernel() {
%call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 false)
%cmp = icmp eq i32 %call, -1
br i1 %cmp, label %if.then, label %if.else
if.then:
call void @nvptx()
call void @amdgcn()
br label %if.end
if.else:
br label %if.end
if.end:
call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true)
ret void
}
; REMARKS: remark: single_threaded_execution.c:1:0: Could not internalize function. Some optimizations may not be possible.
; REMARKS-NOT: remark: single_threaded_execution.c:1:0: Could not internalize function. Some optimizations may not be possible.
; CHECK-DAG: [openmp-opt] Basic block @nvptx entry is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @nvptx if.then is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @nvptx if.end is executed by a single thread.
; Function Attrs: noinline
define internal void @nvptx() {
entry:
br i1 true, label %if.then, label %if.end

if.then:
call void @foo()
@@ -39,15 +50,13 @@ if.end:
ret void
}
; CHECK-DAG: [openmp-opt] Basic block @amdgcn entry is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread.
; CHECK-DAG: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread.
; Function Attrs: noinline
define internal void @amdgcn() {
entry:
br i1 false, label %if.then, label %if.end

if.then:
call void @foo()
@@ -95,6 +104,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
declare void @__kmpc_kernel_prepare_parallel(i8*)
declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
attributes #0 = { cold noinline }