[OpenMP] Add OpenMP data sharing infrastructure using global memory

Summary:
This patch handles the Clang code generation phase for the OpenMP data sharing infrastructure.

TODO: add a more detailed description.
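As a rough illustration (adapted from the new device data-sharing regression test added at the end of this patch), this is the kind of pattern the infrastructure targets: a local variable declared in a target region that escapes into nested parallel regions can no longer live in a plain alloca on the device.

void test_ds() {
#pragma omp target
  {
    int a = 10;          // escapes into the parallel regions below
#pragma omp parallel
    { a = 1000; }
    int b = 100;
#pragma omp parallel
    { b = a + 10000; }   // both a and b are referenced by workers
  }
}

In generic (non-SPMD) mode such variables are collected into an implicit _globalized_locals_ty record allocated via __kmpc_data_sharing_push_stack and released via __kmpc_data_sharing_pop_stack, and their addresses are handed to the worker threads through __kmpc_begin_sharing_variables / __kmpc_get_shared_variables and the generated _wrapper functions.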

Reviewers: ABataev, carlo.bertolli, caomhin, hfinkel, Hahnfeld

Reviewed By: ABataev

Subscribers: jholewinski, guansong, cfe-commits

Differential Revision: https://reviews.llvm.org/D43660

llvm-svn: 327513
This commit is contained in:
Gheorghe-Teodor Bercea 2018-03-14 14:17:45 +00:00
parent 81ccb97024
commit d3dcf2f05d
9 changed files with 980 additions and 271 deletions


@@ -1068,9 +1068,17 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
}
// A normal fixed sized variable becomes an alloca in the entry block,
// unless it's an NRVO variable.
// unless:
// - it's an NRVO variable.
// - we are compiling OpenMP and it's an OpenMP local variable.
if (NRVO) {
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
address = OpenMPLocalAddr;
} else if (NRVO) {
// The named return value optimization: allocate this variable in the
// return slot, so that we can elide the copy when returning this
// variable (C++0x [class.copy]p34).
@@ -1896,9 +1904,18 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
}
}
} else {
// Otherwise, create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr");
// Check if the parameter address is controlled by OpenMP runtime.
Address OpenMPLocalAddr =
getLangOpts().OpenMP
? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
: Address::invalid();
if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
DeclPtr = OpenMPLocalAddr;
} else {
// Otherwise, create a temporary to hold the value.
DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
D.getName() + ".addr");
}
DoStore = true;
}


@@ -8100,6 +8100,11 @@ Address CGOpenMPRuntime::getParameterAddress(CodeGenFunction &CGF,
return CGF.GetAddrOfLocalVar(NativeParam);
}
Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF,
const VarDecl *VD) {
return Address::invalid();
}
llvm::Value *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {


@@ -676,7 +676,7 @@ public:
/// \brief Cleans up references to the objects in finished function.
///
void functionFinished(CodeGenFunction &CGF);
virtual void functionFinished(CodeGenFunction &CGF);
/// \brief Emits code for parallel or serial call of the \a OutlinedFn with
/// variables captured in a record which address is stored in \a
@@ -1362,6 +1362,14 @@ public:
emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> Args = llvm::None) const;
/// Emits OpenMP-specific function prolog.
/// Required for device constructs.
virtual void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) {}
/// Gets the OpenMP-specific address of the local variable.
virtual Address getAddressOfLocalVariable(CodeGenFunction &CGF,
const VarDecl *VD);
};
/// Class supports emission of SIMD-only code.


@@ -13,9 +13,11 @@
//===----------------------------------------------------------------------===//
#include "CGOpenMPRuntimeNVPTX.h"
#include "clang/AST/DeclOpenMP.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "llvm/ADT/SmallPtrSet.h"
using namespace clang;
using namespace CodeGen;
@@ -70,7 +72,21 @@ enum OpenMPRTLFunctionNVPTX {
/// index, int32_t width, int32_t reduce))
OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
/// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
OMPRTL_NVPTX__kmpc_end_reduce_nowait
OMPRTL_NVPTX__kmpc_end_reduce_nowait,
/// \brief Call to void __kmpc_data_sharing_init_stack();
OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
/// \brief Call to void* __kmpc_data_sharing_push_stack(size_t size,
/// int16_t UseSharedMemory);
OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
/// \brief Call to void __kmpc_data_sharing_pop_stack(void *a);
OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
/// \brief Call to void __kmpc_begin_sharing_variables(void ***args,
/// size_t n_args);
OMPRTL_NVPTX__kmpc_begin_sharing_variables,
/// \brief Call to void __kmpc_end_sharing_variables();
OMPRTL_NVPTX__kmpc_end_sharing_variables,
/// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
OMPRTL_NVPTX__kmpc_get_shared_variables,
};
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
@@ -149,6 +165,245 @@ enum NamedBarrier : unsigned {
/// barrier.
NB_Parallel = 1,
};
/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
: public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
CodeGenFunction &CGF;
llvm::SetVector<const ValueDecl *> EscapedDecls;
llvm::SmallPtrSet<const ValueDecl *, 4> IgnoredDecls;
bool AllEscaped = false;
RecordDecl *GlobalizedRD = nullptr;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
void markAsEscaped(const ValueDecl *VD) {
if (IgnoredDecls.count(VD) ||
(CGF.CapturedStmtInfo &&
CGF.CapturedStmtInfo->lookup(cast<VarDecl>(VD))))
return;
EscapedDecls.insert(VD);
}
void VisitValueDecl(const ValueDecl *VD) {
if (VD->getType()->isLValueReferenceType()) {
markAsEscaped(VD);
if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(VarD->getInit());
AllEscaped = SavedAllEscaped;
}
}
}
}
void VisitOpenMPCapturedStmt(const CapturedStmt *S) {
if (!S)
return;
for (const auto &C : S->captures()) {
if (C.capturesVariable() && !C.capturesVariableByCopy()) {
const ValueDecl *VD = C.getCapturedVar();
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
}
}
}
typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
return P1.first > P2.first;
}
void buildRecordForGlobalizedVars() {
assert(!GlobalizedRD &&
"Record for globalized variables is built already.");
if (EscapedDecls.empty())
return;
ASTContext &C = CGF.getContext();
SmallVector<VarsDataTy, 4> GlobalizedVars;
for (const auto *D : EscapedDecls)
GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
stable_sort_comparator);
// Build struct _globalized_locals_ty {
// /* globalized vars */
// };
GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
GlobalizedRD->startDefinition();
for (const auto &Pair : GlobalizedVars) {
const ValueDecl *VD = Pair.second;
QualType Type = VD->getType();
if (Type->isLValueReferenceType())
Type = C.getPointerType(Type.getNonReferenceType());
else
Type = Type.getNonReferenceType();
SourceLocation Loc = VD->getLocation();
auto *Field = FieldDecl::Create(
C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
C.getTrivialTypeSourceInfo(Type, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
GlobalizedRD->addDecl(Field);
if (VD->hasAttrs()) {
for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
E(VD->getAttrs().end());
I != E; ++I)
Field->addAttr(*I);
}
MappedDeclsFields.try_emplace(VD, Field);
}
GlobalizedRD->completeDefinition();
}
public:
CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
ArrayRef<const ValueDecl *> IgnoredDecls)
: CGF(CGF), IgnoredDecls(IgnoredDecls.begin(), IgnoredDecls.end()) {}
virtual ~CheckVarsEscapingDeclContext() = default;
void VisitDeclStmt(const DeclStmt *S) {
if (!S)
return;
for (const auto *D : S->decls())
if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
VisitValueDecl(VD);
}
void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
if (!D)
return;
if (D->hasAssociatedStmt()) {
if (const auto *S =
dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt()))
VisitOpenMPCapturedStmt(S);
}
}
void VisitCapturedStmt(const CapturedStmt *S) {
if (!S)
return;
for (const auto &C : S->captures()) {
if (C.capturesVariable() && !C.capturesVariableByCopy()) {
const ValueDecl *VD = C.getCapturedVar();
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
}
}
}
void VisitLambdaExpr(const LambdaExpr *E) {
if (!E)
return;
for (const auto &C : E->captures()) {
if (C.capturesVariable()) {
if (C.getCaptureKind() == LCK_ByRef) {
const ValueDecl *VD = C.getCapturedVar();
markAsEscaped(VD);
if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
}
}
}
}
void VisitBlockExpr(const BlockExpr *E) {
if (!E)
return;
for (const auto &C : E->getBlockDecl()->captures()) {
if (C.isByRef()) {
const VarDecl *VD = C.getVariable();
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
VisitValueDecl(VD);
}
}
}
void VisitCallExpr(const CallExpr *E) {
if (!E)
return;
for (const Expr *Arg : E->arguments()) {
if (!Arg)
continue;
if (Arg->isLValue()) {
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(Arg);
AllEscaped = SavedAllEscaped;
} else
Visit(Arg);
}
Visit(E->getCallee());
}
void VisitDeclRefExpr(const DeclRefExpr *E) {
if (!E)
return;
const ValueDecl *VD = E->getDecl();
if (AllEscaped)
markAsEscaped(VD);
if (isa<OMPCapturedExprDecl>(VD))
VisitValueDecl(VD);
else if (const auto *VarD = dyn_cast<VarDecl>(VD))
if (VarD->isInitCapture())
VisitValueDecl(VD);
}
void VisitUnaryOperator(const UnaryOperator *E) {
if (!E)
return;
if (E->getOpcode() == UO_AddrOf) {
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(E->getSubExpr());
AllEscaped = SavedAllEscaped;
} else
Visit(E->getSubExpr());
}
void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
if (!E)
return;
if (E->getCastKind() == CK_ArrayToPointerDecay) {
const bool SavedAllEscaped = AllEscaped;
AllEscaped = true;
Visit(E->getSubExpr());
AllEscaped = SavedAllEscaped;
} else
Visit(E->getSubExpr());
}
void VisitExpr(const Expr *E) {
if (!E)
return;
bool SavedAllEscaped = AllEscaped;
if (!E->isLValue())
AllEscaped = false;
for (const auto *Child : E->children())
if (Child)
Visit(Child);
AllEscaped = SavedAllEscaped;
}
void VisitStmt(const Stmt *S) {
if (!S)
return;
for (const auto *Child : S->children())
if (Child)
Visit(Child);
}
const RecordDecl *getGlobalizedRecord() {
if (!GlobalizedRD)
buildRecordForGlobalizedVars();
return GlobalizedRD;
}
const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
assert(GlobalizedRD &&
"Record for globalized variables must be generated already.");
auto I = MappedDeclsFields.find(VD);
if (I == MappedDeclsFields.end())
return nullptr;
return I->getSecond();
}
ArrayRef<const ValueDecl *> getEscapedDecls() const {
return EscapedDecls.getArrayRef();
}
};
} // anonymous namespace
/// Get the GPU warp size.
@@ -288,6 +543,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
EntryFunctionState EST;
WorkerFunctionState WST(CGM, D.getLocStart());
Work.clear();
WrapperFunctionsMap.clear();
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
@@ -344,6 +600,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
CGF.EmitBlock(MasterBB);
// SEQUENTIAL (MASTER) REGION START
// First action in sequential region:
// Initialize the state of the OpenMP runtime library on the GPU.
// TODO: Optimize runtime initialization and pass in correct value.
@@ -351,10 +608,65 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
Bld.getInt16(/*RequiresOMPRuntime=*/1)};
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
// For data sharing, we need to initialize the stack.
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I == FunctionGlobalizedDecls.end())
return;
const RecordDecl *GlobalizedVarsRecord = I->getSecond().first;
QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
// Recover pointer to this function's global record. The runtime will
// handle the specifics of the allocation of the memory.
// Use actual memory size of the record including the padding
// for alignment purposes.
unsigned Alignment =
CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
unsigned GlobalRecordSize =
CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
// TODO: allow the usage of shared memory to be controlled by
// the user, for now, default to global.
llvm::Value *GlobalRecordSizeArg[] = {
llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
GlobalRecordSizeArg);
llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
FunctionToGlobalRecPtr.try_emplace(CGF.CurFn, GlobalRecValue);
// Emit the "global alloca" which is a GEP from the global declaration record
// using the pointer returned by the runtime.
LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
auto &Res = I->getSecond().second;
for (auto &Rec : Res) {
const FieldDecl *FD = Rec.second.first;
LValue VarAddr = CGF.EmitLValueForField(Base, FD);
Rec.second.second = VarAddr.getAddress();
}
}
void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
EntryFunctionState &EST) {
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I != FunctionGlobalizedDecls.end()) {
if (!CGF.HaveInsertPoint())
return;
auto I = FunctionToGlobalRecPtr.find(CGF.CurFn);
if (I != FunctionToGlobalRecPtr.end()) {
llvm::Value *Args[] = {I->getSecond()};
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
Args);
}
}
if (!EST.ExitBB)
EST.ExitBB = CGF.createBasicBlock(".exit");
@@ -543,14 +855,13 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
// Execute this outlined function.
CGF.EmitBlock(ExecuteFNBB);
// Insert call to work function.
// FIXME: Pass arguments to outlined function from master thread.
auto *Fn = cast<llvm::Function>(W);
Address ZeroAddr =
CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
emitCall(CGF, WST.Loc, Fn, FnArgs);
// Insert call to work function via shared wrapper. The shared
// wrapper takes two arguments:
// - the parallelism level;
// - the master thread ID;
emitOutlinedFunctionCall(CGF, WST.Loc, W,
{Bld.getInt16(/*ParallelLevel=*/0),
getMasterThreadID(CGF)});
// Go to end of parallel region.
CGF.EmitBranch(TerminateBB);
@@ -619,8 +930,7 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
/// Build void __kmpc_kernel_prepare_parallel(
/// void *outlined_function, int16_t IsOMPRuntimeInitialized);
llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
CGM.Int16Ty};
llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
@@ -758,6 +1068,56 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
break;
}
case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
/// Build void __kmpc_data_sharing_init_stack();
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
break;
}
case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
// Build void *__kmpc_data_sharing_push_stack(size_t size,
// int16_t UseSharedMemory);
llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(
FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
break;
}
case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
// Build void __kmpc_data_sharing_pop_stack(void *a);
llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(FnTy,
/*Name=*/"__kmpc_data_sharing_pop_stack");
break;
}
case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
/// Build void __kmpc_begin_sharing_variables(void ***args,
/// size_t n_args);
llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
break;
}
case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
/// Build void __kmpc_end_sharing_variables();
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
break;
}
case OMPRTL_NVPTX__kmpc_get_shared_variables: {
/// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
llvm::FunctionType *FnTy =
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
break;
}
}
return RTLFn;
}
@@ -847,8 +1207,16 @@ void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar,
InnermostKind, CodeGen);
auto *OutlinedFun =
cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
D, ThreadIDVar, InnermostKind, CodeGen));
if (!isInSpmdExecutionMode()) {
llvm::Function *WrapperFun =
createParallelDataSharingWrapper(OutlinedFun, D);
WrapperFunctionsMap[OutlinedFun] = WrapperFun;
}
return OutlinedFun;
}
llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
@@ -900,16 +1268,58 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
llvm::Function *WFn = WrapperFunctionsMap[Fn];
auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) {
assert(WFn && "Wrapper function does not exist!");
// Force inline this outlined function at its call site.
Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF,
PrePostActionTy &) {
CGBuilderTy &Bld = CGF.Builder;
// TODO: Optimize runtime initialization.
llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy),
/*RequiresOMPRuntime=*/Bld.getInt16(1)};
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
Args);
llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
// Prepare for parallel region. Indicate the outlined function.
llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
Args);
// Create a private scope that will globalize the arguments
// passed from the outside of the target region.
CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
// There's something to share.
if (!CapturedVars.empty()) {
// Begin sharing variables: obtain the list that will hold references to the
// captured variables.
Address SharedArgs =
CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
llvm::Value *DataSharingArgs[] = {
SharedArgsPtr,
llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_begin_sharing_variables),
DataSharingArgs);
// Store variable address in a list of references to pass to workers.
unsigned Idx = 0;
ASTContext &Ctx = CGF.getContext();
Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs,
Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
.castAs<PointerType>());
for (llvm::Value *V : CapturedVars) {
Address Dst = Bld.CreateConstInBoundsGEP(
SharedArgListAddress, Idx, CGF.getPointerSize());
llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
Ctx.getPointerType(Ctx.VoidPtrTy));
Idx++;
}
}
// Activate workers. This barrier is used by the master to signal
// work for the workers.
@@ -923,8 +1333,12 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
// The master waits at this barrier until all workers are done.
syncCTAThreads(CGF);
if (!CapturedVars.empty())
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
// Remember for post-processing in worker loop.
Work.emplace_back(Fn);
Work.emplace_back(WFn);
};
auto *RTLoc = emitUpdateLocation(CGF, Loc);
@@ -2343,3 +2757,149 @@ void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
}
CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}
/// Emit a function which wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
ASTContext &Ctx = CGM.getContext();
const auto &CS = *D.getCapturedStmt(OMPD_parallel);
// Create a function that takes as argument the source thread.
FunctionArgList WrapperArgs;
QualType Int16QTy =
Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
QualType Int32QTy =
Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
/*Id=*/nullptr, Int16QTy,
ImplicitParamDecl::Other);
ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
/*Id=*/nullptr, Int32QTy,
ImplicitParamDecl::Other);
WrapperArgs.emplace_back(&ParallelLevelArg);
WrapperArgs.emplace_back(&WrapperArg);
auto &CGFI =
CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
auto *Fn = llvm::Function::Create(
CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
CGM.SetInternalFunctionAttributes(/*D=*/GlobalDecl(), Fn, CGFI);
Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
D.getLocStart(), D.getLocStart());
const auto *RD = CS.getCapturedRecordDecl();
auto CurField = RD->field_begin();
// Get the array of arguments.
SmallVector<llvm::Value *, 8> Args;
// TODO: support SIMD and pass actual values
Args.emplace_back(
llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
Args.emplace_back(
llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
CGBuilderTy &Bld = CGF.Builder;
auto CI = CS.capture_begin();
// Use global memory for data sharing.
// Handle passing of global args to workers.
Address GlobalArgs =
CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
DataSharingArgs);
// Retrieve the shared variables from the list of references returned
// by the runtime. Pass the variables to the outlined function.
if (CS.capture_size() > 0) {
ASTContext &CGFContext = CGF.getContext();
Address SharedArgListAddress = CGF.EmitLoadOfPointer(GlobalArgs,
CGFContext
.getPointerType(CGFContext.getPointerType(CGFContext.VoidPtrTy))
.castAs<PointerType>());
for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
QualType ElemTy = CurField->getType();
Address Src = Bld.CreateConstInBoundsGEP(
SharedArgListAddress, I, CGF.getPointerSize());
Address TypedAddress = Bld.CreateBitCast(
Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
/*Volatile=*/false,
CGFContext.getPointerType(ElemTy),
CI->getLocation());
Args.emplace_back(Arg);
}
}
emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
CGF.FinishFunction();
return Fn;
}
void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
const Decl *D) {
assert(D && "Expected function or captured|block decl.");
assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
"Function is registered already.");
SmallVector<const ValueDecl *, 4> IgnoredDecls;
const Stmt *Body = nullptr;
if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
Body = FD->getBody();
} else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
Body = BD->getBody();
} else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
Body = CD->getBody();
if (CGF.CapturedStmtInfo->getKind() == CR_OpenMP) {
if (const auto *CS = dyn_cast<CapturedStmt>(Body)) {
IgnoredDecls.reserve(CS->capture_size());
for (const auto &Capture : CS->captures())
if (Capture.capturesVariable())
IgnoredDecls.emplace_back(Capture.getCapturedVar());
}
}
}
if (!Body)
return;
CheckVarsEscapingDeclContext VarChecker(CGF, IgnoredDecls);
VarChecker.Visit(Body);
const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
if (!GlobalizedVarsRecord)
return;
auto &Res =
FunctionGlobalizedDecls
.try_emplace(CGF.CurFn, GlobalizedVarsRecord, DeclToAddrMapTy())
.first->getSecond()
.second;
for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
Res.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
}
}
Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
const VarDecl *VD) {
auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I == FunctionGlobalizedDecls.end())
return Address::invalid();
auto VDI = I->getSecond().second.find(VD);
if (VDI == I->getSecond().second.end())
return Address::invalid();
return VDI->second.second;
}
void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
FunctionToGlobalRecPtr.erase(CGF.CurFn);
FunctionGlobalizedDecls.erase(CGF.CurFn);
CGOpenMPRuntime::functionFinished(CGF);
}


@@ -289,6 +289,14 @@ public:
CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> Args = llvm::None) const override;
/// Emits OpenMP-specific function prolog.
/// Required for device constructs.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override;
/// Gets the OpenMP-specific address of the local variable.
Address getAddressOfLocalVariable(CodeGenFunction &CGF,
const VarDecl *VD) override;
/// Target codegen is specialized based on two programming models: the
/// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
/// model for constructs like 'target parallel' that support it.
@@ -300,12 +308,37 @@ public:
Unknown,
};
/// Cleans up references to the objects in finished function.
///
void functionFinished(CodeGenFunction &CGF) override;
private:
// Track the execution mode when codegening directives within a target
// region. The appropriate mode (generic/spmd) is set on entry to the
// target region and used by containing directives such as 'parallel'
// to emit optimized code.
ExecutionMode CurrentExecutionMode;
/// Map between an outlined function and its wrapper.
llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
/// Emit a function which wraps the outlined parallel region
/// and controls the parameters which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *createParallelDataSharingWrapper(
llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D);
/// The map of local variables to their addresses in the global memory.
using DeclToAddrMapTy = llvm::MapVector<const Decl *,
std::pair<const FieldDecl *, Address>>;
/// Maps the function to the list of the globalized variables with their
/// addresses.
llvm::DenseMap<llvm::Function *,
std::pair<const RecordDecl *, DeclToAddrMapTy>>
FunctionGlobalizedDecls;
/// Map from function to global record pointer.
llvm::DenseMap<llvm::Function *, llvm::Value *> FunctionToGlobalRecPtr;
};
} // CodeGen namespace.


@@ -585,6 +585,7 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S) {
/*RegisterCastedArgsOnly=*/true,
CapturedStmtInfo->getHelperName());
CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
Args.clear();
LocalAddrs.clear();
VLASizes.clear();


@@ -1067,6 +1067,11 @@ void CodeGenFunction::StartFunction(GlobalDecl GD,
EmitStartEHSpec(CurCodeDecl);
PrologueCleanupDepth = EHStack.stable_begin();
// Emit OpenMP specific initialization of the device functions.
if (getLangOpts().OpenMP && CurCodeDecl)
CGM.getOpenMPRuntime().emitFunctionProlog(*this, CurCodeDecl);
EmitFunctionProlog(*CurFnInfo, CurFn, Args);
if (D && isa<CXXMethodDecl>(D) && cast<CXXMethodDecl>(D)->isInstance()) {


@@ -0,0 +1,91 @@
// Test device global memory data sharing codegen.
///==========================================================================///
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
void test_ds(){
#pragma omp target
{
int a = 10;
#pragma omp parallel
{
a = 1000;
}
int b = 100;
#pragma omp parallel
{
b = a + 10000;
}
}
}
/// ========= In the kernel function ========= ///
// CK1: {{.*}}define void @__omp_offloading{{.*}}test_ds{{.*}}()
// CK1: [[SHAREDARGS1:%.+]] = alloca i8**
// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
// CK1: call void @__kmpc_kernel_init
// CK1: call void @__kmpc_data_sharing_init_stack
// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 0)
// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
// CK1: store i32 10, i32* [[A]]
// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1)
// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1)
// CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]]
// CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]], i64 0
// CK1: [[SHAREDVAR:%.+]] = bitcast i32* [[A]] to i8*
// CK1: store i8* [[SHAREDVAR]], i8** [[SHARGSTMP2]]
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @__kmpc_end_sharing_variables()
// CK1: store i32 100, i32* [[B]]
// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1)
// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS2]], i64 2)
// CK1: [[SHARGSTMP3:%.+]] = load i8**, i8*** [[SHAREDARGS2]]
// CK1: [[SHARGSTMP4:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 0
// CK1: [[SHAREDVAR1:%.+]] = bitcast i32* [[B]] to i8*
// CK1: store i8* [[SHAREDVAR1]], i8** [[SHARGSTMP4]]
// CK1: [[SHARGSTMP12:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 1
// CK1: [[SHAREDVAR2:%.+]] = bitcast i32* [[A]] to i8*
// CK1: store i8* [[SHAREDVAR2]], i8** [[SHARGSTMP12]]
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @__kmpc_end_sharing_variables()
// CK1: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]])
// CK1: call void @__kmpc_kernel_deinit(i16 1)
/// ========= In the data sharing wrapper function ========= ///
// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
// CK1: [[SHAREDARGS4:%.+]] = alloca i8**
// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS4]])
// CK1: [[SHARGSTMP13:%.+]] = load i8**, i8*** [[SHAREDARGS4]]
// CK1: [[SHARGSTMP14:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP13]], i64 0
// CK1: [[SHARGSTMP15:%.+]] = bitcast i8** [[SHARGSTMP14]] to i32**
// CK1: [[SHARGSTMP16:%.+]] = load i32*, i32** [[SHARGSTMP15]]
// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP16]])
/// ========= In the data sharing wrapper function ========= ///
// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
// CK1: [[SHAREDARGS3:%.+]] = alloca i8**
// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS3]])
// CK1: [[SHARGSTMP5:%.+]] = load i8**, i8*** [[SHAREDARGS3]]
// CK1: [[SHARGSTMP6:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 0
// CK1: [[SHARGSTMP7:%.+]] = bitcast i8** [[SHARGSTMP6]] to i32**
// CK1: [[SHARGSTMP8:%.+]] = load i32*, i32** [[SHARGSTMP7]]
// CK1: [[SHARGSTMP9:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 1
// CK1: [[SHARGSTMP10:%.+]] = bitcast i8** [[SHARGSTMP9]] to i32**
// CK1: [[SHARGSTMP11:%.+]] = load i32*, i32** [[SHARGSTMP10]]
// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP8]], i32* [[SHARGSTMP11]])
#endif


@@ -64,254 +64,243 @@ int bar(int n){
// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
//
// CHECK: [[AWAIT_WORK]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
//
// CHECK: [[SEL_WORKERS]]
// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
//
// CHECK: [[EXEC_PARALLEL]]
// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
//
// CHECK: [[EXEC_PFN1]]
// CHECK: call void [[PARALLEL_FN1]]_wrapper(
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT1]]
// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
//
// CHECK: [[EXEC_PFN2]]
// CHECK: call void [[PARALLEL_FN2]]_wrapper(
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT2]]
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[TERM_PARALLEL]]
// CHECK: call void @__kmpc_kernel_end_parallel()
// CHECK: br label {{%?}}[[BAR_PARALLEL]]
//
// CHECK: [[BAR_PARALLEL]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[AWAIT_WORK]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
// Create local storage for each capture.
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
// Store captures in the context.
// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
//
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
// CHECK: [[WORKER]]
// CHECK: {{call|invoke}} void [[T6]]_worker()
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK: [[CHECK_MASTER]]
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
//
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @__kmpc_serialized_parallel(
// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
// CHECK: call void @__kmpc_end_serialized_parallel(
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN2]]_wrapper to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK-64-DAG: load i32, i32* [[REF_A]]
// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
// CHECK: br label {{%?}}[[TERMINATE:.+]]
//
// CHECK: [[TERMINATE]]
// CHECK: call void @__kmpc_kernel_deinit(
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[EXIT]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK-DAG: define internal void [[PARALLEL_FN1]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 42, i[[SZ]]* %a,
// CHECK: ret void
// CHECK-DAG: define internal void [[PARALLEL_FN3]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 43, i[[SZ]]* %a,
// CHECK: ret void
// CHECK-DAG: define internal void [[PARALLEL_FN2]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 44, i[[SZ]]* %a,
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
//
// CHECK: [[AWAIT_WORK]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
//
// CHECK: [[SEL_WORKERS]]
// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
//
// CHECK: [[EXEC_PARALLEL]]
// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
//
// CHECK: [[EXEC_PFN1]]
// CHECK: call void [[PARALLEL_FN1]](
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT1]]
// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
//
// CHECK: [[EXEC_PFN2]]
// CHECK: call void [[PARALLEL_FN2]](
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT2]]
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[TERM_PARALLEL]]
// CHECK: call void @__kmpc_kernel_end_parallel()
// CHECK: br label {{%?}}[[BAR_PARALLEL]]
//
// CHECK: [[BAR_PARALLEL]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[AWAIT_WORK]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
//
// CHECK: [[AWAIT_WORK]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
//
// CHECK: [[SEL_WORKERS]]
// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
//
// CHECK: [[EXEC_PARALLEL]]
// CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i16, i32)* [[PARALLEL_FN4:@.+]]_wrapper to i8*)
// CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
//
// CHECK: [[EXEC_PFN]]
// CHECK: call void [[PARALLEL_FN4]]_wrapper(
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT]]
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[TERM_PARALLEL]]
// CHECK: call void @__kmpc_kernel_end_parallel()
// CHECK: br label {{%?}}[[BAR_PARALLEL]]
//
// CHECK: [[BAR_PARALLEL]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[AWAIT_WORK]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
// Create local storage for each capture.
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
// Store captures in the context.
// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
//
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
// CHECK: [[WORKER]]
// CHECK: {{call|invoke}} void [[T6]]_worker()
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK: [[CHECK_MASTER]]
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
//
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @__kmpc_serialized_parallel(
// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
// CHECK: call void @__kmpc_end_serialized_parallel(
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK-64-DAG: load i32, i32* [[REF_A]]
// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
// CHECK: br label {{%?}}[[TERMINATE:.+]]
//
// CHECK: [[TERMINATE]]
// CHECK: call void @__kmpc_kernel_deinit(
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[EXIT]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]]
// Create local storage for each capture.
// CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]],
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]],
// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]*
// CHECK-DAG: store i[[SZ]] [[ARG_N:%.+]], i[[SZ]]* [[LOCAL_N]]
// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
// Store captures in the context.
// CHECK-64-DAG:[[REF_N:%.+]] = bitcast i[[SZ]]* [[LOCAL_N]] to i32*
// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
// CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
//
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
// CHECK: [[WORKER]]
// CHECK: {{call|invoke}} void [[T6]]_worker()
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK: [[CHECK_MASTER]]
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
//
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// CHECK-64: [[N:%.+]] = load i32, i32* [[REF_N]],
// CHECK-32: [[N:%.+]] = load i32, i32* [[LOCAL_N]],
// CHECK: [[CMP:%.+]] = icmp sgt i32 [[N]], 1000
// CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]]
//
// CHECK: [[IF_THEN]]
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN4]]_wrapper to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[IF_END:.+]]
//
// CHECK: [[IF_ELSE]]
// CHECK: call void @__kmpc_serialized_parallel(
// CHECK: {{call|invoke}} void [[PARALLEL_FN4]](
// CHECK: call void @__kmpc_end_serialized_parallel(
// br label [[IF_END]]
//
// CHECK: [[IF_END]]
// CHECK-64-DAG: load i32, i32* [[REF_A]]
// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
// CHECK-DAG: load i16, i16* [[REF_AA]]
// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
//
// CHECK: br label {{%?}}[[TERMINATE:.+]]
//
// CHECK: [[TERMINATE]]
// CHECK: call void @__kmpc_kernel_deinit(
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[EXIT]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK-DAG: define internal void [[PARALLEL_FN1]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 42, i[[SZ]]* %a,
// CHECK: ret void
// CHECK-DAG: define internal void [[PARALLEL_FN3]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 43, i[[SZ]]* %a,
// CHECK: ret void
// CHECK-DAG: define internal void [[PARALLEL_FN2]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 44, i[[SZ]]* %a,
// CHECK: ret void
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
//
// CHECK: [[AWAIT_WORK]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
//
// CHECK: [[SEL_WORKERS]]
// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
//
// CHECK: [[EXEC_PARALLEL]]
// CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*)
// CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
//
// CHECK: [[EXEC_PFN]]
// CHECK: call void [[PARALLEL_FN4]](
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT]]
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[TERM_PARALLEL]]
// CHECK: call void @__kmpc_kernel_end_parallel()
// CHECK: br label {{%?}}[[BAR_PARALLEL]]
//
// CHECK: [[BAR_PARALLEL]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[AWAIT_WORK]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]]
// Create local storage for each capture.
// CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]],
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]],
// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]*
// CHECK-DAG: store i[[SZ]] [[ARG_N:%.+]], i[[SZ]]* [[LOCAL_N]]
// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
// Store captures in the context.
// CHECK-64-DAG:[[REF_N:%.+]] = bitcast i[[SZ]]* [[LOCAL_N]] to i32*
// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
// CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
//
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
//
// CHECK: [[WORKER]]
// CHECK: {{call|invoke}} void [[T6]]_worker()
// CHECK: br label {{%?}}[[EXIT:.+]]
//
// CHECK: [[CHECK_MASTER]]
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
//
// CHECK: [[MASTER]]
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
// CHECK-64: [[N:%.+]] = load i32, i32* [[REF_N]],
// CHECK-32: [[N:%.+]] = load i32, i32* [[LOCAL_N]],
// CHECK: [[CMP:%.+]] = icmp sgt i32 [[N]], 1000
// CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]]
//
// CHECK: [[IF_THEN]]
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN4]] to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[IF_END:.+]]
//
// CHECK: [[IF_ELSE]]
// CHECK: call void @__kmpc_serialized_parallel(
// CHECK: {{call|invoke}} void [[PARALLEL_FN4]](
// CHECK: call void @__kmpc_end_serialized_parallel(
// br label [[IF_END]]
//
// CHECK: [[IF_END]]
// CHECK-64-DAG: load i32, i32* [[REF_A]]
// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
// CHECK-DAG: load i16, i16* [[REF_AA]]
// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
//
// CHECK: br label {{%?}}[[TERMINATE:.+]]
//
// CHECK: [[TERMINATE]]
// CHECK: call void @__kmpc_kernel_deinit(
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[EXIT]]
//
// CHECK: [[EXIT]]
// CHECK: ret void
// CHECK: define internal void [[PARALLEL_FN4]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 45, i[[SZ]]* %a,
// CHECK: ret void
// CHECK: define internal void [[PARALLEL_FN4]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 45, i[[SZ]]* %a,
// CHECK: ret void
#endif