[OpenMP] Add OpenMP data sharing infrastructure using global memory

Summary:
This patch handles the Clang code generation phase for the OpenMP data
sharing infrastructure. TODO: add a more detailed description.

Reviewers: ABataev, carlo.bertolli, caomhin, hfinkel, Hahnfeld

Reviewed By: ABataev

Subscribers: jholewinski, guansong, cfe-commits

Differential Revision: https://reviews.llvm.org/D43660

llvm-svn: 327513
parent 81ccb97024
commit d3dcf2f05d
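The pattern this patch targets is easiest to see in the new data-sharing test added at the end of this diff: a local variable declared inside a device target region escapes into a nested parallel region, so it can no longer live in a thread-private alloca and must instead be allocated through the runtime's global-memory stack. A distilled version of that test:

    void escaping_local() {
    #pragma omp target
      {
        int a = 10;          // 'a' escapes into the parallel region below, so on
    #pragma omp parallel     // the device it is globalized rather than kept in a
        { a = 1000; }        // thread-private alloca
      }
    }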
@@ -1068,9 +1068,17 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
  }

  // A normal fixed sized variable becomes an alloca in the entry block,
  // unless it's an NRVO variable.
  // unless:
  // - it's an NRVO variable.
  // - we are compiling OpenMP and it's an OpenMP local variable.

  if (NRVO) {
  Address OpenMPLocalAddr =
      getLangOpts().OpenMP
          ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
          : Address::invalid();
  if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
    address = OpenMPLocalAddr;
  } else if (NRVO) {
    // The named return value optimization: allocate this variable in the
    // return slot, so that we can elide the copy when returning this
    // variable (C++0x [class.copy]p34).
@@ -1896,9 +1904,18 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg,
      }
    }
  } else {
    // Otherwise, create a temporary to hold the value.
    DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
                            D.getName() + ".addr");
    // Check if the parameter address is controlled by OpenMP runtime.
    Address OpenMPLocalAddr =
        getLangOpts().OpenMP
            ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
            : Address::invalid();
    if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
      DeclPtr = OpenMPLocalAddr;
    } else {
      // Otherwise, create a temporary to hold the value.
      DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D),
                              D.getName() + ".addr");
    }
    DoStore = true;
  }
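To make the new special cases above concrete, here is a hedged source-level illustration (the function and variable names are ours, not from the patch): 'result' takes the NRVO path and is constructed directly in the return slot, while 'a' escapes into a parallel region, so when compiling for a device getAddressOfLocalVariable() can hand back a globalized address; on the host it returns Address::invalid() and codegen falls through to the usual alloca path.

    #include <string>

    // NRVO variable: constructed directly in the return slot, no copy.
    std::string make_name() {
      std::string result = "kernel";
      return result;
    }

    // OpenMP local variable: on a device, 'a' may get its address from the
    // OpenMP runtime because it escapes into the parallel region; otherwise
    // a plain entry-block alloca is used.
    void device_local() {
    #pragma omp target
      {
        int a = 0;
    #pragma omp parallel
        { a = 1; }
      }
    }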
@@ -8100,6 +8100,11 @@ Address CGOpenMPRuntime::getParameterAddress(CodeGenFunction &CGF,
  return CGF.GetAddrOfLocalVar(NativeParam);
}

Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                   const VarDecl *VD) {
  return Address::invalid();
}

llvm::Value *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
@@ -676,7 +676,7 @@ public:

  /// \brief Cleans up references to the objects in finished function.
  ///
  void functionFinished(CodeGenFunction &CGF);
  virtual void functionFinished(CodeGenFunction &CGF);

  /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
  /// variables captured in a record whose address is stored in \a
@@ -1362,6 +1362,14 @@ public:
  emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc,
                           llvm::Value *OutlinedFn,
                           ArrayRef<llvm::Value *> Args = llvm::None) const;

  /// Emits OpenMP-specific function prolog.
  /// Required for device constructs.
  virtual void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) {}

  /// Gets the OpenMP-specific address of the local variable.
  virtual Address getAddressOfLocalVariable(CodeGenFunction &CGF,
                                            const VarDecl *VD);
};

/// Class supports emission of SIMD-only code.
@@ -13,9 +13,11 @@
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "clang/AST/DeclOpenMP.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;
@@ -70,7 +72,21 @@ enum OpenMPRTLFunctionNVPTX {
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// \brief Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// \brief Call to void* __kmpc_data_sharing_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
  /// \brief Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// \brief Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// \brief Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
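Collecting the \brief comments above, the new device runtime entry points this patch starts emitting calls to have the following C signatures (their device-side implementations live in the NVPTX OpenMP runtime library, not in this patch):

    void  __kmpc_data_sharing_init_stack();
    void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
    void  __kmpc_data_sharing_pop_stack(void *a);
    void  __kmpc_begin_sharing_variables(void ***args, size_t n_args);
    void  __kmpc_end_sharing_variables();
    void  __kmpc_get_shared_variables(void ***GlobalArgs);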
@@ -149,6 +165,245 @@ enum NamedBarrier : unsigned {
  /// barrier.
  NB_Parallel = 1,
};

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SmallPtrSet<const ValueDecl *, 4> IgnoredDecls;
  bool AllEscaped = false;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;

  void markAsEscaped(const ValueDecl *VD) {
    if (IgnoredDecls.count(VD) ||
        (CGF.CapturedStmtInfo &&
         CGF.CapturedStmtInfo->lookup(cast<VarDecl>(VD))))
      return;
    EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType()) {
      markAsEscaped(VD);
      if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
        if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
          const bool SavedAllEscaped = AllEscaped;
          AllEscaped = true;
          Visit(VarD->getInit());
          AllEscaped = SavedAllEscaped;
        }
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const auto &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }

  typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
  static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
    return P1.first > P2.first;
  }

  void buildRecordForGlobalizedVars() {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    if (EscapedDecls.empty())
      return;
    ASTContext &C = CGF.getContext();
    SmallVector<VarsDataTy, 4> GlobalizedVars;
    for (const auto *D : EscapedDecls)
      GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
    std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                     stable_sort_comparator);
    // Build struct _globalized_locals_ty {
    //   /* globalized vars */
    // };
    GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
    GlobalizedRD->startDefinition();
    for (const auto &Pair : GlobalizedVars) {
      const ValueDecl *VD = Pair.second;
      QualType Type = VD->getType();
      if (Type->isLValueReferenceType())
        Type = C.getPointerType(Type.getNonReferenceType());
      else
        Type = Type.getNonReferenceType();
      SourceLocation Loc = VD->getLocation();
      auto *Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      GlobalizedRD->addDecl(Field);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
      MappedDeclsFields.try_emplace(VD, Field);
    }
    GlobalizedRD->completeDefinition();
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> IgnoredDecls)
      : CGF(CGF), IgnoredDecls(IgnoredDecls.begin(), IgnoredDecls.end()) {}
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const auto *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (D->hasAssociatedStmt()) {
      if (const auto *S =
              dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt()))
        VisitOpenMPCapturedStmt(S);
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const auto &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const auto &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const auto &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else
        Visit(Arg);
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else
      Visit(E->getSubExpr());
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else
      Visit(E->getSubExpr());
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const auto *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const auto *Child : S->children())
      if (Child)
        Visit(Child);
  }

  const RecordDecl *getGlobalizedRecord() {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars();
    return GlobalizedRD;
  }

  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }
};
} // anonymous namespace

/// Get the GPU warp size.
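A hedged sketch of what the visitor above classifies as escaping; the snippet and the helper use() are illustrative only, but each comment names the visitor hook that fires:

    void use(int *);

    void body() {
      int a = 0, b = 0, arr[4], priv = 0;
      int &r = a;   // VisitValueDecl: 'r' has lvalue reference type, so 'r'
                    // escapes, and visiting its initializer marks 'a' too
      use(&b);      // VisitUnaryOperator: UO_AddrOf marks 'b' as escaped
      use(arr);     // VisitImplicitCastExpr: array-to-pointer decay, 'arr' escapes
      auto l = [&b] { return b; }; // VisitLambdaExpr: by-reference capture of 'b'
      priv += 1;    // plain value use: 'priv' stays a private alloca
    }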
@@ -288,6 +543,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getLocStart());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
@@ -344,6 +600,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
@@ -351,10 +608,65 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;
  const RecordDecl *GlobalizedVarsRecord = I->getSecond().first;
  QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);

  // Recover pointer to this function's global record. The runtime will
  // handle the specifics of the allocation of the memory.
  // Use actual memory size of the record including the padding
  // for alignment purposes.
  unsigned Alignment =
      CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
  unsigned GlobalRecordSize =
      CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
  // TODO: allow the usage of shared memory to be controlled by
  // the user, for now, default to global.
  llvm::Value *GlobalRecordSizeArg[] = {
      llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
      CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
      GlobalRecordSizeArg);
  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
  FunctionToGlobalRecPtr.try_emplace(CGF.CurFn, GlobalRecValue);

  // Emit the "global alloca" which is a GEP from the global declaration record
  // using the pointer returned by the runtime.
  LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
  auto &Res = I->getSecond().second;
  for (auto &Rec : Res) {
    const FieldDecl *FD = Rec.second.first;
    LValue VarAddr = CGF.EmitLValueForField(Base, FD);
    Rec.second.second = VarAddr.getAddress();
  }
}

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    if (!CGF.HaveInsertPoint())
      return;
    auto I = FunctionToGlobalRecPtr.find(CGF.CurFn);
    if (I != FunctionToGlobalRecPtr.end()) {
      llvm::Value *Args[] = {I->getSecond()};
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
          Args);
    }
  }

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");
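As a worked example of the sizing logic above: the record built for the test added later in this patch has two int fields, so getTypeSizeInChars gives 8 bytes, getTypeAlignInChars gives 4, and llvm::alignTo(8, 4) stays 8. That is exactly the call the test's FileCheck lines expect:

    // struct _globalized_locals_ty { int a; int b; }  =>  size 8, align 4
    // llvm::alignTo(8, 4) == 8
    call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 0)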
@@ -543,14 +855,13 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
  // Execute this outlined function.
  CGF.EmitBlock(ExecuteFNBB);

  // Insert call to work function.
  // FIXME: Pass arguments to outlined function from master thread.
  auto *Fn = cast<llvm::Function>(W);
  Address ZeroAddr =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
  llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
  emitCall(CGF, WST.Loc, Fn, FnArgs);
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  //   - the parallelism level;
  //   - the master thread ID;
  emitOutlinedFunctionCall(CGF, WST.Loc, W,
                           {Bld.getInt16(/*ParallelLevel=*/0),
                            getMasterThreadID(CGF)});

  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);
@@ -619,8 +930,7 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
                                CGM.Int16Ty};
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
@@ -758,6 +1068,56 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
    /// Build void __kmpc_data_sharing_init_stack();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
    // Build void *__kmpc_data_sharing_push_stack(size_t size,
    // int16_t UseSharedMemory);
    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
    // Build void __kmpc_data_sharing_pop_stack(void *a);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy,
                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
    /// Build void __kmpc_begin_sharing_variables(void ***args,
    /// size_t n_args);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
    /// Build void __kmpc_end_sharing_variables();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
    break;
  }
  }
  return RTLFn;
}
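Clang only declares these runtime functions; their device-side behavior is provided by the NVPTX OpenMP runtime library. As a rough mental model only (this is NOT the actual runtime implementation, and ignores per-team state and shared-memory placement), push/pop can be pictured as a bump-pointer stack in global memory:

    // Conceptual sketch, not the libomptarget-nvptx implementation.
    static char DataSharingStack[1 << 16]; // backing storage; size arbitrary here
    static size_t Top = 0;

    void *push_stack(size_t Size, int16_t /*UseSharedMemory*/) {
      void *Frame = &DataSharingStack[Top];
      Top += Size; // Size is already padded to the record alignment by the caller
      return Frame;
    }

    void pop_stack(void *Frame) {
      // Popping with the pointer returned by push releases that frame.
      Top = static_cast<char *>(Frame) - DataSharingStack;
    }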
@@ -847,8 +1207,16 @@ void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar,
                                                       InnermostKind, CodeGen);
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  if (!isInSpmdExecutionMode()) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
@@ -900,16 +1268,58 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
  llvm::Function *WFn = WrapperFunctionsMap[Fn];

  auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) {
  assert(WFn && "Wrapper function does not exist!");

  // Force inline this outlined function at its call site.
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF,
                                                    PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    // TODO: Optimize runtime initialization.
    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy),
                           /*RequiresOMPRuntime=*/Bld.getInt16(1)};
    CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
        Args);
    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);

    // Prepare for parallel region. Indicate the outlined function.
    llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
                            OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
                        Args);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      Address SharedArgs =
          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();

      llvm::Value *DataSharingArgs[] = {
          SharedArgsPtr,
          llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
      CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
                              OMPRTL_NVPTX__kmpc_begin_sharing_variables),
                          DataSharingArgs);

      // Store variable address in a list of references to pass to workers.
      unsigned Idx = 0;
      ASTContext &Ctx = CGF.getContext();
      Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs,
          Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
              .castAs<PointerType>());
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstInBoundsGEP(
            SharedArgListAddress, Idx, CGF.getPointerSize());
        llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        Idx++;
      }
    }

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
@@ -923,8 +1333,12 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    if (!CapturedVars.empty())
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));

    // Remember for post-processing in worker loop.
    Work.emplace_back(Fn);
    Work.emplace_back(WFn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
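Reading the two hunks above together, the master thread drives one generic-mode parallel region with the following sequence, sketched in pseudo-C++ using the runtime entry points declared earlier (AddressOf is a placeholder for the emitted address computation; the if-clause and serialized paths are omitted):

    // Master side, per parallel region:
    __kmpc_kernel_prepare_parallel(WrapperID, /*RequiresOMPRuntime=*/1);
    if (!CapturedVars.empty()) {
      void **Refs;
      __kmpc_begin_sharing_variables(&Refs, CapturedVars.size());
      for (unsigned Idx = 0; Idx < CapturedVars.size(); ++Idx)
        Refs[Idx] = AddressOf(CapturedVars[Idx]); // store each capture's address
    }
    syncCTAThreads(); // barrier: release the workers
    syncCTAThreads(); // barrier: wait until the workers are done
    if (!CapturedVars.empty())
      __kmpc_end_sharing_variables();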
@@ -2343,3 +2757,149 @@ void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}

/// Emit function which wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  auto &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getLocStart(), D.getLocStart());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  // TODO: support SIMD and pass actual values
  Args.emplace_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  Args.emplace_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    Address SharedArgListAddress = CGF.EmitLoadOfPointer(GlobalArgs,
        CGFContext
            .getPointerType(CGFContext.getPointerType(CGFContext.VoidPtrTy))
            .castAs<PointerType>());
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(
          SharedArgListAddress, I, CGF.getPointerSize());
      Address TypedAddress = Bld.CreateBitCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}

void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                              const Decl *D) {
  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  SmallVector<const ValueDecl *, 4> IgnoredDecls;
  const Stmt *Body = nullptr;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    if (CGF.CapturedStmtInfo->getKind() == CR_OpenMP) {
      if (const auto *CS = dyn_cast<CapturedStmt>(Body)) {
        IgnoredDecls.reserve(CS->capture_size());
        for (const auto &Capture : CS->captures())
          if (Capture.capturesVariable())
            IgnoredDecls.emplace_back(Capture.getCapturedVar());
      }
    }
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF, IgnoredDecls);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
  if (!GlobalizedVarsRecord)
    return;
  auto &Res =
      FunctionGlobalizedDecls
          .try_emplace(CGF.CurFn, GlobalizedVarsRecord, DeclToAddrMapTy())
          .first->getSecond()
          .second;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Res.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
  }
}

Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                        const VarDecl *VD) {
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().second.find(VD);
  if (VDI == I->getSecond().second.end())
    return Address::invalid();
  return VDI->second.second;
}

void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
  FunctionToGlobalRecPtr.erase(CGF.CurFn);
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
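For the test added below, emitFunctionProlog runs the escape checker on the target-region body, finds that both a and b escape into parallel regions, and records them in FunctionGlobalizedDecls. Our rendering of the implicit record it builds, which the FileCheck lines later match as %struct._globalized_locals_ty:

    // Record built for test_ds() below:
    struct _globalized_locals_ty {
      int a; // field 0: 'int a = 10;'  -> GEP indices (0, 0)
      int b; // field 1: 'int b = 100;' -> GEP indices (0, 1)
    };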
@@ -289,6 +289,14 @@ public:
      CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
      ArrayRef<llvm::Value *> Args = llvm::None) const override;

  /// Emits OpenMP-specific function prolog.
  /// Required for device constructs.
  void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override;

  /// Gets the OpenMP-specific address of the local variable.
  Address getAddressOfLocalVariable(CodeGenFunction &CGF,
                                    const VarDecl *VD) override;

  /// Target codegen is specialized based on two programming models: the
  /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
  /// model for constructs like 'target parallel' that support it.

@@ -300,12 +308,37 @@ public:
    Unknown,
  };

  /// Cleans up references to the objects in finished function.
  ///
  void functionFinished(CodeGenFunction &CGF) override;

private:
  // Track the execution mode when codegening directives within a target
  // region. The appropriate mode (generic/spmd) is set on entry to the
  // target region and used by containing directives such as 'parallel'
  // to emit optimized code.
  ExecutionMode CurrentExecutionMode;

  /// Map between an outlined function and its wrapper.
  llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;

  /// Emit function which wraps the outlined parallel region
  /// and controls the parameters which are passed to this function.
  /// The wrapper ensures that the outlined function is called
  /// with the correct arguments when data is shared.
  llvm::Function *createParallelDataSharingWrapper(
      llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D);

  /// The map of local variables to their addresses in the global memory.
  using DeclToAddrMapTy = llvm::MapVector<const Decl *,
                                          std::pair<const FieldDecl *, Address>>;
  /// Maps the function to the list of the globalized variables with their
  /// addresses.
  llvm::DenseMap<llvm::Function *,
                 std::pair<const RecordDecl *, DeclToAddrMapTy>>
      FunctionGlobalizedDecls;
  /// Map from function to global record pointer.
  llvm::DenseMap<llvm::Function *, llvm::Value *> FunctionToGlobalRecPtr;
};

} // CodeGen namespace.
@@ -585,6 +585,7 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S) {
                                   /*RegisterCastedArgsOnly=*/true,
                                   CapturedStmtInfo->getHelperName());
  CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true);
  WrapperCGF.CapturedStmtInfo = CapturedStmtInfo;
  Args.clear();
  LocalAddrs.clear();
  VLASizes.clear();
@@ -1067,6 +1067,11 @@ void CodeGenFunction::StartFunction(GlobalDecl GD,
  EmitStartEHSpec(CurCodeDecl);

  PrologueCleanupDepth = EHStack.stable_begin();

  // Emit OpenMP specific initialization of the device functions.
  if (getLangOpts().OpenMP && CurCodeDecl)
    CGM.getOpenMPRuntime().emitFunctionProlog(*this, CurCodeDecl);

  EmitFunctionProlog(*CurFnInfo, CurFn, Args);

  if (D && isa<CXXMethodDecl>(D) && cast<CXXMethodDecl>(D)->isInstance()) {
@@ -0,0 +1,91 @@
// Test device global memory data sharing codegen.
///==========================================================================///

// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1

// expected-no-diagnostics

#ifndef HEADER
#define HEADER

void test_ds(){
  #pragma omp target
  {
    int a = 10;
    #pragma omp parallel
    {
      a = 1000;
    }
    int b = 100;
    #pragma omp parallel
    {
      b = a + 10000;
    }
  }
}

/// ========= In the kernel function ========= ///

// CK1: {{.*}}define void @__omp_offloading{{.*}}test_ds{{.*}}()
// CK1: [[SHAREDARGS1:%.+]] = alloca i8**
// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
// CK1: call void @__kmpc_kernel_init
// CK1: call void @__kmpc_data_sharing_init_stack
// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 0)
// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
// CK1: store i32 10, i32* [[A]]
// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1)
// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1)
// CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]]
// CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]], i64 0
// CK1: [[SHAREDVAR:%.+]] = bitcast i32* [[A]] to i8*
// CK1: store i8* [[SHAREDVAR]], i8** [[SHARGSTMP2]]
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @__kmpc_end_sharing_variables()
// CK1: store i32 100, i32* [[B]]
// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1)
// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS2]], i64 2)
// CK1: [[SHARGSTMP3:%.+]] = load i8**, i8*** [[SHAREDARGS2]]
// CK1: [[SHARGSTMP4:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 0
// CK1: [[SHAREDVAR1:%.+]] = bitcast i32* [[B]] to i8*
// CK1: store i8* [[SHAREDVAR1]], i8** [[SHARGSTMP4]]
// CK1: [[SHARGSTMP12:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 1
// CK1: [[SHAREDVAR2:%.+]] = bitcast i32* [[A]] to i8*
// CK1: store i8* [[SHAREDVAR2]], i8** [[SHARGSTMP12]]
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @llvm.nvvm.barrier0()
// CK1: call void @__kmpc_end_sharing_variables()
// CK1: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]])
// CK1: call void @__kmpc_kernel_deinit(i16 1)

/// ========= In the data sharing wrapper function ========= ///

// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
// CK1: [[SHAREDARGS4:%.+]] = alloca i8**
// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS4]])
// CK1: [[SHARGSTMP13:%.+]] = load i8**, i8*** [[SHAREDARGS4]]
// CK1: [[SHARGSTMP14:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP13]], i64 0
// CK1: [[SHARGSTMP15:%.+]] = bitcast i8** [[SHARGSTMP14]] to i32**
// CK1: [[SHARGSTMP16:%.+]] = load i32*, i32** [[SHARGSTMP15]]
// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP16]])

/// ========= In the data sharing wrapper function ========= ///

// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
// CK1: [[SHAREDARGS3:%.+]] = alloca i8**
// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS3]])
// CK1: [[SHARGSTMP5:%.+]] = load i8**, i8*** [[SHAREDARGS3]]
// CK1: [[SHARGSTMP6:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 0
// CK1: [[SHARGSTMP7:%.+]] = bitcast i8** [[SHARGSTMP6]] to i32**
// CK1: [[SHARGSTMP8:%.+]] = load i32*, i32** [[SHARGSTMP7]]
// CK1: [[SHARGSTMP9:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 1
// CK1: [[SHARGSTMP10:%.+]] = bitcast i8** [[SHARGSTMP9]] to i32**
// CK1: [[SHARGSTMP11:%.+]] = load i32*, i32** [[SHARGSTMP10]]
// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP8]], i32* [[SHARGSTMP11]])

#endif
@ -64,254 +64,243 @@ int bar(int n){
|
|||
|
||||
// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
|
||||
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
|
||||
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
|
||||
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
|
||||
// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
|
||||
//
|
||||
// CHECK: [[AWAIT_WORK]]
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
|
||||
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
|
||||
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
|
||||
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
|
||||
// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
|
||||
//
|
||||
// CHECK: [[SEL_WORKERS]]
|
||||
// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
|
||||
// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
|
||||
// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[EXEC_PARALLEL]]
|
||||
// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
|
||||
// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
|
||||
//
|
||||
// CHECK: [[EXEC_PFN1]]
|
||||
// CHECK: call void [[PARALLEL_FN1]]_wrapper(
|
||||
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[CHECK_NEXT1]]
|
||||
// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
|
||||
// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
|
||||
//
|
||||
// CHECK: [[EXEC_PFN2]]
|
||||
// CHECK: call void [[PARALLEL_FN2]]_wrapper(
|
||||
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[CHECK_NEXT2]]
|
||||
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[TERM_PARALLEL]]
|
||||
// CHECK: call void @__kmpc_kernel_end_parallel()
|
||||
// CHECK: br label {{%?}}[[BAR_PARALLEL]]
|
||||
//
|
||||
// CHECK: [[BAR_PARALLEL]]
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: br label {{%?}}[[AWAIT_WORK]]
|
||||
//
|
||||
// CHECK: [[EXIT]]
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
|
||||
// Create local storage for each capture.
|
||||
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
|
||||
// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
|
||||
// Store captures in the context.
|
||||
// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
|
||||
//
|
||||
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
|
||||
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
|
||||
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
|
||||
//
|
||||
// CHECK: [[WORKER]]
|
||||
// CHECK: {{call|invoke}} void [[T6]]_worker()
|
||||
// CHECK: br label {{%?}}[[EXIT:.+]]
|
||||
//
|
||||
// CHECK: [[CHECK_MASTER]]
|
||||
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
|
||||
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
|
||||
//
|
||||
// CHECK: [[MASTER]]
|
||||
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
|
||||
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
|
||||
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*),
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: call void @__kmpc_serialized_parallel(
|
||||
// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
|
||||
// CHECK: call void @__kmpc_end_serialized_parallel(
|
||||
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN2]]_wrapper to i8*),
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK-64-DAG: load i32, i32* [[REF_A]]
|
||||
// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
|
||||
// CHECK: br label {{%?}}[[TERMINATE:.+]]
|
||||
//
|
||||
// CHECK: [[TERMINATE]]
|
||||
// CHECK: call void @__kmpc_kernel_deinit(
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: br label {{%?}}[[EXIT]]
|
||||
//
|
||||
// CHECK: [[EXIT]]
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK-DAG: define internal void [[PARALLEL_FN1]](
|
||||
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
|
||||
// CHECK: store i[[SZ]] 42, i[[SZ]]* %a,
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK-DAG: define internal void [[PARALLEL_FN3]](
|
||||
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
|
||||
// CHECK: store i[[SZ]] 43, i[[SZ]]* %a,
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK-DAG: define internal void [[PARALLEL_FN2]](
|
||||
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
|
||||
// CHECK: store i[[SZ]] 44, i[[SZ]]* %a,
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
|
||||
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
|
||||
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
|
||||
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
|
||||
// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
|
||||
//
|
||||
// CHECK: [[AWAIT_WORK]]
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
|
||||
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
|
||||
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
|
||||
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
|
||||
// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
|
||||
//
|
||||
// CHECK: [[SEL_WORKERS]]
|
||||
// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
|
||||
// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
|
||||
// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[EXEC_PARALLEL]]
|
||||
// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
|
||||
// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
|
||||
//
|
||||
// CHECK: [[EXEC_PFN1]]
|
||||
// CHECK: call void [[PARALLEL_FN1]](
|
||||
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[CHECK_NEXT1]]
|
||||
// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
|
||||
// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
|
||||
//
|
||||
// CHECK: [[EXEC_PFN2]]
|
||||
// CHECK: call void [[PARALLEL_FN2]](
|
||||
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[CHECK_NEXT2]]
|
||||
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[TERM_PARALLEL]]
|
||||
// CHECK: call void @__kmpc_kernel_end_parallel()
|
||||
// CHECK: br label {{%?}}[[BAR_PARALLEL]]
|
||||
//
|
||||
// CHECK: [[BAR_PARALLEL]]
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: br label {{%?}}[[AWAIT_WORK]]
|
||||
//
|
||||
// CHECK: [[EXIT]]
|
||||
// CHECK: ret void
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
|
||||
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
|
||||
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
|
||||
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
|
||||
// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
|
||||
//
|
||||
// CHECK: [[AWAIT_WORK]]
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
|
||||
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
|
||||
// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
|
||||
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
|
||||
// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
|
||||
//
|
||||
// CHECK: [[SEL_WORKERS]]
|
||||
// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
|
||||
// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
|
||||
// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[EXEC_PARALLEL]]
|
||||
// CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
|
||||
// CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i16, i32)* [[PARALLEL_FN4:@.+]]_wrapper to i8*)
|
||||
// CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
|
||||
//
|
||||
// CHECK: [[EXEC_PFN]]
|
||||
// CHECK: call void [[PARALLEL_FN4]]_wrapper(
|
||||
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[CHECK_NEXT]]
|
||||
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
|
||||
//
|
||||
// CHECK: [[TERM_PARALLEL]]
|
||||
// CHECK: call void @__kmpc_kernel_end_parallel()
|
||||
// CHECK: br label {{%?}}[[BAR_PARALLEL]]
|
||||
//
|
||||
// CHECK: [[BAR_PARALLEL]]
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: br label {{%?}}[[AWAIT_WORK]]
|
||||
//
|
||||
// CHECK: [[EXIT]]
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
|
||||
// Create local storage for each capture.
|
||||
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
|
||||
// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
|
||||
// Store captures in the context.
|
||||
// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
|
||||
//
|
||||
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
|
||||
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
|
||||
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
|
||||
//
|
||||
// CHECK: [[WORKER]]
|
||||
// CHECK: {{call|invoke}} void [[T6]]_worker()
|
||||
// CHECK: br label {{%?}}[[EXIT:.+]]
|
||||
//
|
||||
// CHECK: [[CHECK_MASTER]]
|
||||
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
|
||||
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
|
||||
//
|
||||
// CHECK: [[MASTER]]
|
||||
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
|
||||
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
|
||||
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*),
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: call void @__kmpc_serialized_parallel(
|
||||
// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
|
||||
// CHECK: call void @__kmpc_end_serialized_parallel(
|
||||
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*),
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK-64-DAG: load i32, i32* [[REF_A]]
|
||||
// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
|
||||
// CHECK: br label {{%?}}[[TERMINATE:.+]]
|
||||
//
|
||||
// CHECK: [[TERMINATE]]
|
||||
// CHECK: call void @__kmpc_kernel_deinit(
|
||||
// CHECK: call void @llvm.nvvm.barrier0()
|
||||
// CHECK: br label {{%?}}[[EXIT]]
|
||||
//
|
||||
// CHECK: [[EXIT]]
|
||||
// CHECK: ret void
|
||||
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]]
|
||||
// Create local storage for each capture.
|
||||
// CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]],
|
||||
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
|
||||
// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]],
|
||||
// CHECK: [[LOCAL_B:%.+]] = alloca [10 x i32]*
|
||||
// CHECK-DAG: store i[[SZ]] [[ARG_N:%.+]], i[[SZ]]* [[LOCAL_N]]
|
||||
// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
|
||||
// CHECK-DAG: store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
|
||||
// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
|
||||
// Store captures in the context.
|
||||
// CHECK-64-DAG:[[REF_N:%.+]] = bitcast i[[SZ]]* [[LOCAL_N]] to i32*
|
||||
// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
|
||||
// CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
|
||||
// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
|
||||
//
|
||||
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
|
||||
// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
|
||||
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
|
||||
//
|
||||
// CHECK: [[WORKER]]
|
||||
// CHECK: {{call|invoke}} void [[T6]]_worker()
|
||||
// CHECK: br label {{%?}}[[EXIT:.+]]
|
||||
//
|
||||
// CHECK: [[CHECK_MASTER]]
|
||||
// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
|
||||
// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
|
||||
//
|
||||
// CHECK: [[MASTER]]
|
||||
// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
|
||||
// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
|
||||
// CHECK-64: [[N:%.+]] = load i32, i32* [[REF_N]],
|
||||
// CHECK-32: [[N:%.+]] = load i32, i32* [[LOCAL_N]],
|
||||
// CHECK: [[CMP:%.+]] = icmp sgt i32 [[N]], 1000
|
||||
// CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]]
|
||||
//
// CHECK: [[IF_THEN]]
// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN4]]_wrapper to i8*),
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[IF_END:.+]]
//
// CHECK: [[IF_ELSE]]
// CHECK: call void @__kmpc_serialized_parallel(
// CHECK: {{call|invoke}} void [[PARALLEL_FN4]](
// CHECK: call void @__kmpc_end_serialized_parallel(
// CHECK: br label {{%?}}[[IF_END]]
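// When the if clause is false no workers are woken: the master runs the
// outlined region inline, bracketed by __kmpc_serialized_parallel and
// __kmpc_end_serialized_parallel, instead of publishing the function through
// __kmpc_kernel_prepare_parallel.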
//
// CHECK: [[IF_END]]
// CHECK-64-DAG: load i32, i32* [[REF_A]]
// CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
// CHECK-DAG: load i16, i16* [[REF_AA]]
// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
//
// CHECK: br label {{%?}}[[TERMINATE:.+]]
//
// CHECK: [[TERMINATE]]
// CHECK: call void @__kmpc_kernel_deinit(
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[EXIT]]
//
// CHECK: [[EXIT]]
// CHECK: ret void

// CHECK-DAG: define internal void [[PARALLEL_FN1]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 42, i[[SZ]]* %a,
// CHECK: ret void

// CHECK-DAG: define internal void [[PARALLEL_FN3]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 43, i[[SZ]]* %a,
// CHECK: ret void

// CHECK-DAG: define internal void [[PARALLEL_FN2]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 44, i[[SZ]]* %a,
// CHECK: ret void
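// Each outlined parallel region stores a distinct constant (42, 43, 44, and
// 45 below) into its private copy of 'a', which lets FileCheck tell the
// otherwise identical outlined bodies apart.
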
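// The worker function below is the scheduling loop run by the worker warps:
// park at a barrier, fetch the next outlined function via
// __kmpc_kernel_parallel, execute it if this thread was activated, signal
// completion with __kmpc_kernel_end_parallel, and repeat until a null work
// function signals kernel shutdown.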
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
//
// CHECK: [[AWAIT_WORK]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
// CHECK: store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
//
// CHECK: [[SEL_WORKERS]]
// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
//
// CHECK: [[EXEC_PARALLEL]]
// CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
// CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*)
// CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
//
// CHECK: [[EXEC_PFN]]
// CHECK: call void [[PARALLEL_FN4]](
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[CHECK_NEXT]]
// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
//
// CHECK: [[TERM_PARALLEL]]
// CHECK: call void @__kmpc_kernel_end_parallel()
// CHECK: br label {{%?}}[[BAR_PARALLEL]]
//
// CHECK: [[BAR_PARALLEL]]
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: br label {{%?}}[[AWAIT_WORK]]
//
// CHECK: [[EXIT]]
// CHECK: ret void

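// [[PARALLEL_FN4]] is the outlined body of the if-guarded parallel region; it
// stores its own sentinel value, 45, into the private 'a'.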
// CHECK: define internal void [[PARALLEL_FN4]](
// CHECK: [[A:%.+]] = alloca i[[SZ:32|64]],
// CHECK: store i[[SZ]] 45, i[[SZ]]* %a,
// CHECK: ret void

#endif