2016-02-08 23:59:20 +08:00
|
|
|
//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This provides a class for OpenMP runtime code generation specialized to NVPTX
|
|
|
|
// targets.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "CGOpenMPRuntimeNVPTX.h"
|
2016-03-04 17:22:22 +08:00
|
|
|
#include "clang/AST/DeclOpenMP.h"
|
2016-04-04 23:55:02 +08:00
|
|
|
#include "CodeGenFunction.h"
|
|
|
|
#include "clang/AST/StmtOpenMP.h"
|
2016-02-08 23:59:20 +08:00
|
|
|
|
|
|
|
using namespace clang;
|
|
|
|
using namespace CodeGen;
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
namespace {
|
|
|
|
/// Identifiers for the NVPTX device runtime entry points that this file
/// declares on demand. Enumerator order is significant only in that the
/// values are implicit (starting at zero).
enum OpenMPRTLFunctionNVPTX {
  /// void __kmpc_kernel_init(kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// void __kmpc_kernel_deinit();
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// void __kmpc_kernel_prepare_parallel(void *outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// bool __kmpc_kernel_parallel(void **outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
};
|
2017-01-10 23:42:51 +08:00
|
|
|
|
|
|
|
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
|
|
|
|
class NVPTXActionTy final : public PrePostActionTy {
|
|
|
|
llvm::Value *EnterCallee;
|
|
|
|
ArrayRef<llvm::Value *> EnterArgs;
|
|
|
|
llvm::Value *ExitCallee;
|
|
|
|
ArrayRef<llvm::Value *> ExitArgs;
|
|
|
|
bool Conditional;
|
|
|
|
llvm::BasicBlock *ContBlock = nullptr;
|
|
|
|
|
|
|
|
public:
|
|
|
|
NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
|
|
|
|
llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
|
|
|
|
bool Conditional = false)
|
|
|
|
: EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
|
|
|
|
ExitArgs(ExitArgs), Conditional(Conditional) {}
|
|
|
|
void Enter(CodeGenFunction &CGF) override {
|
|
|
|
llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
|
|
|
|
if (Conditional) {
|
|
|
|
llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
|
|
|
|
auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
|
|
|
|
ContBlock = CGF.createBasicBlock("omp_if.end");
|
|
|
|
// Generate the branch (If-stmt)
|
|
|
|
CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
|
|
|
|
CGF.EmitBlock(ThenBlock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
void Done(CodeGenFunction &CGF) {
|
|
|
|
// Emit the rest of blocks/branches
|
|
|
|
CGF.EmitBranch(ContBlock);
|
|
|
|
CGF.EmitBlock(ContBlock, true);
|
|
|
|
}
|
|
|
|
void Exit(CodeGenFunction &CGF) override {
|
|
|
|
CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
} // anonymous namespace
|
2017-01-04 04:19:56 +08:00
|
|
|
|
|
|
|
/// Get the GPU warp size.
|
|
|
|
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
return Bld.CreateCall(
|
|
|
|
llvm::Intrinsic::getDeclaration(
|
2017-01-04 04:19:56 +08:00
|
|
|
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
|
2016-03-22 09:48:56 +08:00
|
|
|
llvm::None, "nvptx_warp_size");
|
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Get the id of the current thread on the GPU.
|
|
|
|
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
return Bld.CreateCall(
|
|
|
|
llvm::Intrinsic::getDeclaration(
|
2017-01-04 04:19:56 +08:00
|
|
|
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
|
2016-03-22 09:48:56 +08:00
|
|
|
llvm::None, "nvptx_tid");
|
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Get the maximum number of threads in a block of the GPU.
|
|
|
|
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
return Bld.CreateCall(
|
|
|
|
llvm::Intrinsic::getDeclaration(
|
2017-01-04 04:19:56 +08:00
|
|
|
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
|
2016-03-22 09:48:56 +08:00
|
|
|
llvm::None, "nvptx_num_threads");
|
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Get barrier to synchronize all threads in a block.
|
|
|
|
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
Bld.CreateCall(llvm::Intrinsic::getDeclaration(
|
2017-01-04 04:19:56 +08:00
|
|
|
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
|
2016-03-22 09:48:56 +08:00
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Synchronize all GPU threads in a block.
|
|
|
|
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
|
2016-03-22 09:48:56 +08:00
|
|
|
|
2017-01-05 23:24:05 +08:00
|
|
|
/// Get the value of the thread_limit clause in the teams directive.
|
|
|
|
/// The runtime encodes thread_limit in the launch parameter, always starting
|
|
|
|
/// thread_limit+warpSize threads per team.
|
|
|
|
static llvm::Value *getThreadLimit(CodeGenFunction &CGF) {
|
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
|
|
|
|
"thread_limit");
|
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Get the thread id of the OMP master thread.
|
2016-03-22 09:48:56 +08:00
|
|
|
/// The master thread id is the first thread (lane) of the last warp in the
|
|
|
|
/// GPU block. Warp size is assumed to be some power of 2.
|
|
|
|
/// Thread id is 0 indexed.
|
|
|
|
/// E.g: If NumThreads is 33, master id is 32.
|
|
|
|
/// If NumThreads is 64, master id is 32.
|
|
|
|
/// If NumThreads is 1024, master id is 992.
|
2017-01-04 04:19:56 +08:00
|
|
|
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
|
|
|
|
|
|
|
|
// We assume that the warp size is a power of 2.
|
|
|
|
llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
|
|
|
|
|
|
|
|
return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
|
|
|
|
Bld.CreateNot(Mask), "master_tid");
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Build the per-target-region worker state: start with null members, then
/// immediately create the placeholder worker function in \p CGM's module.
CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM)
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
}
|
2016-03-22 09:48:56 +08:00
|
|
|
|
|
|
|
/// Create the worker function for a target region: an internal-linkage
/// function with no arguments, stored in WorkerFn together with its function
/// info in CGFI. The name "_worker" is a placeholder.
void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // The worker takes no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();
  llvm::FunctionType *WorkerFnTy = CGM.getTypes().GetFunctionType(*CGFI);

  WorkerFn =
      llvm::Function::Create(WorkerFnTy, llvm::GlobalValue::InternalLinkage,
                             /* placeholder */ "_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}
|
|
|
|
|
2017-01-05 23:24:05 +08:00
|
|
|
void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
|
|
|
|
StringRef ParentName,
|
|
|
|
llvm::Function *&OutlinedFn,
|
|
|
|
llvm::Constant *&OutlinedFnID,
|
|
|
|
bool IsOffloadEntry,
|
|
|
|
const RegionCodeGenTy &CodeGen) {
|
|
|
|
EntryFunctionState EST;
|
|
|
|
WorkerFunctionState WST(CGM);
|
2017-01-10 23:42:51 +08:00
|
|
|
Work.clear();
|
2017-01-05 23:24:05 +08:00
|
|
|
|
|
|
|
// Emit target region as a standalone region.
|
|
|
|
class NVPTXPrePostActionTy : public PrePostActionTy {
|
|
|
|
CGOpenMPRuntimeNVPTX &RT;
|
|
|
|
CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
|
|
|
|
CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
|
|
|
|
|
|
|
|
public:
|
|
|
|
NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
|
|
|
|
CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
|
|
|
|
CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
|
|
|
|
: RT(RT), EST(EST), WST(WST) {}
|
|
|
|
void Enter(CodeGenFunction &CGF) override {
|
|
|
|
RT.emitGenericEntryHeader(CGF, EST, WST);
|
|
|
|
}
|
|
|
|
void Exit(CodeGenFunction &CGF) override {
|
|
|
|
RT.emitGenericEntryFooter(CGF, EST);
|
|
|
|
}
|
|
|
|
} Action(*this, EST, WST);
|
|
|
|
CodeGen.setAction(Action);
|
|
|
|
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
|
|
|
|
IsOffloadEntry, CodeGen);
|
2017-01-05 02:44:50 +08:00
|
|
|
|
2017-01-05 23:24:05 +08:00
|
|
|
// Create the worker function
|
|
|
|
emitWorkerFunction(WST);
|
|
|
|
|
|
|
|
// Now change the name of the worker function to correspond to this target
|
|
|
|
// region's entry function.
|
|
|
|
WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Setup NVPTX threads for master-worker OpenMP scheme.
|
|
|
|
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
|
|
|
|
EntryFunctionState &EST,
|
|
|
|
WorkerFunctionState &WST) {
|
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
|
|
|
|
llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
|
|
|
|
llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
|
|
|
|
llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
|
|
|
|
EST.ExitBB = CGF.createBasicBlock(".exit");
|
|
|
|
|
|
|
|
auto *IsWorker =
|
|
|
|
Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
|
|
|
|
Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
|
|
|
|
|
|
|
|
CGF.EmitBlock(WorkerBB);
|
|
|
|
CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
|
|
|
|
CGF.EmitBranch(EST.ExitBB);
|
|
|
|
|
|
|
|
CGF.EmitBlock(MasterCheckBB);
|
|
|
|
auto *IsMaster =
|
|
|
|
Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
|
|
|
|
Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
|
|
|
|
|
|
|
|
CGF.EmitBlock(MasterBB);
|
|
|
|
// First action in sequential region:
|
|
|
|
// Initialize the state of the OpenMP runtime library on the GPU.
|
|
|
|
llvm::Value *Args[] = {getThreadLimit(CGF)};
|
|
|
|
CGF.EmitRuntimeCall(
|
|
|
|
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Emit the tail of a generic kernel entry function: notify the runtime that
/// the kernel is done, synchronize with the workers, and fall into the exit
/// block. The exit block recorded in \p EST is created here if it does not
/// exist yet, and the record is cleared once the block has been emitted.
void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  if (EST.ExitBB == nullptr)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *NotifierBB =
      CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(NotifierBB);
  CGF.EmitBlock(NotifierBB);

  // Signal termination condition.
  llvm::Constant *DeinitFn =
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit);
  CGF.EmitRuntimeCall(DeinitFn, llvm::None);

  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);

  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);
  CGF.EmitBlock(EST.ExitBB);

  // The exit block has been consumed; forget it.
  EST.ExitBB = nullptr;
}
|
|
|
|
|
|
|
|
/// Generate the body of the worker function held in \p WST: a void function
/// whose whole body is the worker loop. Debug info is disabled for it.
void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.disableDebugInfo();

  CGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, WST.WorkerFn,
                    *WST.CGFI, {});
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}
|
|
|
|
|
|
|
|
/// Emit the loop executed by worker threads.
///
/// The workers enter this loop and wait for parallel work from the master.
/// When the master encounters a parallel region it sets up the work + variable
/// arguments, and wakes up the workers. The workers first check to see if
/// they are required for the parallel region, i.e., within the # of requested
/// parallel threads. The activated workers then run the matching outlined
/// function from the recorded Work list.
///
/// \p WST is not referenced by the body.
void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Loop head: workers block on a barrier until the master signals work.
  CGF.EmitBlock(AwaitBB);
  syncCTAThreads(CGF);

  // Temporaries receiving the work function and this thread's active status.
  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // Ask the runtime for the work function; its result is stored, widened to
  // i8, as the execution status.
  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Constant *KernelParallelFn =
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel);
  llvm::Value *Ret = CGF.EmitRuntimeCall(KernelParallelFn, Args);
  llvm::Value *RetAsI8 = Bld.CreateZExt(Ret, CGF.Int8Ty);
  Bld.CreateStore(RetAsI8, ExecStatus);

  // A null work function is the termination condition: leave the loop.
  llvm::Value *LoadedWorkFn = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(LoadedWorkFn, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers; the rest go straight to the barrier.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *LoadedStatus = Bld.CreateLoad(ExecStatus);
  llvm::Value *IsActive = Bld.CreateIsNotNull(LoadedStatus, "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: a chain of pointer comparisons against every
  // outlined parallel function recorded for this kernel.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *CurrentWorkFn = Bld.CreateLoad(WorkFn);
    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(CurrentWorkFn, ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function. Both leading arguments point at a zero
    // initialized i32 temporary.
    // FIXME: Pass arguments to outlined function from master thread.
    auto *Fn = cast<llvm::Function>(W);
    Address ZeroAddr =
        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
    llvm::Value *ZeroPtr = ZeroAddr.getPointer();
    llvm::Value *FnArgs[] = {ZeroPtr, ZeroPtr};
    CGF.EmitCallOrInvoke(Fn, FnArgs);

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    // No match: fall through to the next candidate.
    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  llvm::Constant *EndParallelFn =
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel);
  CGF.EmitRuntimeCall(EndParallelFn, llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region,
  // then go back to waiting for more work.
  CGF.EmitBlock(BarrierBB);
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}
|
|
|
|
|
|
|
|
/// \brief Returns specified OpenMP runtime function for the current OpenMP
|
|
|
|
/// implementation. Specialized for the NVPTX device.
|
|
|
|
/// \param Function OpenMP runtime function.
|
|
|
|
/// \return Specified function.
|
|
|
|
llvm::Constant *
|
|
|
|
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
|
|
|
|
llvm::Constant *RTLFn = nullptr;
|
|
|
|
switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
|
|
|
|
case OMPRTL_NVPTX__kmpc_kernel_init: {
|
2017-01-05 23:24:05 +08:00
|
|
|
// Build void __kmpc_kernel_init(kmp_int32 thread_limit);
|
|
|
|
llvm::Type *TypeParams[] = {CGM.Int32Ty};
|
2016-03-22 09:48:56 +08:00
|
|
|
llvm::FunctionType *FnTy =
|
|
|
|
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
|
|
|
|
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
|
|
|
|
break;
|
|
|
|
}
|
2017-01-05 23:24:05 +08:00
|
|
|
case OMPRTL_NVPTX__kmpc_kernel_deinit: {
|
|
|
|
// Build void __kmpc_kernel_deinit();
|
|
|
|
llvm::FunctionType *FnTy =
|
2017-01-10 23:42:51 +08:00
|
|
|
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
|
2017-01-05 23:24:05 +08:00
|
|
|
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
|
|
|
|
break;
|
|
|
|
}
|
2017-01-10 23:42:51 +08:00
|
|
|
case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
|
|
|
|
/// Build void __kmpc_kernel_prepare_parallel(
|
|
|
|
/// void *outlined_function);
|
|
|
|
llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
|
|
|
|
llvm::FunctionType *FnTy =
|
|
|
|
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
|
|
|
|
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case OMPRTL_NVPTX__kmpc_kernel_parallel: {
|
|
|
|
/// Build bool __kmpc_kernel_parallel(void **outlined_function);
|
|
|
|
llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy};
|
|
|
|
llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
|
|
|
|
llvm::FunctionType *FnTy =
|
|
|
|
llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
|
|
|
|
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
|
|
|
|
/// Build void __kmpc_kernel_end_parallel();
|
|
|
|
llvm::FunctionType *FnTy =
|
|
|
|
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
|
|
|
|
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case OMPRTL_NVPTX__kmpc_serialized_parallel: {
|
|
|
|
// Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
|
|
|
|
// global_tid);
|
|
|
|
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
|
|
|
|
llvm::FunctionType *FnTy =
|
|
|
|
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
|
|
|
|
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
|
|
|
|
// Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
|
|
|
|
// global_tid);
|
|
|
|
llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
|
|
|
|
llvm::FunctionType *FnTy =
|
|
|
|
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
|
|
|
|
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
|
|
|
|
break;
|
|
|
|
}
|
2016-03-22 09:48:56 +08:00
|
|
|
}
|
|
|
|
return RTLFn;
|
|
|
|
}
|
|
|
|
|
|
|
|
void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
|
|
|
|
llvm::Constant *Addr,
|
[OpenMP] Add fields for flags in the offload entry descriptor.
Summary:
This patch adds two fields to the offload entry descriptor. One field is meant to signal Ctors/Dtors and `link` global variables, and the other is reserved for runtime library use.
Currently, these fields are only filled with zeros in the current code generation, but that will change when `declare target` is added.
The reason, we are adding these fields now is to make the code generation consistent with the runtime library proposal under review in https://reviews.llvm.org/D14031.
Reviewers: ABataev, hfinkel, carlo.bertolli, kkwli0, arpith-jacob, Hahnfeld
Subscribers: cfe-commits, caomhin, jholewinski
Differential Revision: https://reviews.llvm.org/D28298
llvm-svn: 291124
2017-01-06 00:02:49 +08:00
|
|
|
uint64_t Size, int32_t) {
|
2016-03-22 09:48:56 +08:00
|
|
|
auto *F = dyn_cast<llvm::Function>(Addr);
|
|
|
|
// TODO: Add support for global variables on the device after declare target
|
|
|
|
// support.
|
|
|
|
if (!F)
|
|
|
|
return;
|
|
|
|
llvm::Module *M = F->getParent();
|
|
|
|
llvm::LLVMContext &Ctx = M->getContext();
|
|
|
|
|
|
|
|
// Get "nvvm.annotations" metadata node
|
|
|
|
llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
|
|
|
|
|
|
|
|
llvm::Metadata *MDVals[] = {
|
|
|
|
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
|
|
|
|
llvm::ConstantAsMetadata::get(
|
|
|
|
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
|
|
|
|
// Append metadata to nvvm.annotations
|
|
|
|
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
|
|
|
|
}
|
|
|
|
|
|
|
|
void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
|
|
|
|
const OMPExecutableDirective &D, StringRef ParentName,
|
|
|
|
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
|
2016-03-29 13:34:15 +08:00
|
|
|
bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
|
2016-03-22 09:48:56 +08:00
|
|
|
if (!IsOffloadEntry) // Nothing to do.
|
|
|
|
return;
|
|
|
|
|
|
|
|
assert(!ParentName.empty() && "Invalid target region parent name!");
|
|
|
|
|
2017-01-05 23:24:05 +08:00
|
|
|
emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
|
|
|
|
CodeGen);
|
2016-03-22 09:48:56 +08:00
|
|
|
}
|
|
|
|
|
2016-02-08 23:59:20 +08:00
|
|
|
/// Construct the NVPTX OpenMP runtime code generator. It is only valid when
/// compiling device code; anything else is treated as unreachable.
CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM) {
  const bool IsDeviceCompilation = CGM.getLangOpts().OpenMPIsDevice;
  if (!IsDeviceCompilation)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
|
2016-04-04 23:55:02 +08:00
|
|
|
|
|
|
|
/// Intentionally empty: no code is generated for the num_teams and
/// thread_limit clauses by this runtime.
void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {
}
|
|
|
|
|
|
|
|
/// Outline a parallel or teams region using the base-class implementation.
/// For a teams directive the resulting function is additionally switched from
/// noinline to alwaysinline; for anything else it is returned unchanged.
llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // The outlining itself is identical for both kinds of directive.
  llvm::Value *OutlinedFunVal =
      CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);

  if (isa<OMPTeamsDirective>(D)) {
    // noinline has to be dropped before alwaysinline is added.
    OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
  }

  return OutlinedFun;
}
|
|
|
|
|
|
|
|
/// Emit a direct call to the outlined teams function. The first two
/// arguments both point at a zero-initialized i32 temporary, followed by the
/// captured variables. Does nothing when there is no insertion point.
/// \p D and \p Loc are not used.
void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  // Zero-filled slot shared by the two leading pointer arguments.
  Address ZeroAddr = CGF.CreateTempAlloca(
      CGF.Int32Ty, CharUnits::fromQuantity(4), /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::Value *ZeroPtr = ZeroAddr.getPointer();

  llvm::SmallVector<llvm::Value *, 16> CallArgs;
  CallArgs.push_back(ZeroPtr);
  CallArgs.push_back(ZeroPtr);
  CallArgs.append(CapturedVars.begin(), CapturedVars.end());

  CGF.EmitCallOrInvoke(OutlinedFn, CallArgs);
}
|
2017-01-10 23:42:51 +08:00
|
|
|
|
|
|
|
/// Emit code for a parallel region call. Forwards to the generic
/// (master/worker) implementation, provided there is an insertion point.
void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (CGF.HaveInsertPoint())
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}
|
|
|
|
|
|
|
|
/// Emit a parallel region for the generic (master/worker) scheme.
///
/// Two code paths are generated:
///  - parallel: the master tells the runtime which outlined function to run,
///    releases the workers through a barrier and waits on a second barrier
///    until they are done. The outlined function is recorded in Work so the
///    worker loop can dispatch to it.
///  - sequential: the outlined function is called directly, bracketed by
///    __kmpc_serialized_parallel / __kmpc_end_serialized_parallel.
/// With an if clause both paths are emitted and selected by \p IfCond;
/// without one only the parallel path is emitted.
///
/// \param OutlinedFn Must be an llvm::Function.
void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);

  // Parallel path. The captured variables are not needed here: the workers
  // call the outlined function from the worker loop.
  auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    // Prepare for parallel region. Indicate the outlined function.
    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)};
    CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
        Args);

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Remember for post-processing in worker loop.
    Work.push_back(Fn);
  };

  // Arguments shared by the serialized-parallel enter and exit calls.
  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  // Sequential path.
  auto &&SeqGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF,
                                                   PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF,
                                               PrePostActionTy &Action) {
      Action.Enter(CGF);

      // The two leading pointer arguments are passed as null.
      llvm::Value *NullInt32Ptr =
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo());
      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      OutlinedFnArgs.push_back(NullInt32Ptr);
      OutlinedFnArgs.push_back(NullInt32Ptr);
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);

    // Declare the runtime functions in a fixed order. As sibling constructor
    // arguments their evaluation order would be unspecified, making the order
    // of the declarations in the module depend on the host compiler.
    llvm::Constant *SerializedParallelFn =
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel);
    llvm::Constant *EndSerializedParallelFn =
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel);
    NVPTXActionTy Action(SerializedParallelFn, Args, EndSerializedParallelFn,
                         Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond) {
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  } else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}
|