2016-02-08 23:59:20 +08:00
|
|
|
//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This provides a class for OpenMP runtime code generation specialized to NVPTX
|
|
|
|
// targets.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "CGOpenMPRuntimeNVPTX.h"
|
2016-03-04 17:22:22 +08:00
|
|
|
#include "clang/AST/DeclOpenMP.h"
|
2016-04-04 23:55:02 +08:00
|
|
|
#include "CodeGenFunction.h"
|
|
|
|
#include "clang/AST/StmtOpenMP.h"
|
2016-02-08 23:59:20 +08:00
|
|
|
|
|
|
|
using namespace clang;
|
|
|
|
using namespace CodeGen;
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
namespace {
/// \brief Identifiers of the NVPTX-specific OpenMP runtime entry points that
/// this file can declare via createNVPTXRuntimeFunction().
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit);
  /// Initializes the OpenMP runtime state on the GPU; emitted by the master
  /// thread at kernel entry.
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit();
  /// Signals kernel termination; emitted before workers are released at
  /// kernel exit.
  OMPRTL_NVPTX__kmpc_kernel_deinit,
};
} // namespace
|
|
|
|
|
|
|
|
/// Get the GPU warp size.
|
|
|
|
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
return Bld.CreateCall(
|
|
|
|
llvm::Intrinsic::getDeclaration(
|
2017-01-04 04:19:56 +08:00
|
|
|
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
|
2016-03-22 09:48:56 +08:00
|
|
|
llvm::None, "nvptx_warp_size");
|
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Get the id of the current thread on the GPU.
|
|
|
|
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
return Bld.CreateCall(
|
|
|
|
llvm::Intrinsic::getDeclaration(
|
2017-01-04 04:19:56 +08:00
|
|
|
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
|
2016-03-22 09:48:56 +08:00
|
|
|
llvm::None, "nvptx_tid");
|
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Get the maximum number of threads in a block of the GPU.
|
|
|
|
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
return Bld.CreateCall(
|
|
|
|
llvm::Intrinsic::getDeclaration(
|
2017-01-04 04:19:56 +08:00
|
|
|
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
|
2016-03-22 09:48:56 +08:00
|
|
|
llvm::None, "nvptx_num_threads");
|
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Get barrier to synchronize all threads in a block.
|
|
|
|
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
Bld.CreateCall(llvm::Intrinsic::getDeclaration(
|
2017-01-04 04:19:56 +08:00
|
|
|
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
|
2016-03-22 09:48:56 +08:00
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Synchronize all GPU threads in a block.
|
|
|
|
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
|
2016-03-22 09:48:56 +08:00
|
|
|
|
2017-01-05 23:24:05 +08:00
|
|
|
/// Get the value of the thread_limit clause in the teams directive.
|
|
|
|
/// The runtime encodes thread_limit in the launch parameter, always starting
|
|
|
|
/// thread_limit+warpSize threads per team.
|
|
|
|
static llvm::Value *getThreadLimit(CodeGenFunction &CGF) {
|
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
|
|
|
|
"thread_limit");
|
|
|
|
}
|
|
|
|
|
2017-01-04 04:19:56 +08:00
|
|
|
/// Get the thread id of the OMP master thread.
|
2016-03-22 09:48:56 +08:00
|
|
|
/// The master thread id is the first thread (lane) of the last warp in the
|
|
|
|
/// GPU block. Warp size is assumed to be some power of 2.
|
|
|
|
/// Thread id is 0 indexed.
|
|
|
|
/// E.g: If NumThreads is 33, master id is 32.
|
|
|
|
/// If NumThreads is 64, master id is 32.
|
|
|
|
/// If NumThreads is 1024, master id is 992.
|
2017-01-04 04:19:56 +08:00
|
|
|
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
|
2016-03-22 09:48:56 +08:00
|
|
|
CGBuilderTy &Bld = CGF.Builder;
|
|
|
|
llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
|
|
|
|
|
|
|
|
// We assume that the warp size is a power of 2.
|
|
|
|
llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
|
|
|
|
|
|
|
|
return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
|
|
|
|
Bld.CreateNot(Mask), "master_tid");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Owns the synthetic worker function for a target region.  The llvm::Function
// is created eagerly (with a placeholder name) so the entry-function header
// can call it; it is renamed once the target region's entry function exists.
CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM)
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
}
|
2016-03-22 09:48:56 +08:00
|
|
|
|
|
|
|
void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  // The real name is assigned later, once the target region's entry function
  // is known; start with a placeholder.
  llvm::FunctionType *FnTy = CGM.getTypes().GetFunctionType(*CGFI);
  WorkerFn =
      llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage,
                             /* placeholder */ "_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}
|
|
|
|
|
2017-01-05 23:24:05 +08:00
|
|
|
/// Emit a target-region kernel using the generic (master/worker) codegen
/// scheme: the outlined target region is wrapped with an entry header that
/// routes worker threads into a scheduling loop and an entry footer that
/// tears the kernel down.
void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM);

  // Emit target region as a standalone region.  The pre/post action injects
  // the kernel prologue (Enter) and epilogue (Exit) around the region body.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : RT(RT), EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitGenericEntryFooter(CGF, EST);
    }
  } Action(*this, EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Create the worker function
  emitWorkerFunction(WST);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
}
|
|
|
|
|
|
|
|
// Setup NVPTX threads for master-worker OpenMP scheme.
// Emits the kernel prologue that partitions the block's threads:
//   - threads below the thread_limit become workers and jump into the
//     worker loop (WST.WorkerFn), then to the exit block;
//   - the single master thread (first lane of the last warp) continues
//     into the sequential region after initializing the runtime;
//   - all remaining threads (the rest of the reserved warp) exit.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  // The exit block is shared with the footer via EST.
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Workers are the threads whose id is below the thread_limit value.
  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  llvm::Value *Args[] = {getThreadLimit(CGF)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
}
|
|
|
|
|
|
|
|
/// Emit the kernel epilogue for the generic scheme: the master signals
/// termination via __kmpc_kernel_deinit, releases the workers through a
/// CTA barrier, and then everyone falls through the shared exit block.
void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  // The header normally created the exit block; create it here only if the
  // footer runs without a matching header having set it up.
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  // Reset so a stale block is never reused by a later region.
  EST.ExitBB = nullptr;
}
|
|
|
|
|
|
|
|
/// Emit the body of the worker function created by WorkerFunctionState.
void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  // The worker is a synthetic function; do not attach debug info to it.
  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, WST.WorkerFn,
                    *WST.CGFI, {});
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}
|
|
|
|
|
|
|
|
/// Emit the scheduling loop executed by worker threads.  The loop body is
/// still partially a skeleton (see the TODOs): fetching work from the
/// runtime and executing the parallel region are not yet implemented.
void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers.  The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads.  The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.  This pairs with the barrier the master reaches
  // when dispatching work or terminating the kernel.
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  // Both slots default to "no work / inactive"; the runtime call below is
  // expected to fill them in once implemented.
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Call into runtime to get parallel work.

  // On termination condition (workid == 0), exit loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);
  // TODO: Add parallel work.
  // (ExecuteBB currently falls through into TerminateBB.)

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}
|
|
|
|
|
|
|
|
/// \brief Returns specified OpenMP runtime function for the current OpenMP
|
|
|
|
/// implementation. Specialized for the NVPTX device.
|
|
|
|
/// \param Function OpenMP runtime function.
|
|
|
|
/// \return Specified function.
|
|
|
|
llvm::Constant *
|
|
|
|
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
|
|
|
|
llvm::Constant *RTLFn = nullptr;
|
|
|
|
switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
|
|
|
|
case OMPRTL_NVPTX__kmpc_kernel_init: {
|
2017-01-05 23:24:05 +08:00
|
|
|
// Build void __kmpc_kernel_init(kmp_int32 thread_limit);
|
|
|
|
llvm::Type *TypeParams[] = {CGM.Int32Ty};
|
2016-03-22 09:48:56 +08:00
|
|
|
llvm::FunctionType *FnTy =
|
|
|
|
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
|
|
|
|
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
|
|
|
|
break;
|
|
|
|
}
|
2017-01-05 23:24:05 +08:00
|
|
|
case OMPRTL_NVPTX__kmpc_kernel_deinit: {
|
|
|
|
// Build void __kmpc_kernel_deinit();
|
|
|
|
llvm::FunctionType *FnTy =
|
|
|
|
llvm::FunctionType::get(CGM.VoidTy, {}, /*isVarArg*/ false);
|
|
|
|
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
|
|
|
|
break;
|
|
|
|
}
|
2016-03-22 09:48:56 +08:00
|
|
|
}
|
|
|
|
return RTLFn;
|
|
|
|
}
|
|
|
|
|
|
|
|
void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
|
|
|
|
llvm::Constant *Addr,
|
[OpenMP] Add fields for flags in the offload entry descriptor.
Summary:
This patch adds two fields to the offload entry descriptor. One field is meant to signal Ctors/Dtors and `link` global variables, and the other is reserved for runtime library use.
Currently, these fields are only filled with zeros in the current code generation, but that will change when `declare target` is added.
The reason, we are adding these fields now is to make the code generation consistent with the runtime library proposal under review in https://reviews.llvm.org/D14031.
Reviewers: ABataev, hfinkel, carlo.bertolli, kkwli0, arpith-jacob, Hahnfeld
Subscribers: cfe-commits, caomhin, jholewinski
Differential Revision: https://reviews.llvm.org/D28298
llvm-svn: 291124
2017-01-06 00:02:49 +08:00
|
|
|
uint64_t Size, int32_t) {
|
2016-03-22 09:48:56 +08:00
|
|
|
auto *F = dyn_cast<llvm::Function>(Addr);
|
|
|
|
// TODO: Add support for global variables on the device after declare target
|
|
|
|
// support.
|
|
|
|
if (!F)
|
|
|
|
return;
|
|
|
|
llvm::Module *M = F->getParent();
|
|
|
|
llvm::LLVMContext &Ctx = M->getContext();
|
|
|
|
|
|
|
|
// Get "nvvm.annotations" metadata node
|
|
|
|
llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
|
|
|
|
|
|
|
|
llvm::Metadata *MDVals[] = {
|
|
|
|
llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
|
|
|
|
llvm::ConstantAsMetadata::get(
|
|
|
|
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
|
|
|
|
// Append metadata to nvvm.annotations
|
|
|
|
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
|
|
|
|
}
|
|
|
|
|
|
|
|
void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
|
|
|
|
const OMPExecutableDirective &D, StringRef ParentName,
|
|
|
|
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
|
2016-03-29 13:34:15 +08:00
|
|
|
bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
|
2016-03-22 09:48:56 +08:00
|
|
|
if (!IsOffloadEntry) // Nothing to do.
|
|
|
|
return;
|
|
|
|
|
|
|
|
assert(!ParentName.empty() && "Invalid target region parent name!");
|
|
|
|
|
2017-01-05 23:24:05 +08:00
|
|
|
emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
|
|
|
|
CodeGen);
|
2016-03-22 09:48:56 +08:00
|
|
|
}
|
|
|
|
|
2016-02-08 23:59:20 +08:00
|
|
|
/// Construct the NVPTX-specialized OpenMP runtime.  This class only emits
/// device-side code, so instantiating it for a host compilation is a
/// front-end bug.
CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
|
2016-04-04 23:55:02 +08:00
|
|
|
|
|
|
|
// Intentionally a no-op on the device: the teams clause values appear to be
// handled via the kernel launch parameters (cf. getThreadLimit above).
// NOTE(review): confirm against the host-side codegen that nothing needs to
// be emitted here.
void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}
|
|
|
|
|
|
|
|
/// Outline a parallel or teams region for the NVPTX device.  Only teams
/// directives are supported so far; the outlined teams function is forced
/// to be inlined into the kernel.
llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {

  if (!isa<OMPTeamsDirective>(D))
    llvm_unreachable("parallel directive is not yet supported for nvptx "
                     "backend.");

  // Reuse the target-independent outlining, then adjust inlining attributes.
  llvm::Value *OutlinedFunVal =
      CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen);
  auto *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  // Drop any NoInline the generic attribute machinery added before forcing
  // AlwaysInline — the two attributes are mutually exclusive.
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}
|
|
|
|
|
|
|
|
/// Emit the call to the outlined teams function.
void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  // The outlined function takes two leading pointer arguments that both
  // point at a zero here — presumably the global/bound thread id slots;
  // confirm against the outlined function's signature.
  Address ZeroAddr =
      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                           /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));

  llvm::SmallVector<llvm::Value *, 16> Args;
  Args.push_back(ZeroAddr.getPointer());
  Args.push_back(ZeroAddr.getPointer());
  Args.append(CapturedVars.begin(), CapturedVars.end());
  CGF.EmitCallOrInvoke(OutlinedFn, Args);
}
|