2016-07-13 23:54:58 +08:00
|
|
|
//===------ PPCGCodeGeneration.cpp - Polly Accelerator Code Generation. ---===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// Take a scop created by ScopInfo and map it to GPU code using the ppcg
|
|
|
|
// GPU mapping strategy.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
#include "polly/CodeGen/PPCGCodeGeneration.h"
|
2016-08-09 01:35:55 +08:00
|
|
|
#include "polly/CodeGen/IslAst.h"
|
2016-07-13 23:54:58 +08:00
|
|
|
#include "polly/CodeGen/IslNodeBuilder.h"
|
2016-07-18 19:56:39 +08:00
|
|
|
#include "polly/CodeGen/Utils.h"
|
2016-07-13 23:54:58 +08:00
|
|
|
#include "polly/DependenceInfo.h"
|
|
|
|
#include "polly/LinkAllPasses.h"
|
2016-07-14 18:22:25 +08:00
|
|
|
#include "polly/Options.h"
|
2016-08-03 20:00:07 +08:00
|
|
|
#include "polly/ScopDetection.h"
|
2016-07-13 23:54:58 +08:00
|
|
|
#include "polly/ScopInfo.h"
|
2016-07-21 21:15:59 +08:00
|
|
|
#include "polly/Support/SCEVValidator.h"
|
2016-07-22 15:11:12 +08:00
|
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
2016-07-13 23:54:58 +08:00
|
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
|
|
|
#include "llvm/Analysis/BasicAliasAnalysis.h"
|
|
|
|
#include "llvm/Analysis/GlobalsModRef.h"
|
|
|
|
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
|
2016-07-22 15:11:12 +08:00
|
|
|
#include "llvm/Analysis/TargetLibraryInfo.h"
|
|
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
|
|
#include "llvm/IR/LegacyPassManager.h"
|
2016-07-24 14:43:17 +08:00
|
|
|
#include "llvm/IR/Verifier.h"
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
#include "llvm/IRReader/IRReader.h"
|
|
|
|
#include "llvm/Linker/Linker.h"
|
2016-07-22 15:11:12 +08:00
|
|
|
#include "llvm/Support/TargetRegistry.h"
|
|
|
|
#include "llvm/Support/TargetSelect.h"
|
|
|
|
#include "llvm/Target/TargetMachine.h"
|
2016-07-24 14:43:21 +08:00
|
|
|
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
|
2016-08-09 23:35:03 +08:00
|
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
2016-07-13 23:54:58 +08:00
|
|
|
|
2016-07-14 18:22:25 +08:00
|
|
|
#include "isl/union_map.h"
|
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
extern "C" {
|
2016-07-15 15:50:36 +08:00
|
|
|
#include "ppcg/cuda.h"
|
|
|
|
#include "ppcg/gpu.h"
|
|
|
|
#include "ppcg/gpu_print.h"
|
|
|
|
#include "ppcg/ppcg.h"
|
|
|
|
#include "ppcg/schedule.h"
|
2016-07-14 18:22:19 +08:00
|
|
|
}
|
|
|
|
|
2016-07-13 23:54:58 +08:00
|
|
|
#include "llvm/Support/Debug.h"
|
|
|
|
|
|
|
|
using namespace polly;
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "polly-codegen-ppcg"
|
|
|
|
|
2016-07-14 18:22:25 +08:00
|
|
|
// Switches that dump intermediate results of the GPU code generation. Like all
// options in this group they are hidden, default to false, use cl::ZeroOrMore
// and are registered in PollyCategory.
static cl::opt<bool> DumpSchedule(
    "polly-acc-dump-schedule",
    cl::desc("Dump the computed GPU Schedule"),
    cl::Hidden,
    cl::init(false),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));

static cl::opt<bool> DumpCode(
    "polly-acc-dump-code",
    cl::desc("Dump C code describing the GPU mapping"),
    cl::Hidden,
    cl::init(false),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelIR(
    "polly-acc-dump-kernel-ir",
    cl::desc("Dump the kernel LLVM-IR"),
    cl::Hidden,
    cl::init(false),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));

static cl::opt<bool> DumpKernelASM(
    "polly-acc-dump-kernel-asm",
    cl::desc("Dump the kernel assembly code"),
    cl::Hidden,
    cl::init(false),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));

// Switches that influence the kind of code that is generated.
static cl::opt<bool> FastMath(
    "polly-acc-fastmath",
    cl::desc("Allow unsafe math optimizations"),
    cl::Hidden,
    cl::init(false),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));

static cl::opt<bool> SharedMemory(
    "polly-acc-use-shared",
    cl::desc("Use shared memory"),
    cl::Hidden,
    cl::init(false),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));

static cl::opt<bool> PrivateMemory(
    "polly-acc-use-private",
    cl::desc("Use private memory"),
    cl::Hidden,
    cl::init(false),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));

static cl::opt<bool> ManagedMemory(
    "polly-acc-codegen-managed-memory",
    cl::desc("Generate Host kernel code assuming"
             " that all memory has been"
             " declared as managed memory"),
    cl::Hidden,
    cl::init(false),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));

// Switch that controls how a verifyModule failure on the GPU kernel module is
// handled (see the option description).
static cl::opt<bool> FailOnVerifyModuleFailure(
    "polly-acc-fail-on-verify-module-failure",
    cl::desc("Fail and generate a backtrace if"
             " verifyModule fails on the GPU "
             " kernel module."),
    cl::Hidden,
    cl::init(false),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));
|
|
|
|
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
// Location of the CUDA libdevice file. The default points into a CUDA
// installation below /usr/local/cuda.
static cl::opt<std::string> CUDALibDevice(
    "polly-acc-libdevice",
    cl::desc("Path to CUDA libdevice"),
    cl::Hidden,
    cl::init("/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.ll"),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));

// The CUDA version string to compile for; defaults to "sm_30".
static cl::opt<std::string> CudaVersion(
    "polly-acc-cuda-version",
    cl::desc("The CUDA version to compile for"),
    cl::Hidden,
    cl::init("sm_30"),
    cl::ZeroOrMore,
    cl::cat(PollyCategory));
|
|
|
|
|
2016-09-18 14:50:35 +08:00
|
|
|
// Threshold on the number of compute statements (see the option description).
// The default is 10 * 512 * 512.
//
// Like every other polly-acc-* option in this file, the option uses
// cl::ZeroOrMore and is registered in PollyCategory.
static cl::opt<int>
    MinCompute("polly-acc-mincompute",
               cl::desc("Minimal number of compute statements to run on GPU."),
               cl::Hidden, cl::init(10 * 512 * 512), cl::ZeroOrMore,
               cl::cat(PollyCategory));
|
|
|
|
|
2017-08-05 03:36:40 +08:00
|
|
|
/// Return a unique name for a Scop, which is the scop region with the
|
|
|
|
/// function name.
|
|
|
|
std::string getUniqueScopName(const Scop *S) {
|
|
|
|
return "Scop Region: " + S->getNameStr() +
|
|
|
|
" | Function: " + std::string(S->getFunction().getName());
|
|
|
|
}
|
|
|
|
|
2017-07-05 22:57:04 +08:00
|
|
|
/// Used to store information PPCG wants for kills. This information is
|
|
|
|
/// used by live range reordering.
|
|
|
|
///
|
|
|
|
/// @see computeLiveRangeReordering
|
|
|
|
/// @see GPUNodeBuilder::createPPCGScop
|
|
|
|
/// @see GPUNodeBuilder::createPPCGProg
|
|
|
|
struct MustKillsInfo {
|
|
|
|
/// Collection of all kill statements that will be sequenced at the end of
|
|
|
|
/// PPCGScop->schedule.
|
|
|
|
///
|
|
|
|
/// The nodes in `KillsSchedule` will be merged using `isl_schedule_set`
|
|
|
|
/// which merges schedules in *arbitrary* order.
|
|
|
|
/// (we don't care about the order of the kills anyway).
|
|
|
|
isl::schedule KillsSchedule;
|
|
|
|
/// Map from kill statement instances to scalars that need to be
|
|
|
|
/// killed.
|
|
|
|
///
|
2017-07-18 17:15:16 +08:00
|
|
|
/// We currently derive kill information for:
|
|
|
|
/// 1. phi nodes. PHI nodes are not alive outside the scop and can
|
|
|
|
/// consequently all be killed.
|
|
|
|
/// 2. Scalar arrays that are not used outside the Scop. This is
|
|
|
|
/// checked by `isScalarUsesContainedInScop`.
|
|
|
|
/// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] }
|
2017-07-05 22:57:04 +08:00
|
|
|
isl::union_map TaggedMustKills;
|
|
|
|
|
2017-07-20 23:48:36 +08:00
|
|
|
/// Tagged must kills stripped of the tags.
|
|
|
|
/// [params] -> { Stmt_phantom[] -> scalar_to_kill[] }
|
|
|
|
isl::union_map MustKills;
|
|
|
|
|
|
|
|
MustKillsInfo() : KillsSchedule(nullptr) {}
|
2017-07-05 22:57:04 +08:00
|
|
|
};
|
|
|
|
|
2017-07-06 21:42:42 +08:00
|
|
|
/// Check if SAI's uses are entirely contained within Scop S.
|
|
|
|
/// If a scalar is used only with a Scop, we are free to kill it, as no data
|
|
|
|
/// can flow in/out of the value any more.
|
|
|
|
/// @see computeMustKillsInfo
|
|
|
|
static bool isScalarUsesContainedInScop(const Scop &S,
|
|
|
|
const ScopArrayInfo *SAI) {
|
|
|
|
assert(SAI->isValueKind() && "this function only deals with scalars."
|
|
|
|
" Dealing with arrays required alias analysis");
|
|
|
|
|
|
|
|
const Region &R = S.getRegion();
|
|
|
|
for (User *U : SAI->getBasePtr()->users()) {
|
|
|
|
Instruction *I = dyn_cast<Instruction>(U);
|
|
|
|
assert(I && "invalid user of scop array info");
|
|
|
|
if (!R.contains(I))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-07-05 22:57:04 +08:00
|
|
|
/// Compute must-kills needed to enable live range reordering with PPCG.
|
|
|
|
///
|
|
|
|
/// @params S The Scop to compute live range reordering information
|
|
|
|
/// @returns live range reordering information that can be used to setup
|
|
|
|
/// PPCG.
|
|
|
|
static MustKillsInfo computeMustKillsInfo(const Scop &S) {
|
2017-08-07 04:11:59 +08:00
|
|
|
const isl::space ParamSpace = S.getParamSpace();
|
2017-07-05 22:57:04 +08:00
|
|
|
MustKillsInfo Info;
|
|
|
|
|
2017-07-06 21:42:42 +08:00
|
|
|
// 1. Collect all ScopArrayInfo that satisfy *any* of the criteria:
|
|
|
|
// 1.1 phi nodes in scop.
|
|
|
|
// 1.2 scalars that are only used within the scop
|
2017-07-05 22:57:04 +08:00
|
|
|
SmallVector<isl::id, 4> KillMemIds;
|
|
|
|
for (ScopArrayInfo *SAI : S.arrays()) {
|
2017-07-06 21:42:42 +08:00
|
|
|
if (SAI->isPHIKind() ||
|
|
|
|
(SAI->isValueKind() && isScalarUsesContainedInScop(S, SAI)))
|
2017-07-22 07:07:56 +08:00
|
|
|
KillMemIds.push_back(isl::manage(SAI->getBasePtrId().release()));
|
2017-07-05 22:57:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace));
|
2017-07-20 23:48:36 +08:00
|
|
|
Info.MustKills = isl::union_map::empty(isl::space(ParamSpace));
|
2017-07-05 22:57:04 +08:00
|
|
|
|
|
|
|
// Initialising KillsSchedule to `isl_set_empty` creates an empty node in the
|
|
|
|
// schedule:
|
|
|
|
// - filter: "[control] -> { }"
|
|
|
|
// So, we choose to not create this to keep the output a little nicer,
|
|
|
|
// at the cost of some code complexity.
|
|
|
|
Info.KillsSchedule = nullptr;
|
|
|
|
|
2017-07-18 17:15:16 +08:00
|
|
|
for (isl::id &ToKillId : KillMemIds) {
|
2017-07-05 22:57:04 +08:00
|
|
|
isl::id KillStmtId = isl::id::alloc(
|
2017-07-18 17:15:16 +08:00
|
|
|
S.getIslCtx(),
|
|
|
|
std::string("SKill_phantom_").append(ToKillId.get_name()), nullptr);
|
2017-07-05 22:57:04 +08:00
|
|
|
|
|
|
|
// NOTE: construction of tagged_must_kill:
|
|
|
|
// 2. We need to construct a map:
|
2017-07-18 17:15:16 +08:00
|
|
|
// [param] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] }
|
2017-07-05 22:57:04 +08:00
|
|
|
// To construct this, we use `isl_map_domain_product` on 2 maps`:
|
2017-07-18 17:15:16 +08:00
|
|
|
// 2a. StmtToScalar:
|
|
|
|
// [param] -> { Stmt_phantom[] -> scalar_to_kill[] }
|
|
|
|
// 2b. PhantomRefToScalar:
|
|
|
|
// [param] -> { ref_phantom[] -> scalar_to_kill[] }
|
2017-07-05 22:57:04 +08:00
|
|
|
//
|
|
|
|
// Combining these with `isl_map_domain_product` gives us
|
|
|
|
// TaggedMustKill:
|
2017-07-18 17:15:16 +08:00
|
|
|
// [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] }
|
2017-07-05 22:57:04 +08:00
|
|
|
|
2017-07-18 17:15:16 +08:00
|
|
|
// 2a. [param] -> { Stmt[] -> scalar_to_kill[] }
|
|
|
|
isl::map StmtToScalar = isl::map::universe(isl::space(ParamSpace));
|
|
|
|
StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::in, isl::id(KillStmtId));
|
|
|
|
StmtToScalar = StmtToScalar.set_tuple_id(isl::dim::out, isl::id(ToKillId));
|
2017-07-05 22:57:04 +08:00
|
|
|
|
|
|
|
isl::id PhantomRefId = isl::id::alloc(
|
2017-07-18 17:15:16 +08:00
|
|
|
S.getIslCtx(), std::string("ref_phantom") + ToKillId.get_name(),
|
|
|
|
nullptr);
|
2017-07-05 22:57:04 +08:00
|
|
|
|
2017-07-18 17:15:16 +08:00
|
|
|
// 2b. [param] -> { phantom_ref[] -> scalar_to_kill[] }
|
|
|
|
isl::map PhantomRefToScalar = isl::map::universe(isl::space(ParamSpace));
|
|
|
|
PhantomRefToScalar =
|
|
|
|
PhantomRefToScalar.set_tuple_id(isl::dim::in, PhantomRefId);
|
|
|
|
PhantomRefToScalar =
|
|
|
|
PhantomRefToScalar.set_tuple_id(isl::dim::out, ToKillId);
|
2017-07-05 22:57:04 +08:00
|
|
|
|
2017-07-18 17:15:16 +08:00
|
|
|
// 2. [param] -> { [Stmt[] -> phantom_ref[]] -> scalar_to_kill[] }
|
|
|
|
isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar);
|
2017-07-05 22:57:04 +08:00
|
|
|
Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);
|
|
|
|
|
2017-07-20 23:48:36 +08:00
|
|
|
// 2. [param] -> { Stmt[] -> scalar_to_kill[] }
|
|
|
|
Info.MustKills = Info.TaggedMustKills.domain_factor_domain();
|
|
|
|
|
2017-07-05 22:57:04 +08:00
|
|
|
// 3. Create the kill schedule of the form:
|
|
|
|
// "[param] -> { Stmt_phantom[] }"
|
|
|
|
// Then add this to Info.KillsSchedule.
|
|
|
|
isl::space KillStmtSpace = ParamSpace;
|
|
|
|
KillStmtSpace = KillStmtSpace.set_tuple_id(isl::dim::set, KillStmtId);
|
|
|
|
isl::union_set KillStmtDomain = isl::set::universe(KillStmtSpace);
|
|
|
|
|
|
|
|
isl::schedule KillSchedule = isl::schedule::from_domain(KillStmtDomain);
|
|
|
|
if (Info.KillsSchedule)
|
|
|
|
Info.KillsSchedule = Info.KillsSchedule.set(KillSchedule);
|
|
|
|
else
|
|
|
|
Info.KillsSchedule = KillSchedule;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Info;
|
|
|
|
}
|
|
|
|
|
2016-07-14 23:51:32 +08:00
|
|
|
/// Create the ast expressions for a ScopStmt.
|
|
|
|
///
|
|
|
|
/// This function is a callback for to generate the ast expressions for each
|
|
|
|
/// of the scheduled ScopStmts.
|
|
|
|
static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
|
2017-07-24 16:34:24 +08:00
|
|
|
void *StmtT, __isl_take isl_ast_build *Build_C,
|
2016-07-14 23:51:32 +08:00
|
|
|
isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
|
|
|
|
isl_id *Id, void *User),
|
|
|
|
void *UserIndex,
|
|
|
|
isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
|
2016-07-21 21:15:59 +08:00
|
|
|
void *UserExpr) {
|
2016-07-14 23:51:32 +08:00
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
ScopStmt *Stmt = (ScopStmt *)StmtT;
|
2016-07-14 23:51:32 +08:00
|
|
|
|
2017-07-24 16:34:24 +08:00
|
|
|
if (!Stmt || !Build_C)
|
2016-07-21 21:15:59 +08:00
|
|
|
return NULL;
|
|
|
|
|
2017-07-24 16:34:24 +08:00
|
|
|
isl::ast_build Build = isl::manage(isl_ast_build_copy(Build_C));
|
|
|
|
isl::ctx Ctx = Build.get_ctx();
|
|
|
|
isl::id_to_ast_expr RefToExpr = isl::id_to_ast_expr::alloc(Ctx, 0);
|
2016-07-21 21:15:59 +08:00
|
|
|
|
|
|
|
for (MemoryAccess *Acc : *Stmt) {
|
2017-07-24 16:34:24 +08:00
|
|
|
isl::map AddrFunc = Acc->getAddressFunction();
|
2017-08-07 00:39:52 +08:00
|
|
|
AddrFunc = AddrFunc.intersect_domain(Stmt->getDomain());
|
2017-07-24 16:34:24 +08:00
|
|
|
|
|
|
|
isl::id RefId = Acc->getId();
|
|
|
|
isl::pw_multi_aff PMA = isl::pw_multi_aff::from_map(AddrFunc);
|
|
|
|
|
|
|
|
isl::multi_pw_aff MPA = isl::multi_pw_aff(PMA);
|
|
|
|
MPA = MPA.coalesce();
|
|
|
|
MPA = isl::manage(FunctionIndex(MPA.release(), RefId.get(), UserIndex));
|
|
|
|
|
|
|
|
isl::ast_expr Access = Build.access_from(MPA);
|
|
|
|
Access = isl::manage(FunctionExpr(Access.release(), RefId.get(), UserExpr));
|
|
|
|
RefToExpr = RefToExpr.set(RefId, Access);
|
2016-07-21 21:15:59 +08:00
|
|
|
}
|
|
|
|
|
2017-07-24 16:34:24 +08:00
|
|
|
return RefToExpr.release();
|
2016-07-14 23:51:32 +08:00
|
|
|
}
|
2016-07-14 18:22:25 +08:00
|
|
|
|
2017-05-09 18:45:52 +08:00
|
|
|
/// Given a LLVM Type, compute its size in bytes,
|
|
|
|
static int computeSizeInBytes(const Type *T) {
|
|
|
|
int bytes = T->getPrimitiveSizeInBits() / 8;
|
|
|
|
if (bytes == 0)
|
|
|
|
bytes = T->getScalarSizeInBits() / 8;
|
|
|
|
return bytes;
|
|
|
|
}
|
|
|
|
|
2016-07-18 19:56:39 +08:00
|
|
|
/// Generate code for a GPU specific isl AST.
|
|
|
|
///
|
|
|
|
/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
|
2017-06-08 20:06:15 +08:00
|
|
|
/// generates code for general-purpose AST nodes, with special functionality
|
2016-07-18 19:56:39 +08:00
|
|
|
/// for generating GPU specific user nodes.
|
|
|
|
///
|
|
|
|
/// @see GPUNodeBuilder::createUser
|
|
|
|
class GPUNodeBuilder : public IslNodeBuilder {
|
|
|
|
public:
|
2017-04-04 18:01:53 +08:00
|
|
|
GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator,
|
2016-07-18 19:56:39 +08:00
|
|
|
const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
|
2016-11-03 06:32:23 +08:00
|
|
|
DominatorTree &DT, Scop &S, BasicBlock *StartBlock,
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
gpu_prog *Prog, GPURuntime Runtime, GPUArch Arch)
|
2017-04-04 18:01:53 +08:00
|
|
|
: IslNodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock),
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
Prog(Prog), Runtime(Runtime), Arch(Arch) {
|
2016-07-21 21:15:59 +08:00
|
|
|
getExprBuilder().setIDToSAI(&IDToSAI);
|
|
|
|
}
|
2016-07-18 19:56:39 +08:00
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
/// Create after-run-time-check initialization code.
|
|
|
|
void initializeAfterRTH();
|
|
|
|
|
|
|
|
/// Finalize the generated scop.
|
|
|
|
virtual void finalize();
|
|
|
|
|
2016-09-12 14:06:31 +08:00
|
|
|
/// Track if the full build process was successful.
|
|
|
|
///
|
|
|
|
/// This value is set to false, if throughout the build process an error
|
|
|
|
/// occurred which prevents us from generating valid GPU code.
|
|
|
|
bool BuildSuccessful = true;
|
|
|
|
|
2016-09-18 16:31:09 +08:00
|
|
|
/// The maximal number of loops surrounding a sequential kernel.
|
|
|
|
unsigned DeepestSequential = 0;
|
|
|
|
|
|
|
|
/// The maximal number of loops surrounding a parallel kernel.
|
|
|
|
unsigned DeepestParallel = 0;
|
|
|
|
|
Prefix the name of the calling host function in the name of callee GPU kernel
Summary:
Provide more context to the name of a GPU kernel by prefixing its name with the host function that calls it. E.g. The first kernel called by `gemm` would be `FUNC_gemm_KERNEL_0`.
Kernels currently follow the "kernel_#" (# = 0,1,2,3,...) nomenclature. This patch makes it easier to map host caller and device callee, especially when there are many kernels produced by Polly-ACC.
Reviewers: grosser, Meinersbur, bollu, philip.pfaffe, kbarton!
Reviewed By: grosser
Subscribers: nemanjai, pollydev
Tags: #polly
Differential Revision: https://reviews.llvm.org/D33985
llvm-svn: 307173
2017-07-06 00:48:21 +08:00
|
|
|
/// Return the name to set for the ptx_kernel.
|
|
|
|
std::string getKernelFuncName(int Kernel_id);
|
|
|
|
|
2016-07-18 19:56:39 +08:00
|
|
|
private:
|
2016-07-22 15:11:12 +08:00
|
|
|
/// A vector of array base pointers for which a new ScopArrayInfo was created.
|
|
|
|
///
|
|
|
|
/// This vector is used to delete the ScopArrayInfo when it is not needed any
|
|
|
|
/// more.
|
|
|
|
std::vector<Value *> LocalArrays;
|
|
|
|
|
2016-07-25 20:47:39 +08:00
|
|
|
/// A map from ScopArrays to their corresponding device allocations.
|
|
|
|
std::map<ScopArrayInfo *, Value *> DeviceAllocations;
|
2016-07-25 20:47:33 +08:00
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
/// The current GPU context.
|
|
|
|
Value *GPUContext;
|
|
|
|
|
2016-08-04 20:18:14 +08:00
|
|
|
/// The set of isl_ids allocated in the kernel
|
|
|
|
std::vector<isl_id *> KernelIds;
|
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
/// A module containing GPU code.
|
|
|
|
///
|
|
|
|
/// This pointer is only set in case we are currently generating GPU code.
|
|
|
|
std::unique_ptr<Module> GPUModule;
|
|
|
|
|
|
|
|
/// The GPU program we generate code for.
|
|
|
|
gpu_prog *Prog;
|
|
|
|
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
/// The GPU Runtime implementation to use (OpenCL or CUDA).
|
|
|
|
GPURuntime Runtime;
|
|
|
|
|
|
|
|
/// The GPU Architecture to target.
|
|
|
|
GPUArch Arch;
|
|
|
|
|
2016-07-19 15:32:44 +08:00
|
|
|
/// Class to free isl_ids.
|
|
|
|
class IslIdDeleter {
|
|
|
|
public:
|
|
|
|
void operator()(__isl_take isl_id *Id) { isl_id_free(Id); };
|
|
|
|
};
|
|
|
|
|
|
|
|
/// A set containing all isl_ids allocated in a GPU kernel.
|
|
|
|
///
|
|
|
|
/// By releasing this set all isl_ids will be freed.
|
|
|
|
std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;
|
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
IslExprBuilder::IDToScopArrayInfoTy IDToSAI;
|
|
|
|
|
2016-07-18 19:56:39 +08:00
|
|
|
/// Create code for user-defined AST nodes.
|
|
|
|
///
|
|
|
|
/// These AST nodes can be of type:
|
|
|
|
///
|
|
|
|
/// - ScopStmt: A computational statement (TODO)
|
|
|
|
/// - Kernel: A GPU kernel call (TODO)
|
2016-07-25 20:47:39 +08:00
|
|
|
/// - Data-Transfer: A GPU <-> CPU data-transfer
|
2016-07-19 15:33:16 +08:00
|
|
|
/// - In-kernel synchronization
|
|
|
|
/// - In-kernel memory copy statement
|
2016-07-18 19:56:39 +08:00
|
|
|
///
|
2016-07-18 23:44:25 +08:00
|
|
|
/// @param UserStmt The ast node to generate code for.
|
|
|
|
virtual void createUser(__isl_take isl_ast_node *UserStmt);
|
2016-07-19 15:32:38 +08:00
|
|
|
|
2016-07-25 20:47:39 +08:00
|
|
|
enum DataDirection { HOST_TO_DEVICE, DEVICE_TO_HOST };
|
|
|
|
|
|
|
|
/// Create code for a data transfer statement
|
|
|
|
///
|
|
|
|
/// @param TransferStmt The data transfer statement.
|
|
|
|
/// @param Direction The direction in which to transfer data.
|
|
|
|
void createDataTransfer(__isl_take isl_ast_node *TransferStmt,
|
|
|
|
enum DataDirection Direction);
|
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
/// Find llvm::Values referenced in GPU kernel.
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to scan for llvm::Values
|
|
|
|
///
|
2017-08-06 10:39:05 +08:00
|
|
|
/// @returns A tuple, whose:
|
|
|
|
/// - First element contains the set of values referenced by the
|
|
|
|
/// kernel
|
|
|
|
/// - Second element contains the set of functions referenced by the
|
|
|
|
/// kernel. All functions in the set satisfy
|
|
|
|
/// `isValidFunctionInKernel`.
|
|
|
|
/// - Third element contains loops that have induction variables
|
|
|
|
/// which are used in the kernel, *and* these loops are *neither*
|
|
|
|
/// in the scop, nor do they immediately surround the Scop.
|
|
|
|
/// See [Code generation of induction variables of loops outside
|
|
|
|
/// Scops]
|
|
|
|
std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>>
|
2017-06-26 21:12:06 +08:00
|
|
|
getReferencesInKernel(ppcg_kernel *Kernel);
|
2016-07-21 21:15:59 +08:00
|
|
|
|
2016-07-27 21:20:16 +08:00
|
|
|
/// Compute the sizes of the execution grid for a given kernel.
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to compute grid sizes for.
|
|
|
|
///
|
|
|
|
/// @returns A tuple with grid sizes for X and Y dimension
|
|
|
|
std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
|
|
|
|
|
2017-08-06 19:10:38 +08:00
|
|
|
/// Get the managed array pointer for sending host pointers to the device.
|
2017-04-28 19:16:30 +08:00
|
|
|
/// \note
|
|
|
|
/// This is to be used only with managed memory
|
2017-08-06 19:10:38 +08:00
|
|
|
Value *getManagedDeviceArray(gpu_array_info *Array, ScopArrayInfo *ArrayInfo);
|
2017-04-28 19:16:30 +08:00
|
|
|
|
2016-07-27 21:20:16 +08:00
|
|
|
/// Compute the sizes of the thread blocks for a given kernel.
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to compute thread block sizes for.
|
|
|
|
///
|
|
|
|
/// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
|
|
|
|
std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);
|
|
|
|
|
2017-05-09 18:45:52 +08:00
|
|
|
/// Store a specific kernel launch parameter in the array of kernel launch
|
|
|
|
/// parameters.
|
|
|
|
///
|
|
|
|
/// @param Parameters The list of parameters in which to store.
|
|
|
|
/// @param Param The kernel launch parameter to store.
|
|
|
|
/// @param Index The index in the parameter list, at which to store the
|
|
|
|
/// parameter.
|
|
|
|
void insertStoreParameter(Instruction *Parameters, Instruction *Param,
|
|
|
|
int Index);
|
|
|
|
|
2016-07-27 21:20:16 +08:00
|
|
|
/// Create kernel launch parameters.
|
|
|
|
///
|
2016-08-04 14:55:49 +08:00
|
|
|
/// @param Kernel The kernel to create parameters for.
|
|
|
|
/// @param F The kernel function that has been created.
|
|
|
|
/// @param SubtreeValues The set of llvm::Values referenced by this kernel.
|
2016-07-27 21:20:16 +08:00
|
|
|
///
|
|
|
|
/// @returns A stack allocated array with pointers to the parameter
|
|
|
|
/// values that are passed to the kernel.
|
2016-08-04 14:55:49 +08:00
|
|
|
Value *createLaunchParameters(ppcg_kernel *Kernel, Function *F,
|
|
|
|
SetVector<Value *> SubtreeValues);
|
2016-07-27 21:20:16 +08:00
|
|
|
|
2016-08-04 20:18:14 +08:00
|
|
|
/// Create declarations for kernel variables.
|
|
|
|
///
|
|
|
|
/// This includes shared memory declarations.
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel definition to create variables for.
|
|
|
|
/// @param FN The function into which to generate the variables.
|
|
|
|
void createKernelVariables(ppcg_kernel *Kernel, Function *FN);
|
|
|
|
|
2016-08-05 14:47:43 +08:00
|
|
|
/// Add CUDA annotations to module.
|
|
|
|
///
|
|
|
|
/// Add a set of CUDA annotations that declares the maximal block dimensions
|
|
|
|
/// that will be used to execute the CUDA kernel. This allows the NVIDIA
|
|
|
|
/// PTX compiler to bound the number of allocated registers to ensure the
|
|
|
|
/// resulting kernel is known to run with up to as many block dimensions
|
|
|
|
/// as specified here.
|
|
|
|
///
|
|
|
|
/// @param M The module to add the annotations to.
|
|
|
|
/// @param BlockDimX The size of block dimension X.
|
|
|
|
/// @param BlockDimY The size of block dimension Y.
|
|
|
|
/// @param BlockDimZ The size of block dimension Z.
|
|
|
|
void addCUDAAnnotations(Module *M, Value *BlockDimX, Value *BlockDimY,
|
|
|
|
Value *BlockDimZ);
|
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
/// Create GPU kernel.
|
|
|
|
///
|
|
|
|
/// Code generate the kernel described by @p KernelStmt.
|
|
|
|
///
|
|
|
|
/// @param KernelStmt The ast node to generate kernel code for.
|
|
|
|
void createKernel(__isl_take isl_ast_node *KernelStmt);
|
|
|
|
|
2016-07-25 20:47:39 +08:00
|
|
|
/// Generate code that computes the size of an array.
|
|
|
|
///
|
|
|
|
/// @param Array The array for which to compute a size.
|
|
|
|
Value *getArraySize(gpu_array_info *Array);
|
|
|
|
|
2016-09-15 22:05:58 +08:00
|
|
|
/// Generate code to compute the minimal offset at which an array is accessed.
|
|
|
|
///
|
|
|
|
/// The offset of an array is the minimal array location accessed in a scop.
|
|
|
|
///
|
|
|
|
/// Example:
|
|
|
|
///
|
|
|
|
/// for (long i = 0; i < 100; i++)
|
|
|
|
/// A[i + 42] += ...
|
|
|
|
///
|
|
|
|
/// getArrayOffset(A) results in 42.
|
|
|
|
///
|
|
|
|
/// @param Array The array for which to compute the offset.
|
|
|
|
/// @returns An llvm::Value that contains the offset of the array.
|
|
|
|
Value *getArrayOffset(gpu_array_info *Array);
|
|
|
|
|
2016-08-04 14:55:59 +08:00
|
|
|
/// Prepare the kernel arguments for kernel code generation
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to generate code for.
|
|
|
|
/// @param FN The function created for the kernel.
|
|
|
|
void prepareKernelArguments(ppcg_kernel *Kernel, Function *FN);
|
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
/// Create kernel function.
|
|
|
|
///
|
|
|
|
/// Create a kernel function located in a newly created module that can serve
|
|
|
|
/// as target for device code generation. Set the Builder to point to the
|
|
|
|
/// start block of this newly created function.
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to generate code for.
|
2016-07-21 21:15:59 +08:00
|
|
|
/// @param SubtreeValues The set of llvm::Values referenced by this kernel.
|
2017-06-26 21:12:06 +08:00
|
|
|
/// @param SubtreeFunctions The set of llvm::Functions referenced by this
|
|
|
|
/// kernel.
|
2016-07-21 21:15:59 +08:00
|
|
|
void createKernelFunction(ppcg_kernel *Kernel,
|
2017-06-26 21:12:06 +08:00
|
|
|
SetVector<Value *> &SubtreeValues,
|
|
|
|
SetVector<Function *> &SubtreeFunctions);
|
2016-07-19 15:32:38 +08:00
|
|
|
|
|
|
|
/// Create the declaration of a kernel function.
|
|
|
|
///
|
|
|
|
/// The kernel function takes as arguments:
|
|
|
|
///
|
|
|
|
/// - One i8 pointer for each external array reference used in the kernel.
|
2016-07-19 15:32:55 +08:00
|
|
|
/// - Host iterators
|
2016-07-19 15:33:06 +08:00
|
|
|
/// - Parameters
|
2016-07-19 15:32:38 +08:00
|
|
|
/// - Other LLVM Value references (TODO)
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to generate the function declaration for.
|
2016-07-21 21:15:59 +08:00
|
|
|
/// @param SubtreeValues The set of llvm::Values referenced by this kernel.
|
|
|
|
///
|
2016-07-19 15:32:38 +08:00
|
|
|
/// @returns The newly declared function.
|
2016-07-21 21:15:59 +08:00
|
|
|
Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
|
|
|
|
SetVector<Value *> &SubtreeValues);
|
2016-07-19 15:32:38 +08:00
|
|
|
|
2016-07-19 15:32:44 +08:00
|
|
|
/// Insert intrinsic functions to obtain thread and block ids.
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to generate the intrinsic functions for.
|
|
|
|
void insertKernelIntrinsics(ppcg_kernel *Kernel);
|
|
|
|
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
/// Insert function calls to retrieve the SPIR group/local ids.
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to generate the function calls for.
|
|
|
|
void insertKernelCallsSPIR(ppcg_kernel *Kernel);
|
|
|
|
|
2017-06-26 21:12:06 +08:00
|
|
|
/// Setup the creation of functions referenced by the GPU kernel.
|
|
|
|
///
|
|
|
|
/// 1. Create new function declarations in GPUModule which are the same as
|
|
|
|
/// SubtreeFunctions.
|
|
|
|
///
|
|
|
|
/// 2. Populate IslNodeBuilder::ValueMap with mappings from
|
|
|
|
/// old functions (that come from the original module) to new functions
|
|
|
|
/// (that are created within GPUModule). That way, we generate references
|
|
|
|
/// to the correct function (in GPUModule) in BlockGenerator.
|
|
|
|
///
|
|
|
|
/// @see IslNodeBuilder::ValueMap
|
|
|
|
/// @see BlockGenerator::GlobalMap
|
|
|
|
/// @see BlockGenerator::getNewValue
|
|
|
|
/// @see GPUNodeBuilder::getReferencesInKernel.
|
|
|
|
///
|
|
|
|
/// @param SubtreeFunctions The set of llvm::Functions referenced by
|
|
|
|
/// this kernel.
|
|
|
|
void setupKernelSubtreeFunctions(SetVector<Function *> SubtreeFunctions);
|
|
|
|
|
2016-08-04 20:18:14 +08:00
|
|
|
/// Create a global-to-shared or shared-to-global copy statement.
|
|
|
|
///
|
|
|
|
/// @param CopyStmt The copy statement to generate code for
|
|
|
|
void createKernelCopy(ppcg_kernel_stmt *CopyStmt);
|
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
/// Create code for a ScopStmt called in @p Expr.
|
|
|
|
///
|
|
|
|
/// @param Expr The expression containing the call.
|
|
|
|
/// @param KernelStmt The kernel statement referenced in the call.
|
|
|
|
void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);
|
|
|
|
|
2016-07-19 15:33:16 +08:00
|
|
|
/// Create an in-kernel synchronization call.
|
|
|
|
void createKernelSync();
|
|
|
|
|
2016-07-22 15:11:12 +08:00
|
|
|
/// Create a PTX assembly string for the current GPU kernel.
|
|
|
|
///
|
|
|
|
/// @returns A string containing the corresponding PTX assembly code.
|
|
|
|
std::string createKernelASM();
|
|
|
|
|
|
|
|
/// Remove references from the dominator tree to the kernel function @p F.
|
|
|
|
///
|
|
|
|
/// @param F The function to remove references to.
|
|
|
|
void clearDominators(Function *F);
|
|
|
|
|
|
|
|
/// Remove references from scalar evolution to the kernel function @p F.
|
|
|
|
///
|
|
|
|
/// @param F The function to remove references to.
|
|
|
|
void clearScalarEvolution(Function *F);
|
|
|
|
|
|
|
|
/// Remove references from loop info to the kernel function @p F.
|
|
|
|
///
|
|
|
|
/// @param F The function to remove references to.
|
|
|
|
void clearLoops(Function *F);
|
|
|
|
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
/// Check if the scop requires to be linked with CUDA's libdevice.
|
|
|
|
bool requiresCUDALibDevice();
|
|
|
|
|
|
|
|
/// Link with the NVIDIA libdevice library (if needed and available).
|
|
|
|
void addCUDALibDevice();
|
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
/// Finalize the generation of the kernel function.
|
|
|
|
///
|
|
|
|
/// Free the LLVM-IR module corresponding to the kernel and -- if requested --
|
|
|
|
/// dump its IR to stderr.
|
2016-07-26 00:31:21 +08:00
|
|
|
///
|
|
|
|
/// @returns The Assembly string of the kernel.
|
|
|
|
std::string finalizeKernelFunction();
|
2016-07-25 17:16:01 +08:00
|
|
|
|
2016-09-18 03:22:31 +08:00
|
|
|
/// Finalize the generation of the kernel arguments.
|
|
|
|
///
|
|
|
|
/// This function ensures that not-read-only scalars used in a kernel are
|
2017-06-08 20:06:15 +08:00
|
|
|
/// stored back to the global memory location they are backed with before
|
2016-09-18 03:22:31 +08:00
|
|
|
/// the kernel terminates.
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to finalize kernel arguments for.
|
|
|
|
void finalizeKernelArguments(ppcg_kernel *Kernel);
|
|
|
|
|
2016-07-25 20:47:33 +08:00
|
|
|
/// Create code that allocates memory to store arrays on device.
|
2016-07-25 17:16:01 +08:00
|
|
|
void allocateDeviceArrays();
|
|
|
|
|
2017-08-06 19:10:38 +08:00
|
|
|
/// Create code to prepare the managed device pointers.
|
|
|
|
void prepareManagedDeviceArrays();
|
|
|
|
|
2016-07-25 20:47:33 +08:00
|
|
|
/// Free all allocated device arrays.
|
|
|
|
void freeDeviceArrays();
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
/// Create a call to initialize the GPU context.
|
|
|
|
///
|
|
|
|
/// @returns A pointer to the newly initialized context.
|
|
|
|
Value *createCallInitContext();
|
|
|
|
|
2016-07-27 21:20:16 +08:00
|
|
|
/// Create a call to get the device pointer for a kernel allocation.
|
|
|
|
///
|
|
|
|
/// @param Allocation The Polly GPU allocation
|
|
|
|
///
|
|
|
|
/// @returns The device parameter corresponding to this allocation.
|
|
|
|
Value *createCallGetDevicePtr(Value *Allocation);
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
/// Create a call to free the GPU context.
|
|
|
|
///
|
|
|
|
/// @param Context A pointer to an initialized GPU context.
|
|
|
|
void createCallFreeContext(Value *Context);
|
|
|
|
|
2016-07-25 20:47:33 +08:00
|
|
|
/// Create a call to allocate memory on the device.
|
|
|
|
///
|
|
|
|
/// @param Size The size of memory to allocate
|
|
|
|
///
|
|
|
|
/// @returns A pointer that identifies this allocation.
|
2016-07-25 17:16:01 +08:00
|
|
|
Value *createCallAllocateMemoryForDevice(Value *Size);
|
2016-07-25 20:47:33 +08:00
|
|
|
|
|
|
|
/// Create a call to free a device array.
|
|
|
|
///
|
|
|
|
/// @param Array The device array to free.
|
|
|
|
void createCallFreeDeviceMemory(Value *Array);
|
2016-07-25 20:47:39 +08:00
|
|
|
|
|
|
|
/// Create a call to copy data from host to device.
|
|
|
|
///
|
|
|
|
/// @param HostPtr A pointer to the host data that should be copied.
|
|
|
|
/// @param DevicePtr A device pointer specifying the location to copy to.
|
|
|
|
void createCallCopyFromHostToDevice(Value *HostPtr, Value *DevicePtr,
|
|
|
|
Value *Size);
|
|
|
|
|
|
|
|
/// Create a call to copy data from device to host.
|
|
|
|
///
|
|
|
|
/// @param DevicePtr A pointer to the device data that should be copied.
|
|
|
|
/// @param HostPtr A host pointer specifying the location to copy to.
|
|
|
|
void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
|
|
|
|
Value *Size);
|
2016-07-26 00:31:21 +08:00
|
|
|
|
2017-04-28 19:16:30 +08:00
|
|
|
/// Create a call to synchronize Host & Device.
|
|
|
|
/// \note
|
|
|
|
/// This is to be used only with managed memory.
|
|
|
|
void createCallSynchronizeDevice();
|
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
/// Create a call to get a kernel from an assembly string.
|
|
|
|
///
|
|
|
|
/// @param Buffer The string describing the kernel.
|
|
|
|
/// @param Entry The name of the kernel function to call.
|
|
|
|
///
|
|
|
|
/// @returns A pointer to a kernel object
|
|
|
|
Value *createCallGetKernel(Value *Buffer, Value *Entry);
|
|
|
|
|
|
|
|
/// Create a call to free a GPU kernel.
|
|
|
|
///
|
|
|
|
/// @param GPUKernel The kernel to free.
|
|
|
|
void createCallFreeKernel(Value *GPUKernel);
|
2016-07-27 21:20:16 +08:00
|
|
|
|
|
|
|
/// Create a call to launch a GPU kernel.
|
|
|
|
///
|
|
|
|
/// @param GPUKernel The kernel to launch.
|
|
|
|
/// @param GridDimX The size of the first grid dimension.
|
|
|
|
/// @param GridDimY The size of the second grid dimension.
|
|
|
|
/// @param BlockDimX The size of the first block dimension.
|
|
|
|
/// @param BlockDimY The size of the second block dimension.
|
|
|
|
/// @param BlockDimZ The size of the third block dimension.
|
2017-06-08 20:06:15 +08:00
|
|
|
/// @param Parameters A pointer to an array that contains itself pointers to
|
2016-07-27 21:20:16 +08:00
|
|
|
/// the parameter values passed for each kernel argument.
|
|
|
|
void createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
|
|
|
|
Value *GridDimY, Value *BlockDimX,
|
|
|
|
Value *BlockDimY, Value *BlockDimZ,
|
|
|
|
Value *Parameters);
|
2016-07-18 19:56:39 +08:00
|
|
|
};
|
|
|
|
|
Prefix the name of the calling host function in the name of callee GPU kernel
Summary:
Provide more context to the name of a GPU kernel by prefixing its name with the host function that calls it. E.g. The first kernel called by `gemm` would be `FUNC_gemm_KERNEL_0`.
Kernels currently follow the "kernel_#" (# = 0,1,2,3,...) nomenclature. This patch makes it easier to map host caller and device callee, especially when there are many kernels produced by Polly-ACC.
Reviewers: grosser, Meinersbur, bollu, philip.pfaffe, kbarton!
Reviewed By: grosser
Subscribers: nemanjai, pollydev
Tags: #polly
Differential Revision: https://reviews.llvm.org/D33985
llvm-svn: 307173
2017-07-06 00:48:21 +08:00
|
|
|
std::string GPUNodeBuilder::getKernelFuncName(int Kernel_id) {
|
2017-07-13 00:46:19 +08:00
|
|
|
return "FUNC_" + S.getFunction().getName().str() + "_SCOP_" +
|
|
|
|
std::to_string(S.getID()) + "_KERNEL_" + std::to_string(Kernel_id);
|
Prefix the name of the calling host function in the name of callee GPU kernel
Summary:
Provide more context to the name of a GPU kernel by prefixing its name with the host function that calls it. E.g. The first kernel called by `gemm` would be `FUNC_gemm_KERNEL_0`.
Kernels currently follow the "kernel_#" (# = 0,1,2,3,...) nomenclature. This patch makes it easier to map host caller and device callee, especially when there are many kernels produced by Polly-ACC.
Reviewers: grosser, Meinersbur, bollu, philip.pfaffe, kbarton!
Reviewed By: grosser
Subscribers: nemanjai, pollydev
Tags: #polly
Differential Revision: https://reviews.llvm.org/D33985
llvm-svn: 307173
2017-07-06 00:48:21 +08:00
|
|
|
}
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
void GPUNodeBuilder::initializeAfterRTH() {
|
2016-08-09 23:35:03 +08:00
|
|
|
BasicBlock *NewBB = SplitBlock(Builder.GetInsertBlock(),
|
|
|
|
&*Builder.GetInsertPoint(), &DT, &LI);
|
|
|
|
NewBB->setName("polly.acc.initialize");
|
|
|
|
Builder.SetInsertPoint(&NewBB->front());
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
GPUContext = createCallInitContext();
|
2017-04-28 19:16:30 +08:00
|
|
|
|
|
|
|
if (!ManagedMemory)
|
|
|
|
allocateDeviceArrays();
|
2017-08-06 19:10:38 +08:00
|
|
|
else
|
|
|
|
prepareManagedDeviceArrays();
|
2016-07-25 17:16:01 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void GPUNodeBuilder::finalize() {
|
2017-04-28 19:16:30 +08:00
|
|
|
if (!ManagedMemory)
|
|
|
|
freeDeviceArrays();
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
createCallFreeContext(GPUContext);
|
|
|
|
IslNodeBuilder::finalize();
|
|
|
|
}
|
|
|
|
|
|
|
|
void GPUNodeBuilder::allocateDeviceArrays() {
|
2017-04-28 19:16:30 +08:00
|
|
|
assert(!ManagedMemory && "Managed memory will directly send host pointers "
|
|
|
|
"to the kernel. There is no need for device arrays");
|
2017-08-07 03:52:38 +08:00
|
|
|
isl_ast_build *Build = isl_ast_build_from_context(S.getContext().release());
|
2016-07-25 17:16:01 +08:00
|
|
|
|
|
|
|
for (int i = 0; i < Prog->n_array; ++i) {
|
|
|
|
gpu_array_info *Array = &Prog->array[i];
|
2016-07-25 20:47:39 +08:00
|
|
|
auto *ScopArray = (ScopArrayInfo *)Array->user;
|
2016-07-25 20:47:33 +08:00
|
|
|
std::string DevArrayName("p_dev_array_");
|
|
|
|
DevArrayName.append(Array->name);
|
2016-07-25 17:16:01 +08:00
|
|
|
|
2016-07-25 20:47:39 +08:00
|
|
|
Value *ArraySize = getArraySize(Array);
|
2016-09-15 22:05:58 +08:00
|
|
|
Value *Offset = getArrayOffset(Array);
|
|
|
|
if (Offset)
|
|
|
|
ArraySize = Builder.CreateSub(
|
|
|
|
ArraySize,
|
|
|
|
Builder.CreateMul(Offset,
|
|
|
|
Builder.getInt64(ScopArray->getElemSizeInBytes())));
|
2016-07-25 20:47:33 +08:00
|
|
|
Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
|
|
|
|
DevArray->setName(DevArrayName);
|
2016-07-25 20:47:39 +08:00
|
|
|
DeviceAllocations[ScopArray] = DevArray;
|
2016-07-25 17:16:01 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
isl_ast_build_free(Build);
|
|
|
|
}
|
|
|
|
|
2017-08-06 19:10:38 +08:00
|
|
|
void GPUNodeBuilder::prepareManagedDeviceArrays() {
|
|
|
|
assert(ManagedMemory &&
|
|
|
|
"Device array most only be prepared in managed-memory mode");
|
|
|
|
for (int i = 0; i < Prog->n_array; ++i) {
|
|
|
|
gpu_array_info *Array = &Prog->array[i];
|
|
|
|
ScopArrayInfo *ScopArray = (ScopArrayInfo *)Array->user;
|
|
|
|
Value *HostPtr;
|
|
|
|
|
|
|
|
if (gpu_array_is_scalar(Array))
|
|
|
|
HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
|
|
|
|
else
|
|
|
|
HostPtr = ScopArray->getBasePtr();
|
|
|
|
HostPtr = getLatestValue(HostPtr);
|
|
|
|
|
|
|
|
Value *Offset = getArrayOffset(Array);
|
|
|
|
if (Offset) {
|
|
|
|
HostPtr = Builder.CreatePointerCast(
|
|
|
|
HostPtr, ScopArray->getElementType()->getPointerTo());
|
|
|
|
HostPtr = Builder.CreateGEP(HostPtr, Offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
|
|
|
|
DeviceAllocations[ScopArray] = HostPtr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-08-05 14:47:43 +08:00
|
|
|
/// Append a "nvvm.annotations" entry for every function in @p M that uses the
/// PTX_Kernel calling convention.
///
/// Each entry consists of the function followed by the key/value pairs
/// maxntidx/@p BlockDimX, maxntidy/@p BlockDimY and maxntidz/@p BlockDimZ.
void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
                                        Value *BlockDimY, Value *BlockDimZ) {
  auto Annotations = M->getOrInsertNamedMetadata("nvvm.annotations");

  for (auto &Fn : *M) {
    // Only kernel functions are annotated.
    if (Fn.getCallingConv() == CallingConv::PTX_Kernel) {
      Metadata *Operands[] = {
          ValueAsMetadata::get(&Fn),
          MDString::get(M->getContext(), "maxntidx"),
          ValueAsMetadata::get(BlockDimX),
          MDString::get(M->getContext(), "maxntidy"),
          ValueAsMetadata::get(BlockDimY),
          MDString::get(M->getContext(), "maxntidz"),
          ValueAsMetadata::get(BlockDimZ),
      };
      Annotations->addOperand(MDNode::get(M->getContext(), Operands));
    }
  }
}
|
|
|
|
|
2016-07-25 20:47:33 +08:00
|
|
|
void GPUNodeBuilder::freeDeviceArrays() {
|
2017-04-28 19:16:30 +08:00
|
|
|
assert(!ManagedMemory && "Managed memory does not use device arrays");
|
2016-07-25 20:47:39 +08:00
|
|
|
for (auto &Array : DeviceAllocations)
|
|
|
|
createCallFreeDeviceMemory(Array.second);
|
2016-07-25 20:47:33 +08:00
|
|
|
}
|
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
/// Emit a call to polly_getKernel(@p Buffer, @p Entry) and return its result.
///
/// The callee is looked up in the module containing the current insert block
/// and declared there if it does not exist yet.
Value *GPUNodeBuilder::createCallGetKernel(Value *Buffer, Value *Entry) {
  const char *Name = "polly_getKernel";
  Function *Caller = Builder.GetInsertBlock()->getParent();
  Module *M = Caller->getParent();
  Function *F = M->getFunction(Name);

  // Declare "i8* polly_getKernel(i8*, i8*)" on first use.
  if (!F) {
    Type *I8Ptr = Builder.getInt8PtrTy();
    std::vector<Type *> ArgTypes = {I8Ptr, I8Ptr};
    FunctionType *Ty = FunctionType::get(I8Ptr, ArgTypes, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  return Builder.CreateCall(F, {Buffer, Entry});
}
|
|
|
|
|
2016-07-27 21:20:16 +08:00
|
|
|
/// Emit a call to polly_getDevicePtr(@p Allocation) and return its result.
///
/// The callee is looked up in the module containing the current insert block
/// and declared there if it does not exist yet.
Value *GPUNodeBuilder::createCallGetDevicePtr(Value *Allocation) {
  const char *Name = "polly_getDevicePtr";
  Function *Caller = Builder.GetInsertBlock()->getParent();
  Module *M = Caller->getParent();
  Function *F = M->getFunction(Name);

  // Declare "i8* polly_getDevicePtr(i8*)" on first use.
  if (!F) {
    Type *I8Ptr = Builder.getInt8PtrTy();
    std::vector<Type *> ArgTypes = {I8Ptr};
    FunctionType *Ty = FunctionType::get(I8Ptr, ArgTypes, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  return Builder.CreateCall(F, {Allocation});
}
|
|
|
|
|
|
|
|
/// Emit a call to polly_launchKernel with the kernel, the two grid
/// dimensions, the three block dimensions and the parameter array.
///
/// The callee is looked up in the module containing the current insert block
/// and declared there if it does not exist yet.
void GPUNodeBuilder::createCallLaunchKernel(Value *GPUKernel, Value *GridDimX,
                                            Value *GridDimY, Value *BlockDimX,
                                            Value *BlockDimY, Value *BlockDimZ,
                                            Value *Parameters) {
  const char *Name = "polly_launchKernel";
  Function *Caller = Builder.GetInsertBlock()->getParent();
  Module *M = Caller->getParent();
  Function *F = M->getFunction(Name);

  // Declare "void polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)" on
  // first use.
  if (!F) {
    Type *I8Ptr = Builder.getInt8PtrTy();
    Type *I32 = Builder.getInt32Ty();
    std::vector<Type *> ArgTypes = {I8Ptr, I32, I32, I32, I32, I32, I8Ptr};
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), ArgTypes, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
                         BlockDimZ, Parameters});
}
|
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
/// Emit a call to polly_freeKernel(@p GPUKernel).
///
/// The callee is looked up in the module containing the current insert block
/// and declared there if it does not exist yet.
void GPUNodeBuilder::createCallFreeKernel(Value *GPUKernel) {
  const char *Name = "polly_freeKernel";
  Function *Caller = Builder.GetInsertBlock()->getParent();
  Module *M = Caller->getParent();
  Function *F = M->getFunction(Name);

  // Declare "void polly_freeKernel(i8*)" on first use.
  if (!F) {
    std::vector<Type *> ArgTypes = {Builder.getInt8PtrTy()};
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), ArgTypes, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall(F, {GPUKernel});
}
|
|
|
|
|
2016-07-25 20:47:33 +08:00
|
|
|
/// Emit a call to polly_freeDeviceMemory(@p Array).
///
/// Must not be used in managed-memory mode. The callee is looked up in the
/// module containing the current insert block and declared there if it does
/// not exist yet.
void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
                           "for device");

  const char *Name = "polly_freeDeviceMemory";
  Function *Caller = Builder.GetInsertBlock()->getParent();
  Module *M = Caller->getParent();
  Function *F = M->getFunction(Name);

  // Declare "void polly_freeDeviceMemory(i8*)" on first use.
  if (!F) {
    std::vector<Type *> ArgTypes = {Builder.getInt8PtrTy()};
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), ArgTypes, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall(F, {Array});
}
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
/// Emit a call to polly_allocateMemoryForDevice(@p Size) and return its
/// result.
///
/// Must not be used in managed-memory mode. The callee is looked up in the
/// module containing the current insert block and declared there if it does
/// not exist yet.
Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
                           "for device");

  const char *Name = "polly_allocateMemoryForDevice";
  Function *Caller = Builder.GetInsertBlock()->getParent();
  Module *M = Caller->getParent();
  Function *F = M->getFunction(Name);

  // Declare "i8* polly_allocateMemoryForDevice(i64)" on first use.
  if (!F) {
    std::vector<Type *> ArgTypes = {Builder.getInt64Ty()};
    FunctionType *Ty =
        FunctionType::get(Builder.getInt8PtrTy(), ArgTypes, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  return Builder.CreateCall(F, {Size});
}
|
|
|
|
|
2016-07-25 20:47:39 +08:00
|
|
|
/// Emit a call to polly_copyFromHostToDevice(@p HostData, @p DeviceData,
/// @p Size).
///
/// Must not be used in managed-memory mode. The callee is looked up in the
/// module containing the current insert block and declared there if it does
/// not exist yet.
void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
                                                    Value *DeviceData,
                                                    Value *Size) {
  assert(!ManagedMemory && "Managed memory does not transfer memory between "
                           "device and host");

  const char *Name = "polly_copyFromHostToDevice";
  Function *Caller = Builder.GetInsertBlock()->getParent();
  Module *M = Caller->getParent();
  Function *F = M->getFunction(Name);

  // Declare "void polly_copyFromHostToDevice(i8*, i8*, i64)" on first use.
  if (!F) {
    Type *I8Ptr = Builder.getInt8PtrTy();
    std::vector<Type *> ArgTypes = {I8Ptr, I8Ptr, Builder.getInt64Ty()};
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), ArgTypes, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall(F, {HostData, DeviceData, Size});
}
|
|
|
|
|
|
|
|
/// Emit a call to polly_copyFromDeviceToHost(@p DeviceData, @p HostData,
/// @p Size).
///
/// Must not be used in managed-memory mode. The callee is looked up in the
/// module containing the current insert block and declared there if it does
/// not exist yet.
void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
                                                    Value *HostData,
                                                    Value *Size) {
  assert(!ManagedMemory && "Managed memory does not transfer memory between "
                           "device and host");

  const char *Name = "polly_copyFromDeviceToHost";
  Function *Caller = Builder.GetInsertBlock()->getParent();
  Module *M = Caller->getParent();
  Function *F = M->getFunction(Name);

  // Declare "void polly_copyFromDeviceToHost(i8*, i8*, i64)" on first use.
  if (!F) {
    Type *I8Ptr = Builder.getInt8PtrTy();
    std::vector<Type *> ArgTypes = {I8Ptr, I8Ptr, Builder.getInt64Ty()};
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), ArgTypes, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall(F, {DeviceData, HostData, Size});
}
|
|
|
|
|
2017-04-28 19:16:30 +08:00
|
|
|
void GPUNodeBuilder::createCallSynchronizeDevice() {
|
|
|
|
assert(ManagedMemory && "explicit synchronization is only necessary for "
|
|
|
|
"managed memory");
|
|
|
|
const char *Name = "polly_synchronizeDevice";
|
|
|
|
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
|
|
|
Function *F = M->getFunction(Name);
|
|
|
|
|
|
|
|
// If F is not available, declare it.
|
|
|
|
if (!F) {
|
|
|
|
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
|
|
|
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
|
|
|
|
F = Function::Create(Ty, Linkage, Name, M);
|
|
|
|
}
|
|
|
|
|
|
|
|
Builder.CreateCall(F);
|
|
|
|
}
|
|
|
|
|
2016-07-25 17:16:01 +08:00
|
|
|
/// Emit a call to the context initialization function of the selected GPU
/// runtime and return its result.
///
/// The callee is polly_initContextCUDA for GPURuntime::CUDA and
/// polly_initContextCL for GPURuntime::OpenCL. It is looked up in the module
/// containing the current insert block and declared there if it does not
/// exist yet.
Value *GPUNodeBuilder::createCallInitContext() {
  // Start from nullptr so that a runtime value not handled below is caught by
  // the assertion instead of reading an uninitialized pointer.
  const char *Name = nullptr;

  switch (Runtime) {
  case GPURuntime::CUDA:
    Name = "polly_initContextCUDA";
    break;
  case GPURuntime::OpenCL:
    Name = "polly_initContextCL";
    break;
  }
  assert(Name && "Unknown GPU runtime");

  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *F = M->getFunction(Name);

  // If F is not available, declare it as "i8* <Name>()".
  if (!F) {
    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
    std::vector<Type *> Args;
    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(), Args, false);
    F = Function::Create(Ty, Linkage, Name, M);
  }

  return Builder.CreateCall(F, {});
}
|
|
|
|
|
|
|
|
/// Emit a call to polly_freeContext(@p Context).
///
/// The callee is looked up in the module containing the current insert block
/// and declared there if it does not exist yet.
void GPUNodeBuilder::createCallFreeContext(Value *Context) {
  const char *Name = "polly_freeContext";
  Function *Caller = Builder.GetInsertBlock()->getParent();
  Module *M = Caller->getParent();
  Function *F = M->getFunction(Name);

  // Declare "void polly_freeContext(i8*)" on first use.
  if (!F) {
    std::vector<Type *> ArgTypes = {Builder.getInt8PtrTy()};
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), ArgTypes, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall(F, {Context});
}
|
|
|
|
|
2016-07-19 15:33:16 +08:00
|
|
|
/// Check if one string is a prefix of another.
///
/// An empty @p Prefix is a prefix of every string.
///
/// @param String The string in which to look for the prefix.
/// @param Prefix The prefix to look for.
///
/// @returns True if @p String starts with @p Prefix, false otherwise.
static bool isPrefix(const std::string &String, const std::string &Prefix) {
  // Compare only the leading Prefix.size() characters. Unlike
  // String.find(Prefix) == 0 this does not scan the rest of String when the
  // beginning does not match.
  return String.compare(0, Prefix.size(), Prefix) == 0;
}
|
|
|
|
|
2016-07-25 20:47:39 +08:00
|
|
|
/// Emit code computing the size of @p Array.
///
/// The result starts as the i64 constant Array->size. For non-scalar arrays
/// it is multiplied by the product of the bounds of all Array->n_index
/// dimensions, sign-extended to i64 if its type differs.
Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
  Value *ElemSize = ConstantInt::get(Builder.getInt64Ty(), Array->size);

  if (gpu_array_is_scalar(Array))
    return ElemSize;

  isl::ast_build Build = isl::ast_build::from_context(S.getContext());
  isl::multi_pw_aff Bounds = isl::manage(isl_multi_pw_aff_copy(Array->bound));

  // Multiply the bounds of all dimensions, starting with dimension zero.
  isl::ast_expr Product = Build.expr_from(Bounds.get_pw_aff(0));
  for (unsigned int Dim = 1; Dim < Array->n_index; Dim++)
    Product = Product.mul(Build.expr_from(Bounds.get_pw_aff(Dim)));

  Value *NumElements = ExprBuilder.create(Product.release());
  Type *SizeTy = ElemSize->getType();
  if (NumElements->getType() != SizeTy)
    NumElements = Builder.CreateSExt(NumElements, SizeTy);

  return Builder.CreateMul(ElemSize, NumElements);
}
|
|
|
|
|
2016-09-15 22:05:58 +08:00
|
|
|
/// Emit code computing the offset of @p Array, derived from the
/// lexicographic minimum of Array->extent.
///
/// @returns nullptr for scalar arrays and for arrays whose lexicographic
///          minimum is contained in the all-zero point; otherwise the value
///          of the accumulated offset expression.
Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
  // No offset is computed for scalars.
  if (gpu_array_is_scalar(Array))
    return nullptr;

  isl::ast_build Build = isl::ast_build::from_context(S.getContext());
  isl::set Min = isl::manage(isl_set_copy(Array->extent)).lexmin();
  const auto NumDims = Min.dim(isl::dim::set);

  // Build the set that fixes every dimension to zero.
  isl::set Origin = isl::set::universe(Min.get_space());
  for (long Dim = 0; Dim < NumDims; Dim++)
    Origin = Origin.fix_si(isl::dim::set, Dim, 0);

  // Nothing to do if the minimum lies within the origin.
  if (Min.is_subset(Origin))
    return nullptr;

  isl::ast_expr Offset = isl::ast_expr::from_val(isl::val(Min.get_ctx(), 0));
  for (long Dim = 0; Dim < NumDims; Dim++) {
    // For all but the first dimension, scale what has been accumulated so far
    // before adding the minimum of this dimension.
    // NOTE(review): the factor is the bound of dimension Dim - 1, while
    // getArraySize() multiplies the bounds of dimensions 0..n-1; confirm that
    // Dim - 1 (and not Dim) is the intended index here.
    if (Dim > 0) {
      isl::pw_aff PrevBound =
          isl::manage(isl_multi_pw_aff_get_pw_aff(Array->bound, Dim - 1));
      Offset = Offset.mul(Build.expr_from(PrevBound));
    }

    isl::pw_aff DimMin = Min.dim_min(Dim);
    Offset = Offset.add(Build.expr_from(DimMin));
  }

  return ExprBuilder.create(Offset.release());
}
|
|
|
|
|
2017-08-06 19:10:38 +08:00
|
|
|
/// Return the value recorded in DeviceAllocations for @p ArrayInfo.
///
/// Only valid in managed-memory mode; an entry for @p ArrayInfo is expected
/// to exist. @p Array is not used.
Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info *Array,
                                             ScopArrayInfo *ArrayInfo) {
  assert(ManagedMemory && "Only used when you wish to get a host "
                          "pointer for sending data to the kernel, "
                          "with managed memory");

  auto Entry = DeviceAllocations.find(ArrayInfo);
  assert(Entry != DeviceAllocations.end() &&
         "Device array expected to be available");
  return Entry->second;
}
|
|
|
|
|
2016-07-25 20:47:39 +08:00
|
|
|
/// Emit the runtime call that copies one array between host and device.
///
/// The array is identified through the user pointer of the identifier that
/// forms the first operand of the statement's expression. If getArrayOffset
/// yields a non-null offset, the host pointer is advanced by that many
/// elements and the transfer size is reduced by the corresponding number of
/// bytes.
///
/// @param TransferStmt The user node describing the transfer; it is freed
///                     before returning.
/// @param Direction    HOST_TO_DEVICE copies to the device, any other value
///                     copies back to the host.
void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                        enum DataDirection Direction) {
  assert(!ManagedMemory && "Managed memory needs no data transfers");

  isl_ast_expr *StmtExpr = isl_ast_node_user_get_expr(TransferStmt);
  isl_ast_expr *ArrayArg = isl_ast_expr_get_op_arg(StmtExpr, 0);
  isl_id *ArrayId = isl_ast_expr_get_id(ArrayArg);
  auto *Array = static_cast<gpu_array_info *>(isl_id_get_user(ArrayId));
  auto *ScopArray = (ScopArrayInfo *)(Array->user);

  Value *Size = getArraySize(Array);
  Value *Offset = getArrayOffset(Array);
  Value *DevPtr = DeviceAllocations[ScopArray];

  // Scalars are read from / written to their alloca, arrays through their
  // base pointer.
  Value *HostPtr;
  if (!gpu_array_is_scalar(Array))
    HostPtr = ScopArray->getBasePtr();
  else
    HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
  HostPtr = getLatestValue(HostPtr);

  // Skip the first Offset elements on the host side.
  if (Offset) {
    Type *ElementPtrTy = ScopArray->getElementType()->getPointerTo();
    HostPtr = Builder.CreatePointerCast(HostPtr, ElementPtrTy);
    HostPtr = Builder.CreateGEP(HostPtr, Offset);
  }

  HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());

  // The skipped elements do not need to be transferred either.
  if (Offset) {
    Value *SkippedBytes = Builder.CreateMul(
        Offset, Builder.getInt64(ScopArray->getElemSizeInBytes()));
    Size = Builder.CreateSub(Size, SkippedBytes);
  }

  if (Direction != HOST_TO_DEVICE)
    createCallCopyFromDeviceToHost(DevPtr, HostPtr, Size);
  else
    createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);

  isl_id_free(ArrayId);
  isl_ast_expr_free(ArrayArg);
  isl_ast_expr_free(StmtExpr);
  isl_ast_node_free(TransferStmt);
}
|
|
|
|
|
2016-07-18 23:44:25 +08:00
|
|
|
/// Generate code for a user node of the GPU AST.
///
/// The node is dispatched on the name of the identifier that forms the first
/// operand of its expression:
///   - "kernel"                   -> createKernel
///   - "init_device"              -> initializeAfterRTH
///   - "clear_device"             -> finalize
///   - prefix "to_device"         -> host-to-device transfer (skipped with
///                                   managed memory)
///   - prefix "from_device"       -> device-to-host transfer (with managed
///                                   memory only a device synchronization)
/// Any other node is expected to carry a ppcg_kernel_stmt annotation whose
/// type selects between createScopStmt, createKernelCopy and createKernelSync.
///
/// @param UserStmt The user node to generate code for. The node is consumed:
///                 it is either freed here or handed to a callee that frees
///                 it.
void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(UserStmt);
  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
  isl_id *Id = isl_ast_expr_get_id(StmtExpr);

  // Query the name while we still hold our own reference to Id; only then
  // drop the references obtained above.
  // NOTE(review): Str points into the identifier and is used after Id is
  // released; this presumably stays valid because Expr still references the
  // identifier until Expr itself is freed -- confirm against isl's ownership
  // rules.
  const char *Str = isl_id_get_name(Id);
  isl_id_free(Id);
  isl_ast_expr_free(StmtExpr);

  if (!strcmp(Str, "kernel")) {
    // createKernel takes ownership of UserStmt.
    createKernel(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }
  if (!strcmp(Str, "init_device")) {
    initializeAfterRTH();
    isl_ast_node_free(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }
  if (!strcmp(Str, "clear_device")) {
    finalize();
    isl_ast_node_free(UserStmt);
    isl_ast_expr_free(Expr);
    return;
  }
  if (isPrefix(Str, "to_device")) {
    // createDataTransfer takes ownership of UserStmt; with managed memory no
    // explicit copy is emitted and the node is released here.
    if (!ManagedMemory)
      createDataTransfer(UserStmt, HOST_TO_DEVICE);
    else
      isl_ast_node_free(UserStmt);

    isl_ast_expr_free(Expr);
    return;
  }

  if (isPrefix(Str, "from_device")) {
    if (!ManagedMemory) {
      createDataTransfer(UserStmt, DEVICE_TO_HOST);
    } else {
      // No copy is emitted with managed memory; only a device
      // synchronization call is generated.
      createCallSynchronizeDevice();
      isl_ast_node_free(UserStmt);
    }
    isl_ast_expr_free(Expr);
    return;
  }

  isl_id *Anno = isl_ast_node_get_annotation(UserStmt);
  struct ppcg_kernel_stmt *KernelStmt =
      (struct ppcg_kernel_stmt *)isl_id_get_user(Anno);
  isl_id_free(Anno);

  switch (KernelStmt->type) {
  case ppcg_kernel_domain:
    // Expr is passed on to createScopStmt and is not freed here.
    createScopStmt(Expr, KernelStmt);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_copy:
    createKernelCopy(KernelStmt);
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  case ppcg_kernel_sync:
    createKernelSync();
    isl_ast_expr_free(Expr);
    isl_ast_node_free(UserStmt);
    return;
  }

  // Statement types not handled above generate no code.
  isl_ast_expr_free(Expr);
  isl_ast_node_free(UserStmt);
  return;
}
|
2016-08-04 20:18:14 +08:00
|
|
|
/// Emit a single-element copy for a ppcg copy statement.
///
/// The addresses of the statement's local_index and index expressions are
/// materialized (in that order). When the statement's `read` flag is set the
/// element is loaded from the `index` address and stored to the `local_index`
/// address; otherwise it is copied the other way round.
///
/// @param KernelStmt The copy statement; its `u.c` fields are read.
void GPUNodeBuilder::createKernelCopy(ppcg_kernel_stmt *KernelStmt) {
  isl_ast_expr *LocalAccess = isl_ast_expr_address_of(
      isl_ast_expr_copy(KernelStmt->u.c.local_index));
  Value *LocalAddr = ExprBuilder.create(LocalAccess);

  isl_ast_expr *GlobalAccess =
      isl_ast_expr_address_of(isl_ast_expr_copy(KernelStmt->u.c.index));
  Value *GlobalAddr = ExprBuilder.create(GlobalAccess);

  // Select the direction once, then emit one load/store pair.
  const bool IsRead = KernelStmt->u.c.read;
  Value *Source = IsRead ? GlobalAddr : LocalAddr;
  Value *Target = IsRead ? LocalAddr : GlobalAddr;
  const char *LoadName = IsRead ? "shared.read" : "shared.write";

  LoadInst *Element = Builder.CreateLoad(Source, LoadName);
  Builder.CreateStore(Element, Target);
}
|
2016-07-18 23:44:25 +08:00
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
/// Copy the ScopStmt referenced by a ppcg domain statement into the kernel.
///
/// A loop-to-SCEV map is seeded with OutsideLoopIterations, extended through
/// createSubstitutions, and then used to copy the statement with the block
/// generator (block statements) or the region generator (all others).
///
/// @param Expr       The user expression of the statement; passed on to
///                   createSubstitutions.
/// @param KernelStmt The domain statement; its `u.d` fields are read.
void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
                                    ppcg_kernel_stmt *KernelStmt) {
  auto *Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
  isl_id_to_ast_expr *AccessExprs = KernelStmt->u.d.ref2expr;

  LoopToScevMapT LoopToScev;
  LoopToScev.insert(OutsideLoopIterations.begin(),
                    OutsideLoopIterations.end());

  createSubstitutions(Expr, Stmt, LoopToScev);

  if (!Stmt->isBlockStmt()) {
    RegionGen.copyStmt(*Stmt, LoopToScev, AccessExprs);
    return;
  }

  BlockGen.copyStmt(*Stmt, LoopToScev, AccessExprs);
}
|
|
|
|
|
2016-07-19 15:33:16 +08:00
|
|
|
void GPUNodeBuilder::createKernelSync() {
|
|
|
|
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
const char *SpirName = "__gen_ocl_barrier_global";
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
|
|
|
|
Function *Sync;
|
|
|
|
|
|
|
|
switch (Arch) {
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
case GPUArch::SPIR64:
|
|
|
|
case GPUArch::SPIR32:
|
|
|
|
Sync = M->getFunction(SpirName);
|
|
|
|
|
|
|
|
// If Sync is not available, declare it.
|
|
|
|
if (!Sync) {
|
|
|
|
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
|
|
|
|
std::vector<Type *> Args;
|
|
|
|
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Args, false);
|
|
|
|
Sync = Function::Create(Ty, Linkage, SpirName, M);
|
|
|
|
Sync->setCallingConv(CallingConv::SPIR_FUNC);
|
|
|
|
}
|
|
|
|
break;
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
case GPUArch::NVPTX64:
|
|
|
|
Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2016-07-19 15:33:16 +08:00
|
|
|
Builder.CreateCall(Sync, {});
|
|
|
|
}
|
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
/// Collect llvm::Values referenced from @p Node
|
|
|
|
///
|
|
|
|
/// This function only applies to isl_ast_nodes that are user_nodes referring
|
|
|
|
/// to a ScopStmt. All other node types are ignore.
|
|
|
|
///
|
|
|
|
/// @param Node The node to collect references for.
|
|
|
|
/// @param User A user pointer used as storage for the data that is collected.
|
|
|
|
///
|
|
|
|
/// @returns isl_bool_true if data could be collected successfully.
|
|
|
|
isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
|
|
|
|
if (isl_ast_node_get_type(Node) != isl_ast_node_user)
|
|
|
|
return isl_bool_true;
|
|
|
|
|
|
|
|
isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
|
|
|
|
isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
|
|
|
|
isl_id *Id = isl_ast_expr_get_id(StmtExpr);
|
|
|
|
const char *Str = isl_id_get_name(Id);
|
|
|
|
isl_id_free(Id);
|
|
|
|
isl_ast_expr_free(StmtExpr);
|
|
|
|
isl_ast_expr_free(Expr);
|
|
|
|
|
|
|
|
if (!isPrefix(Str, "Stmt"))
|
|
|
|
return isl_bool_true;
|
|
|
|
|
|
|
|
Id = isl_ast_node_get_annotation(Node);
|
|
|
|
auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
|
|
|
|
auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
|
|
|
|
isl_id_free(Id);
|
|
|
|
|
2016-08-04 14:55:59 +08:00
|
|
|
addReferencesFromStmt(Stmt, User, false /* CreateScalarRefs */);
|
2016-07-21 21:15:59 +08:00
|
|
|
|
|
|
|
return isl_bool_true;
|
|
|
|
}
|
|
|
|
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
/// Names of the functions that are treated as available in NVIDIA's
/// libdevice. A function whose name appears here is mapped to the name
/// prefixed with "__nv_".
const std::set<std::string> CUDALibDeviceFunctions{
    // exp
    "exp", "expf", "expl",
    // cos
    "cos", "cosf",
    // sqrt
    "sqrt", "sqrtf",
    // copysign
    "copysign", "copysignf", "copysignl",
};
|
|
|
|
|
|
|
|
/// Return the corresponding CUDA libdevice function name for @p F.
|
|
|
|
///
|
|
|
|
/// Return "" if we are not compiling for CUDA.
|
|
|
|
std::string getCUDALibDeviceFuntion(Function *F) {
|
|
|
|
if (CUDALibDeviceFunctions.count(F->getName()))
|
|
|
|
return std::string("__nv_") + std::string(F->getName());
|
|
|
|
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
|
2017-06-26 21:12:06 +08:00
|
|
|
/// Check if F is a function that we can code-generate in a GPU kernel.
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
static bool isValidFunctionInKernel(llvm::Function *F, bool AllowLibDevice) {
|
2017-06-26 21:12:06 +08:00
|
|
|
assert(F && "F is an invalid pointer");
|
|
|
|
// We string compare against the name of the function to allow
|
2017-07-21 02:26:34 +08:00
|
|
|
// all variants of the intrinsic "llvm.sqrt.*", "llvm.fabs", and
|
|
|
|
// "llvm.copysign".
|
|
|
|
const StringRef Name = F->getName();
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
|
|
|
|
if (AllowLibDevice && getCUDALibDeviceFuntion(F).length() > 0)
|
|
|
|
return true;
|
|
|
|
|
2017-07-21 02:26:34 +08:00
|
|
|
return F->isIntrinsic() &&
|
|
|
|
(Name.startswith("llvm.sqrt") || Name.startswith("llvm.fabs") ||
|
|
|
|
Name.startswith("llvm.copysign"));
|
2017-06-26 21:12:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Do not take `Function` as a subtree value.
|
|
|
|
///
|
|
|
|
/// We try to take the reference of all subtree values and pass them along
|
|
|
|
/// to the kernel from the host. Taking an address of any function and
|
|
|
|
/// trying to pass along is nonsensical. Only allow `Value`s that are not
|
|
|
|
/// `Function`s.
|
|
|
|
static bool isValidSubtreeValue(llvm::Value *V) { return !isa<Function>(V); }
|
|
|
|
|
|
|
|
/// Return `Function`s from `RawSubtreeValues`.
|
|
|
|
static SetVector<Function *>
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues,
|
|
|
|
bool AllowCUDALibDevice) {
|
2017-06-26 21:12:06 +08:00
|
|
|
SetVector<Function *> SubtreeFunctions;
|
|
|
|
for (Value *It : RawSubtreeValues) {
|
|
|
|
Function *F = dyn_cast<Function>(It);
|
|
|
|
if (F) {
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
assert(isValidFunctionInKernel(F, AllowCUDALibDevice) &&
|
|
|
|
"Code should have bailed out by "
|
|
|
|
"this point if an invalid function "
|
|
|
|
"were present in a kernel.");
|
2017-06-26 21:12:06 +08:00
|
|
|
SubtreeFunctions.insert(F);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return SubtreeFunctions;
|
|
|
|
}
|
|
|
|
|
2017-08-06 10:39:05 +08:00
|
|
|
/// Collect the values, functions and loops referenced from a kernel.
///
/// Candidate values are gathered from IDToValue, OutsideLoopIterations, the
/// ScopStmts below Kernel->tree and the SCEVs collected on the way. Base
/// pointers of the scop's arrays and the values bound to the scop parameters
/// and to the set dimensions of Kernel->space are removed again.
///
/// @param Kernel The kernel whose AST (Kernel->tree) is inspected.
///
/// @returns A tuple of
///          - the remaining non-function values, each replaced by its
///            ValueMap entry if one exists,
///          - the functions among the remaining values, and
///          - the loops found in the collected SCEVs that are neither
///            contained in the scop nor contain the scop's entry block.
std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>>
GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
  SetVector<Value *> SubtreeValues;
  SetVector<const SCEV *> SCEVs;
  SetVector<const Loop *> Loops;
  SubtreeReferences References = {
      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};

  for (const auto &IdAndValue : IDToValue)
    SubtreeValues.insert(IdAndValue.second);

  // NOTE: this is populated in IslNodeBuilder::addParameters
  // See [Code generation of induction variables of loops outside Scops].
  for (const auto &LoopAndIteration : OutsideLoopIterations) {
    const auto *Unknown = cast<SCEVUnknown>(LoopAndIteration.second);
    SubtreeValues.insert(Unknown->getValue());
  }

  isl_ast_node_foreach_descendant_top_down(
      Kernel->tree, collectReferencesInGPUStmt, &References);

  for (const SCEV *Expr : SCEVs) {
    findValues(Expr, SE, SubtreeValues);
    findLoops(Expr, Loops);
  }

  // Keep only loops that lie outside the scop and do not surround it.
  Loops.remove_if([this](const Loop *L) {
    return S.contains(L) || L->contains(S.getEntry());
  });

  for (auto &SAI : S.arrays())
    SubtreeValues.remove(SAI->getBasePtr());

  // Remove the value bound to every dimension of kind DimType in Space.
  auto RemoveDimValues = [&](isl_space *Space, isl_dim_type DimType) {
    for (long Pos = 0; Pos < isl_space_dim(Space, DimType); Pos++) {
      isl_id *DimId = isl_space_get_dim_id(Space, DimType, Pos);
      assert(IDToValue.count(DimId));
      Value *Bound = IDToValue[DimId];
      SubtreeValues.remove(Bound);
      isl_id_free(DimId);
    }
  };

  isl_space *ParamSpace = S.getParamSpace().release();
  RemoveDimValues(ParamSpace, isl_dim_param);
  isl_space_free(ParamSpace);

  RemoveDimValues(Kernel->space, isl_dim_set);

  // Note: { ValidSubtreeValues, ValidSubtreeFunctions } partitions
  // SubtreeValues. This is important, because we should not lose any
  // SubtreeValues in the process of constructing the
  // "ValidSubtree{Values, Functions} sets. Nor should the set
  // ValidSubtree{Values, Functions} have any common element.
  auto NonFunctionValues =
      make_filter_range(SubtreeValues, isValidSubtreeValue);
  SetVector<Value *> ValidSubtreeValues(NonFunctionValues.begin(),
                                        NonFunctionValues.end());

  const bool AllowCUDALibDevice = Arch == GPUArch::NVPTX64;

  SetVector<Function *> ValidSubtreeFunctions(
      getFunctionsFromRawSubtreeValues(SubtreeValues, AllowCUDALibDevice));

  // @see IslNodeBuilder::getReferencesInSubtree
  SetVector<Value *> ReplacedValues;
  for (Value *V : ValidSubtreeValues) {
    Value *Replacement = V;
    auto Mapped = ValueMap.find(V);
    if (Mapped != ValueMap.end())
      Replacement = Mapped->second;
    ReplacedValues.insert(Replacement);
  }

  return std::make_tuple(ReplacedValues, ValidSubtreeFunctions, Loops);
}
|
|
|
|
|
2016-07-22 15:11:12 +08:00
|
|
|
/// Erase the dominator tree nodes of all blocks of @p F.
///
/// The blocks are first collected by a post-order walk starting at the
/// dominator tree node of the entry block and erased in that order
/// afterwards, so the tree is not modified while it is being traversed.
///
/// @param F The function whose blocks are removed from DT.
void GPUNodeBuilder::clearDominators(Function *F) {
  DomTreeNode *Root = DT.getNode(&F->getEntryBlock());

  std::vector<BasicBlock *> Blocks;
  auto It = po_begin(Root);
  const auto End = po_end(Root);
  while (It != End) {
    Blocks.push_back(It->getBlock());
    ++It;
  }

  for (BasicBlock *Block : Blocks)
    DT.eraseNode(Block);
}
|
|
|
|
|
|
|
|
/// Make ScalarEvolution forget every loop that contains a block of @p F.
///
/// @param F The function whose blocks are visited.
void GPUNodeBuilder::clearScalarEvolution(Function *F) {
  for (BasicBlock &Block : *F) {
    Loop *Enclosing = LI.getLoopFor(&Block);
    if (!Enclosing)
      continue;
    SE.forgetLoop(Enclosing);
  }
}
|
|
|
|
|
|
|
|
/// Remove all blocks of @p F from LoopInfo.
///
/// Before a block is removed, ScalarEvolution is told to forget the loop the
/// block belongs to (if any).
///
/// @param F The function whose blocks are removed from LI.
void GPUNodeBuilder::clearLoops(Function *F) {
  for (BasicBlock &Block : *F) {
    if (Loop *Enclosing = LI.getLoopFor(&Block))
      SE.forgetLoop(Enclosing);

    LI.removeBlock(&Block);
  }
}
|
|
|
|
|
2016-07-27 21:20:16 +08:00
|
|
|
/// Compute the first two grid dimensions of @p Kernel as i32 values.
///
/// Each of the Kernel->n_grid entries of Kernel->grid_size is turned into an
/// AST expression, code-generated, and truncated to i32. Missing dimensions
/// (up to three in total) are filled with the constant 1.
///
/// @param Kernel The kernel to compute the grid sizes for.
///
/// @returns A tuple holding the sizes of the first and second grid dimension.
std::tuple<Value *, Value *> GPUNodeBuilder::getGridSizes(ppcg_kernel *Kernel) {
  isl::ast_build AstBuild = isl::ast_build::from_context(S.getContext());
  isl::multi_pw_aff GridSizePwAffs =
      isl::manage(isl_multi_pw_aff_copy(Kernel->grid_size));

  std::vector<Value *> Sizes;
  for (long Dim = 0; Dim < Kernel->n_grid; Dim++) {
    isl::pw_aff DimSize = GridSizePwAffs.get_pw_aff(Dim);
    isl::ast_expr DimExpr = AstBuild.expr_from(DimSize);
    Value *Wide = ExprBuilder.create(DimExpr.release());
    Sizes.push_back(Builder.CreateTrunc(Wide, Builder.getInt32Ty()));
  }

  // Pad unused dimensions with a size of one.
  while (Sizes.size() < 3)
    Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

  return std::make_tuple(Sizes[0], Sizes[1]);
}
|
|
|
|
|
|
|
|
std::tuple<Value *, Value *, Value *>
|
|
|
|
GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
|
|
|
|
std::vector<Value *> Sizes;
|
|
|
|
|
|
|
|
for (long i = 0; i < Kernel->n_block; i++) {
|
|
|
|
Value *Res = ConstantInt::get(Builder.getInt32Ty(), Kernel->block_dim[i]);
|
|
|
|
Sizes.push_back(Res);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (long i = Kernel->n_block; i < 3; i++)
|
|
|
|
Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));
|
|
|
|
|
|
|
|
return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
|
|
|
|
}
|
|
|
|
|
2017-05-09 18:45:52 +08:00
|
|
|
/// Store the address @p Param, cast to i8*, into slot @p Index of
/// @p Parameters.
///
/// @param Parameters The array holding the parameter pointers; it is indexed
///                   with {0, Index}.
/// @param Param      The instruction whose value is cast and stored.
/// @param Index      The slot of @p Parameters to write.
void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters,
                                          Instruction *Param, int Index) {
  Value *SlotIndices[] = {Builder.getInt64(0), Builder.getInt64(Index)};
  Value *Slot = Builder.CreateGEP(Parameters, SlotIndices);

  Value *ParamAsI8Ptr =
      Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
  Builder.CreateStore(ParamAsI8Ptr, Slot);
}
|
|
|
|
|
2016-08-04 14:55:49 +08:00
|
|
|
/// Build the parameter array that is handed to the kernel launch.
///
/// An array of 2 * NumArgs i8* slots is allocated in the entry block, where
/// NumArgs is the number of arguments of @p F. The slots are filled, in this
/// order, with pointers to
///   1. the device arrays the kernel requires,
///   2. the values bound to the set dimensions of Kernel->space,
///   3. the values bound to the parameter dimensions of Kernel->space,
///   4. the values in @p SubtreeValues,
///   5. one i32 per kernel argument holding the size recorded for it.
///
/// @param Kernel        The kernel to create the parameters for.
/// @param F             The kernel function; provides the argument count and
///                      the alloca address space.
/// @param SubtreeValues Additional values that are passed to the kernel.
///
/// @returns The parameter array, bitcast to i8*.
Value *
GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                       SetVector<Value *> SubtreeValues) {
  const int NumArgs = F->arg_size();
  std::vector<int> ArgSizes(NumArgs);

  Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs);

  BasicBlock *EntryBlock =
      &Builder.GetInsertBlock()->getParent()->getEntryBlock();
  auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace();
  const std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
  Instruction *Parameters = new AllocaInst(
      ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator());

  int Index = 0;

  // Spill Val to a fresh alloca, record its size, and store the alloca's
  // address in the next free slot of Parameters.
  auto PassByValue = [&](Value *Val) {
    ArgSizes[Index] = computeSizeInBytes(Val->getType());

    Instruction *Param =
        new AllocaInst(Val->getType(), AddressSpace,
                       Launch + "_param_" + std::to_string(Index),
                       EntryBlock->getTerminator());
    Builder.CreateStore(Val, Param);
    insertStoreParameter(Parameters, Param, Index);
    Index++;
  };

  // 1. Device arrays.
  for (long i = 0; i < Prog->n_array; i++) {
    if (!ppcg_kernel_requires_array_argument(Kernel, i))
      continue;

    gpu_array_info *Array = &Prog->array[i];
    isl_id *Id = isl_space_get_tuple_id(Array->space, isl_dim_set);
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id));
    auto *MutableSAI = const_cast<ScopArrayInfo *>(SAI);

    ArgSizes[Index] = SAI->getElemSizeInBytes();

    Value *DevArray = nullptr;
    if (!ManagedMemory) {
      DevArray = DeviceAllocations[MutableSAI];
      DevArray = createCallGetDevicePtr(DevArray);
    } else {
      DevArray = getManagedDeviceArray(Array, MutableSAI);
    }
    assert(DevArray != nullptr && "Array to be offloaded to device not "
                                  "initialized");

    // Move the device pointer back by Offset elements.
    Value *Offset = getArrayOffset(Array);
    if (Offset) {
      DevArray = Builder.CreatePointerCast(
          DevArray, SAI->getElementType()->getPointerTo());
      DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
      DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
    }

    Value *Slot = Builder.CreateGEP(
        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});

    if (!gpu_array_is_read_only_scalar(Array)) {
      // Pass a pointer to a stack slot that holds the device pointer.
      Instruction *Param =
          new AllocaInst(Builder.getInt8PtrTy(), AddressSpace,
                         Launch + "_param_" + std::to_string(Index),
                         EntryBlock->getTerminator());
      Builder.CreateStore(DevArray, Param);
      Value *ParamTyped =
          Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
      Builder.CreateStore(ParamTyped, Slot);
    } else {
      // Read-only scalars are passed through a pointer to their value.
      Value *ValPtr = nullptr;
      if (ManagedMemory)
        ValPtr = DevArray;
      else
        ValPtr = BlockGen.getOrCreateAlloca(SAI);

      assert(ValPtr != nullptr && "ValPtr that should point to a valid object"
                                  " to be stored into Parameters");
      Value *ValPtrCast =
          Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
      Builder.CreateStore(ValPtrCast, Slot);
    }

    Index++;
  }

  // 2. Values bound to the set dimensions of the kernel space.
  const int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
  for (long i = 0; i < NumHostIters; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
    Value *Val = IDToValue[Id];
    isl_id_free(Id);

    PassByValue(Val);
  }

  // 3. Values bound to the parameter dimensions of the kernel space, using
  //    their ValueMap replacement where one exists.
  const int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
  for (long i = 0; i < NumVars; i++) {
    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
    Value *Val = IDToValue[Id];
    if (ValueMap.count(Val))
      Val = ValueMap[Val];
    isl_id_free(Id);

    PassByValue(Val);
  }

  // 4. Additional values referenced from the kernel.
  for (auto Val : SubtreeValues)
    PassByValue(Val);

  // 5. The sizes recorded for the kernel arguments.
  for (int i = 0; i < NumArgs; i++) {
    Value *SizeVal = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]);
    Instruction *SizeParam =
        new AllocaInst(Builder.getInt32Ty(), AddressSpace,
                       Launch + "_param_size_" + std::to_string(i),
                       EntryBlock->getTerminator());
    Builder.CreateStore(SizeVal, SizeParam);
    insertStoreParameter(Parameters, SizeParam, Index);
    Index++;
  }

  auto Location = EntryBlock->getTerminator();
  return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
                         Launch + "_params_i8ptr", Location);
}
|
|
|
|
|
2017-06-26 21:12:06 +08:00
|
|
|
/// Make every function in @p SubtreeFunctions available in the GPU module.
///
/// For each function, a function of the same name is looked up in GPUModule;
/// if none exists, one is created there with the same function type and
/// external linkage. The function is then mapped to its GPU-module counterpart
/// in ValueMap.
///
/// @param SubtreeFunctions The functions to register in the GPU module.
void GPUNodeBuilder::setupKernelSubtreeFunctions(
    SetVector<Function *> SubtreeFunctions) {
  for (Function *HostFn : SubtreeFunctions) {
    const std::string DeviceFnName = HostFn->getName();

    // Reuse a function of this name if the GPU module already has one.
    Function *DeviceFn = GPUModule->getFunction(DeviceFnName);
    if (DeviceFn == nullptr) {
      DeviceFn = Function::Create(HostFn->getFunctionType(),
                                  GlobalValue::ExternalLinkage, DeviceFnName,
                                  GPUModule.get());
    }
    assert(DeviceFn && "Expected cloned function to be initialized.");

    // Each function is expected to be registered at most once.
    assert(ValueMap.count(HostFn) == 0 && "Fn already present in ValueMap");
    ValueMap[HostFn] = DeviceFn;
  }
}
|
2016-07-19 15:32:38 +08:00
|
|
|
void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
|
|
|
|
isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
|
|
|
|
ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
|
|
|
|
isl_id_free(Id);
|
|
|
|
isl_ast_node_free(KernelStmt);
|
|
|
|
|
2016-09-18 16:31:09 +08:00
|
|
|
if (Kernel->n_grid > 1)
|
|
|
|
DeepestParallel =
|
|
|
|
std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set));
|
|
|
|
else
|
|
|
|
DeepestSequential =
|
|
|
|
std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set));
|
|
|
|
|
2016-08-05 14:47:43 +08:00
|
|
|
Value *BlockDimX, *BlockDimY, *BlockDimZ;
|
|
|
|
std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);
|
|
|
|
|
2017-06-26 21:12:06 +08:00
|
|
|
SetVector<Value *> SubtreeValues;
|
|
|
|
SetVector<Function *> SubtreeFunctions;
|
2017-08-06 10:39:05 +08:00
|
|
|
SetVector<const Loop *> Loops;
|
|
|
|
std::tie(SubtreeValues, SubtreeFunctions, Loops) =
|
|
|
|
getReferencesInKernel(Kernel);
|
2016-07-21 21:15:59 +08:00
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
assert(Kernel->tree && "Device AST of kernel node is empty");
|
|
|
|
|
|
|
|
Instruction &HostInsertPoint = *Builder.GetInsertPoint();
|
2016-07-19 15:32:44 +08:00
|
|
|
IslExprBuilder::IDToValueTy HostIDs = IDToValue;
|
2016-07-21 21:15:59 +08:00
|
|
|
ValueMapT HostValueMap = ValueMap;
|
[Polly] [BlockGenerator] Unify ScalarMap and PhiOpsMap
Instead of keeping two separate maps from Value to Allocas, one for
MemoryType::Value and the other for MemoryType::PHI, we introduce a single map
from ScopArrayInfo to the corresponding Alloca. This change is intended, both as
a general simplification and cleanup, but also to reduce our use of
MemoryAccess::getBaseAddr(). Moving away from using getBaseAddr() makes sure
we have only a single place where the array (and its base pointer) for which we
generate code for is specified, which means we can more easily introduce new
access functions that use a different ScopArrayInfo as base. We already today
experiment with modifiable access functions, so this change does not address
a specific bug, but it just reduces the scope one needs to reason about.
Another motivation for this patch is https://reviews.llvm.org/D28518, where
memory accesses with different base pointers could possibly be mapped to a
single ScopArrayInfo object. Such a mapping is currently not possible, as we
currently generate alloca instructions according to the base addresses of the
memory accesses, not according to the ScopArrayInfo object they belong to. By
making allocas ScopArrayInfo specific, a mapping to a single ScopArrayInfo
object will automatically mean that the same stack slot is used for these
arrays. For D28518 this is not a problem, as only MemoryType::Array objects are
mapping, but resolving this inconsistency will hopefully avoid confusion.
llvm-svn: 293374
2017-01-28 15:42:10 +08:00
|
|
|
BlockGenerator::AllocaMapTy HostScalarMap = ScalarMap;
|
2016-08-09 23:35:06 +08:00
|
|
|
ScalarMap.clear();
|
2016-07-21 21:15:59 +08:00
|
|
|
|
|
|
|
// Create for all loops we depend on values that contain the current loop
|
|
|
|
// iteration. These values are necessary to generate code for SCEVs that
|
|
|
|
// depend on such loops. As a result we need to pass them to the subfunction.
|
|
|
|
for (const Loop *L : Loops) {
|
|
|
|
const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
|
|
|
|
SE.getUnknown(Builder.getInt64(1)),
|
|
|
|
L, SCEV::FlagAnyWrap);
|
|
|
|
Value *V = generateSCEV(OuterLIV);
|
|
|
|
OutsideLoopIterations[L] = SE.getUnknown(V);
|
|
|
|
SubtreeValues.insert(V);
|
|
|
|
}
|
2016-07-19 15:32:38 +08:00
|
|
|
|
2017-06-26 21:12:06 +08:00
|
|
|
createKernelFunction(Kernel, SubtreeValues, SubtreeFunctions);
|
|
|
|
setupKernelSubtreeFunctions(SubtreeFunctions);
|
2016-07-19 15:32:38 +08:00
|
|
|
|
2016-07-19 15:33:11 +08:00
|
|
|
create(isl_ast_node_copy(Kernel->tree));
|
|
|
|
|
2016-09-18 03:22:31 +08:00
|
|
|
finalizeKernelArguments(Kernel);
|
2016-07-22 15:11:12 +08:00
|
|
|
Function *F = Builder.GetInsertBlock()->getParent();
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
if (Arch == GPUArch::NVPTX64)
|
|
|
|
addCUDAAnnotations(F->getParent(), BlockDimX, BlockDimY, BlockDimZ);
|
2016-07-22 15:11:12 +08:00
|
|
|
clearDominators(F);
|
|
|
|
clearScalarEvolution(F);
|
|
|
|
clearLoops(F);
|
|
|
|
|
2016-07-19 15:32:44 +08:00
|
|
|
IDToValue = HostIDs;
|
2016-07-19 15:32:38 +08:00
|
|
|
|
2016-08-09 23:35:06 +08:00
|
|
|
ValueMap = std::move(HostValueMap);
|
|
|
|
ScalarMap = std::move(HostScalarMap);
|
2016-07-21 21:15:59 +08:00
|
|
|
EscapeMap.clear();
|
|
|
|
IDToSAI.clear();
|
2016-07-22 15:11:12 +08:00
|
|
|
Annotator.resetAlternativeAliasBases();
|
|
|
|
for (auto &BasePtr : LocalArrays)
|
2017-01-15 04:25:44 +08:00
|
|
|
S.invalidateScopArrayInfo(BasePtr, MemoryKind::Array);
|
2016-07-22 15:11:12 +08:00
|
|
|
LocalArrays.clear();
|
2016-07-21 21:15:59 +08:00
|
|
|
|
2016-09-18 03:22:31 +08:00
|
|
|
std::string ASMString = finalizeKernelFunction();
|
|
|
|
Builder.SetInsertPoint(&HostInsertPoint);
|
2016-08-04 14:55:49 +08:00
|
|
|
Value *Parameters = createLaunchParameters(Kernel, F, SubtreeValues);
|
2016-07-27 21:20:16 +08:00
|
|
|
|
Prefix the name of the calling host function in the name of callee GPU kernel
Summary:
Provide more context to the name of a GPU kernel by prefixing its name with the host function that calls it. E.g. The first kernel called by `gemm` would be `FUNC_gemm_KERNEL_0`.
Kernels currently follow the "kernel_#" (# = 0,1,2,3,...) nomenclature. This patch makes it easier to map host caller and device callee, especially when there are many kernels produced by Polly-ACC.
Reviewers: grosser, Meinersbur, bollu, philip.pfaffe, kbarton!
Reviewed By: grosser
Subscribers: nemanjai, pollydev
Tags: #polly
Differential Revision: https://reviews.llvm.org/D33985
llvm-svn: 307173
2017-07-06 00:48:21 +08:00
|
|
|
std::string Name = getKernelFuncName(Kernel->id);
|
2016-07-26 00:31:21 +08:00
|
|
|
Value *KernelString = Builder.CreateGlobalStringPtr(ASMString, Name);
|
|
|
|
Value *NameString = Builder.CreateGlobalStringPtr(Name, Name + "_name");
|
|
|
|
Value *GPUKernel = createCallGetKernel(KernelString, NameString);
|
2016-07-27 21:20:16 +08:00
|
|
|
|
|
|
|
Value *GridDimX, *GridDimY;
|
|
|
|
std::tie(GridDimX, GridDimY) = getGridSizes(Kernel);
|
|
|
|
|
|
|
|
createCallLaunchKernel(GPUKernel, GridDimX, GridDimY, BlockDimX, BlockDimY,
|
|
|
|
BlockDimZ, Parameters);
|
2016-07-26 00:31:21 +08:00
|
|
|
createCallFreeKernel(GPUKernel);
|
2016-08-04 20:18:14 +08:00
|
|
|
|
|
|
|
for (auto Id : KernelIds)
|
|
|
|
isl_id_free(Id);
|
|
|
|
|
|
|
|
KernelIds.clear();
|
2016-07-19 15:32:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Compute the DataLayout string for the NVPTX backend.
///
/// The 32 bit and 64 bit layouts differ only in the pointer specification
/// (p:32:32:32 vs. p:64:64:64).
///
/// @param is64Bit Are we looking for a 64 bit architecture?
///
/// @returns The data layout string for the requested pointer width.
static std::string computeNVPTXDataLayout(bool is64Bit) {
  if (is64Bit)
    return "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
           "64-v128:128:128-n16:32:64";

  return "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
         "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:"
         "64-v128:128:128-n16:32:64";
}
|
|
|
|
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
/// Compute the DataLayout string for a SPIR kernel.
///
/// The 32 bit and 64 bit layouts differ only in the pointer specification
/// (p:32:32:32 vs. p:64:64:64).
///
/// @param is64Bit Are we looking for a 64 bit architecture?
///
/// @returns The data layout string for the requested pointer width.
static std::string computeSPIRDataLayout(bool is64Bit) {
  if (is64Bit)
    return "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
           "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
           "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
           "256:256-v256:256:256-v512:512:512-v1024:1024:1024";

  return "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:"
         "64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:"
         "32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:"
         "256:256-v256:256:256-v512:512:512-v1024:1024:1024";
}
|
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
Function *
|
|
|
|
GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
|
|
|
|
SetVector<Value *> &SubtreeValues) {
|
2016-07-19 15:32:38 +08:00
|
|
|
std::vector<Type *> Args;
|
Prefix the name of the calling host function in the name of callee GPU kernel
Summary:
Provide more context to the name of a GPU kernel by prefixing its name with the host function that calls it. E.g. The first kernel called by `gemm` would be `FUNC_gemm_KERNEL_0`.
Kernels currently follow the "kernel_#" (# = 0,1,2,3,...) nomenclature. This patch makes it easier to map host caller and device callee, especially when there are many kernels produced by Polly-ACC.
Reviewers: grosser, Meinersbur, bollu, philip.pfaffe, kbarton!
Reviewed By: grosser
Subscribers: nemanjai, pollydev
Tags: #polly
Differential Revision: https://reviews.llvm.org/D33985
llvm-svn: 307173
2017-07-06 00:48:21 +08:00
|
|
|
std::string Identifier = getKernelFuncName(Kernel->id);
|
2016-07-19 15:32:38 +08:00
|
|
|
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
std::vector<Metadata *> MemoryType;
|
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
for (long i = 0; i < Prog->n_array; i++) {
|
|
|
|
if (!ppcg_kernel_requires_array_argument(Kernel, i))
|
|
|
|
continue;
|
|
|
|
|
2016-09-18 03:22:18 +08:00
|
|
|
if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
|
|
|
|
isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
|
2017-07-25 00:22:27 +08:00
|
|
|
const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id));
|
2016-09-18 03:22:18 +08:00
|
|
|
Args.push_back(SAI->getElementType());
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
MemoryType.push_back(
|
|
|
|
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
|
2016-09-18 03:22:18 +08:00
|
|
|
} else {
|
[PPCGCodeGeneration] Update PPCG Code Generation for OpenCL compatibility
Added a small change to the way pointer arguments are set in the kernel
code generation. The way the pointer is retrieved now, specifically requests
global address space to be annotated. This is necessary, if the IR should be
run through NVPTX to generate OpenCL compatible PTX.
The changes do not affect the PTX Strings generated for the CUDA target
(nvptx64-nvidia-cuda), but are necessary for OpenCL (nvptx64-nvidia-nvcl).
Additionally, the data layout has been updated to what the NVPTX Backend requests/recommends.
Contributed-by: Philipp Schaad
Reviewers: Meinersbur, grosser, bollu
Reviewed By: grosser, bollu
Subscribers: jlebar, pollydev, llvm-commits, nemanjai, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32215
llvm-svn: 301299
2017-04-25 16:08:29 +08:00
|
|
|
static const int UseGlobalMemory = 1;
|
|
|
|
Args.push_back(Builder.getInt8PtrTy(UseGlobalMemory));
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
MemoryType.push_back(
|
|
|
|
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 1)));
|
2016-09-18 03:22:18 +08:00
|
|
|
}
|
2016-07-19 15:32:38 +08:00
|
|
|
}
|
|
|
|
|
2016-07-19 15:32:55 +08:00
|
|
|
int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);
|
|
|
|
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
for (long i = 0; i < NumHostIters; i++) {
|
2016-07-19 15:32:55 +08:00
|
|
|
Args.push_back(Builder.getInt64Ty());
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
MemoryType.push_back(
|
|
|
|
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
|
|
|
|
}
|
2016-07-19 15:32:55 +08:00
|
|
|
|
2016-07-19 15:33:06 +08:00
|
|
|
int NumVars = isl_space_dim(Kernel->space, isl_dim_param);
|
|
|
|
|
2016-08-09 15:22:08 +08:00
|
|
|
for (long i = 0; i < NumVars; i++) {
|
|
|
|
isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
|
|
|
|
Value *Val = IDToValue[Id];
|
|
|
|
isl_id_free(Id);
|
|
|
|
Args.push_back(Val->getType());
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
MemoryType.push_back(
|
|
|
|
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
|
2016-08-09 15:22:08 +08:00
|
|
|
}
|
2016-07-19 15:33:06 +08:00
|
|
|
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
for (auto *V : SubtreeValues) {
|
2016-07-21 21:15:59 +08:00
|
|
|
Args.push_back(V->getType());
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
MemoryType.push_back(
|
|
|
|
ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), 0)));
|
|
|
|
}
|
2016-07-21 21:15:59 +08:00
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
|
|
|
|
auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
|
|
|
|
GPUModule.get());
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
std::vector<Metadata *> EmptyStrings;
|
|
|
|
|
|
|
|
for (unsigned int i = 0; i < MemoryType.size(); i++) {
|
|
|
|
EmptyStrings.push_back(MDString::get(FN->getContext(), ""));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Arch == GPUArch::SPIR32 || Arch == GPUArch::SPIR64) {
|
|
|
|
FN->setMetadata("kernel_arg_addr_space",
|
|
|
|
MDNode::get(FN->getContext(), MemoryType));
|
|
|
|
FN->setMetadata("kernel_arg_name",
|
|
|
|
MDNode::get(FN->getContext(), EmptyStrings));
|
|
|
|
FN->setMetadata("kernel_arg_access_qual",
|
|
|
|
MDNode::get(FN->getContext(), EmptyStrings));
|
|
|
|
FN->setMetadata("kernel_arg_type",
|
|
|
|
MDNode::get(FN->getContext(), EmptyStrings));
|
|
|
|
FN->setMetadata("kernel_arg_type_qual",
|
|
|
|
MDNode::get(FN->getContext(), EmptyStrings));
|
|
|
|
FN->setMetadata("kernel_arg_base_type",
|
|
|
|
MDNode::get(FN->getContext(), EmptyStrings));
|
|
|
|
}
|
|
|
|
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
switch (Arch) {
|
|
|
|
case GPUArch::NVPTX64:
|
|
|
|
FN->setCallingConv(CallingConv::PTX_Kernel);
|
|
|
|
break;
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
case GPUArch::SPIR32:
|
|
|
|
case GPUArch::SPIR64:
|
|
|
|
FN->setCallingConv(CallingConv::SPIR_KERNEL);
|
|
|
|
break;
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
}
|
2016-07-19 15:32:38 +08:00
|
|
|
|
|
|
|
auto Arg = FN->arg_begin();
|
|
|
|
for (long i = 0; i < Kernel->n_array; i++) {
|
|
|
|
if (!ppcg_kernel_requires_array_argument(Kernel, i))
|
|
|
|
continue;
|
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
Arg->setName(Kernel->array[i].array->name);
|
|
|
|
|
|
|
|
isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
|
2017-07-25 00:22:27 +08:00
|
|
|
const ScopArrayInfo *SAI =
|
|
|
|
ScopArrayInfo::getFromId(isl::manage(isl_id_copy(Id)));
|
2016-07-21 21:15:59 +08:00
|
|
|
Type *EleTy = SAI->getElementType();
|
|
|
|
Value *Val = &*Arg;
|
|
|
|
SmallVector<const SCEV *, 4> Sizes;
|
|
|
|
isl_ast_build *Build =
|
|
|
|
isl_ast_build_from_context(isl_set_copy(Prog->context));
|
2016-09-13 01:08:31 +08:00
|
|
|
Sizes.push_back(nullptr);
|
2016-07-21 21:15:59 +08:00
|
|
|
for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
|
|
|
|
isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
|
2017-07-20 23:48:36 +08:00
|
|
|
Build, isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j));
|
2016-07-21 21:15:59 +08:00
|
|
|
auto V = ExprBuilder.create(DimSize);
|
|
|
|
Sizes.push_back(SE.getSCEV(V));
|
|
|
|
}
|
|
|
|
const ScopArrayInfo *SAIRep =
|
2017-01-15 04:25:44 +08:00
|
|
|
S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, MemoryKind::Array);
|
2016-07-22 15:11:12 +08:00
|
|
|
LocalArrays.push_back(Val);
|
2016-07-21 21:15:59 +08:00
|
|
|
|
|
|
|
isl_ast_build_free(Build);
|
2016-08-04 20:18:14 +08:00
|
|
|
KernelIds.push_back(Id);
|
2016-07-21 21:15:59 +08:00
|
|
|
IDToSAI[Id] = SAIRep;
|
2016-07-19 15:32:38 +08:00
|
|
|
Arg++;
|
|
|
|
}
|
|
|
|
|
2016-07-19 15:32:55 +08:00
|
|
|
for (long i = 0; i < NumHostIters; i++) {
|
|
|
|
isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
|
|
|
|
Arg->setName(isl_id_get_name(Id));
|
|
|
|
IDToValue[Id] = &*Arg;
|
|
|
|
KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
|
|
|
|
Arg++;
|
|
|
|
}
|
|
|
|
|
2016-07-19 15:33:06 +08:00
|
|
|
for (long i = 0; i < NumVars; i++) {
|
|
|
|
isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
|
|
|
|
Arg->setName(isl_id_get_name(Id));
|
2016-08-09 03:22:19 +08:00
|
|
|
Value *Val = IDToValue[Id];
|
|
|
|
ValueMap[Val] = &*Arg;
|
2016-07-19 15:33:06 +08:00
|
|
|
IDToValue[Id] = &*Arg;
|
|
|
|
KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
|
|
|
|
Arg++;
|
|
|
|
}
|
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
for (auto *V : SubtreeValues) {
|
|
|
|
Arg->setName(V->getName());
|
|
|
|
ValueMap[V] = &*Arg;
|
|
|
|
Arg++;
|
|
|
|
}
|
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
return FN;
|
|
|
|
}
|
|
|
|
|
2016-07-19 15:32:44 +08:00
|
|
|
/// Bind the block and thread ids of @p Kernel to NVVM intrinsic calls.
///
/// For each of the Kernel->n_grid block ids and Kernel->n_block thread ids a
/// call to the matching ctaid/tid special-register intrinsic is emitted at the
/// current insert point, cast (unsigned) to i64 and recorded in IDToValue
/// under the corresponding isl id.
///
/// Must only be used for the NVPTX64 architecture; reaching it for a SPIR
/// architecture is treated as unreachable.
///
/// @param Kernel The kernel whose block_ids and thread_ids are materialized.
void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
  // Keep the switch exhaustive (no default) so a newly added architecture
  // is flagged by the compiler instead of silently using NVVM intrinsics.
  switch (Arch) {
  case GPUArch::SPIR32:
  case GPUArch::SPIR64:
    llvm_unreachable("Cannot generate NVVM intrinsics for SPIR");
  case GPUArch::NVPTX64:
    break;
  }

  // Only x/y are available for block ids, x/y/z for thread ids.
  const Intrinsic::ID BlockIdIntrinsics[2] = {
      Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
      Intrinsic::nvvm_read_ptx_sreg_ctaid_y};
  const Intrinsic::ID ThreadIdIntrinsics[3] = {
      Intrinsic::nvvm_read_ptx_sreg_tid_x, Intrinsic::nvvm_read_ptx_sreg_tid_y,
      Intrinsic::nvvm_read_ptx_sreg_tid_z};

  // Emits the intrinsic call for one id and takes ownership of that id by
  // moving it into KernelIDs.
  auto bindIdToIntrinsic = [this](__isl_take isl_id *Id,
                                  Intrinsic::ID IntrID) {
    const std::string ValueName = isl_id_get_name(Id);
    Module *KernelModule = Builder.GetInsertBlock()->getParent()->getParent();
    Function *Callee = Intrinsic::getDeclaration(KernelModule, IntrID);
    Value *RawId = Builder.CreateCall(Callee, {});
    IDToValue[Id] = Builder.CreateIntCast(RawId, Builder.getInt64Ty(),
                                          /*isSigned=*/false, ValueName);
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int Dim = 0; Dim < Kernel->n_grid; ++Dim)
    bindIdToIntrinsic(isl_id_list_get_id(Kernel->block_ids, Dim),
                      BlockIdIntrinsics[Dim]);

  for (int Dim = 0; Dim < Kernel->n_block; ++Dim)
    bindIdToIntrinsic(isl_id_list_get_id(Kernel->thread_ids, Dim),
                      ThreadIdIntrinsics[Dim]);
}
|
|
|
|
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
/// Bind the block and thread ids of @p Kernel to SPIR id-query calls.
///
/// For each block id a call to `__gen_ocl_get_group_id<N>` and for each
/// thread id a call to `__gen_ocl_get_local_id<N>` is emitted at the current
/// insert point. The i32 result is cast (unsigned) to i64 and recorded in
/// IDToValue under the corresponding isl id.
///
/// @param Kernel The kernel whose block_ids and thread_ids are materialized.
void GPUNodeBuilder::insertKernelCallsSPIR(ppcg_kernel *Kernel) {
  const char *GroupIdFns[3] = {"__gen_ocl_get_group_id0",
                               "__gen_ocl_get_group_id1",
                               "__gen_ocl_get_group_id2"};

  const char *LocalIdFns[3] = {"__gen_ocl_get_local_id0",
                               "__gen_ocl_get_local_id1",
                               "__gen_ocl_get_local_id2"};

  // Emits the call for one id and takes ownership of that id by moving it
  // into KernelIDs.
  auto bindIdToCall = [this](const char *FnName, __isl_take isl_id *Id) {
    Module *KernelModule = Builder.GetInsertBlock()->getParent()->getParent();
    Function *IdFn = KernelModule->getFunction(FnName);

    // Declare the query function on first use: `i32 FnName()`.
    if (!IdFn) {
      FunctionType *IdFnTy =
          FunctionType::get(Builder.getInt32Ty(), /*isVarArg=*/false);
      IdFn = Function::Create(IdFnTy, Function::ExternalLinkage, FnName,
                              KernelModule);
      IdFn->setCallingConv(CallingConv::SPIR_FUNC);
    }

    // NOTE(review): only the declaration is marked SPIR_FUNC; the call
    // instruction keeps the default calling convention — confirm this is
    // intended.
    Value *RawId = Builder.CreateCall(IdFn, {});
    IDToValue[Id] = Builder.CreateIntCast(RawId, Builder.getInt64Ty(),
                                          /*isSigned=*/false, FnName);
    KernelIDs.insert(std::unique_ptr<isl_id, IslIdDeleter>(Id));
  };

  for (int Dim = 0; Dim < Kernel->n_grid; ++Dim)
    bindIdToCall(GroupIdFns[Dim], isl_id_list_get_id(Kernel->block_ids, Dim));

  for (int Dim = 0; Dim < Kernel->n_block; ++Dim)
    bindIdToCall(LocalIdFns[Dim], isl_id_list_get_id(Kernel->thread_ids, Dim));
}
|
|
|
|
|
2016-08-04 14:55:59 +08:00
|
|
|
/// Copy scalar kernel arguments into their allocas at kernel entry.
///
/// Walks the arrays of @p Kernel that require a kernel argument, in step
/// with the arguments of @p FN. Arrays with at least one dimension are
/// skipped. For zero-dimensional arrays the value is stored into the alloca
/// obtained from BlockGen: read-only scalars are stored as passed, all other
/// scalars are first loaded through the argument cast to a pointer to the
/// element type.
///
/// @param Kernel The kernel whose array arguments are inspected.
/// @param FN     The kernel function providing the arguments.
void GPUNodeBuilder::prepareKernelArguments(ppcg_kernel *Kernel, Function *FN) {
  auto NextArg = FN->arg_begin();

  for (long ArrayIdx = 0; ArrayIdx < Kernel->n_array; ++ArrayIdx) {
    if (!ppcg_kernel_requires_array_argument(Kernel, ArrayIdx))
      continue;

    // Every array that requires an argument consumes exactly one of them.
    auto CurArg = NextArg++;

    // The managed temporary releases the tuple id after the lookup.
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(
        isl_space_get_tuple_id(Prog->array[ArrayIdx].space, isl_dim_set)));

    if (SAI->getNumberOfDimensions() > 0)
      continue;

    Value *ScalarVal = &*CurArg;

    if (!gpu_array_is_read_only_scalar(&Prog->array[ArrayIdx])) {
      Type *ElemPtrTy = SAI->getElementType()->getPointerTo();
      Value *ElemPtr = Builder.CreatePointerCast(ScalarVal, ElemPtrTy);
      ScalarVal = Builder.CreateLoad(ElemPtr);
    }

    Builder.CreateStore(ScalarVal, BlockGen.getOrCreateAlloca(SAI));
  }
}
|
|
|
|
|
2016-09-18 03:22:31 +08:00
|
|
|
/// Write scalars that are not read-only back through their kernel arguments.
///
/// Walks the arrays of @p Kernel that require a kernel argument, in step
/// with the arguments of the function currently being built. For each
/// zero-dimensional array that is not a read-only scalar, the value held in
/// its alloca is stored through the argument cast to a pointer to the element
/// type.
///
/// If any such store was emitted and the kernel has a non-zero n_block or
/// n_grid, BuildSuccessful is cleared.
///
/// @param Kernel The kernel whose array arguments are inspected.
void GPUNodeBuilder::finalizeKernelArguments(ppcg_kernel *Kernel) {
  Function *KernelFn = Builder.GetInsertBlock()->getParent();
  auto NextArg = KernelFn->arg_begin();
  bool WroteBackScalar = false;

  for (long ArrayIdx = 0; ArrayIdx < Kernel->n_array; ++ArrayIdx) {
    if (!ppcg_kernel_requires_array_argument(Kernel, ArrayIdx))
      continue;

    // Every array that requires an argument consumes exactly one of them.
    auto CurArg = NextArg++;

    // The managed temporary releases the tuple id after the lookup.
    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(
        isl_space_get_tuple_id(Prog->array[ArrayIdx].space, isl_dim_set)));

    // Nothing to write back for real arrays and for read-only scalars.
    if (SAI->getNumberOfDimensions() > 0 ||
        gpu_array_is_read_only_scalar(&Prog->array[ArrayIdx]))
      continue;

    Value *ScalarSlot = BlockGen.getOrCreateAlloca(SAI);
    Type *ElemPtrTy = SAI->getElementType()->getPointerTo();
    Value *ElemPtr = Builder.CreatePointerCast(&*CurArg, ElemPtrTy);
    Builder.CreateStore(Builder.CreateLoad(ScalarSlot), ElemPtr);
    WroteBackScalar = true;
  }

  if (!WroteBackScalar)
    return;

  // Storing scalars only once at the end of the kernel may be incorrect when
  // more than one thread contains scalar stores. Supporting that would
  // require storing the scalars back at each memory store, or at least
  // before each kernel barrier. Until then, give up on such kernels.
  if (Kernel->n_block == 0 && Kernel->n_grid == 0)
    return;

  BuildSuccessful = 0;
  DEBUG(dbgs() << getUniqueScopName(&S)
               << " has a store to a scalar value that"
                  " would be undefined to run in parallel. Bailing out.\n";);
}
|
|
|
|
|
2016-08-04 20:18:14 +08:00
|
|
|
/// Create the storage for the kernel-local variables of @p Kernel.
///
/// For each entry of Kernel->var an array type is built from the element
/// type of the underlying array and the constant sizes in Var.size. Shared
/// variables become zero-initialized internal globals in address space 3 of
/// the current module; private variables become allocas named
/// "private_array". Each allocation is registered as a ScopArrayInfo and
/// recorded in IDToValue, LocalArrays, KernelIds and IDToSAI under a fresh
/// isl id named after the variable.
///
/// @param Kernel The kernel whose variables are created.
/// @param FN     The kernel function (not referenced by this implementation).
void GPUNodeBuilder::createKernelVariables(ppcg_kernel *Kernel, Function *FN) {
  Module *KernelModule = Builder.GetInsertBlock()->getParent()->getParent();

  for (int VarIdx = 0; VarIdx < Kernel->n_var; ++VarIdx) {
    struct ppcg_kernel_var &Var = Kernel->var[VarIdx];

    // isl::manage takes ownership of the tuple id and releases it after the
    // lookup.
    Type *EleTy =
        ScopArrayInfo::getFromId(
            isl::manage(isl_space_get_tuple_id(Var.array->space, isl_dim_set)))
            ->getElementType();

    // Reads the constant extent of dimension Dim from Var.size.
    auto extentOf = [&Var](int Dim) -> long {
      isl_val *Extent = isl_vec_get_element_val(Var.size, Dim);
      long Bound = isl_val_get_num_si(Extent);
      isl_val_free(Extent);
      return Bound;
    };

    // The outermost dimension carries no size; the remaining ones are
    // constants.
    SmallVector<const SCEV *, 4> Sizes;
    Sizes.push_back(nullptr);
    for (unsigned int Dim = 1; Dim < Var.array->n_index; ++Dim)
      Sizes.push_back(
          S.getSE()->getConstant(Builder.getInt64Ty(), extentOf(Dim)));

    // Wrap the element type from the innermost dimension outwards.
    Type *ArrayTy = EleTy;
    for (int Dim = Var.array->n_index - 1; Dim >= 0; --Dim)
      ArrayTy = ArrayType::get(ArrayTy, extentOf(Dim));

    Value *Allocation;
    switch (Var.type) {
    case ppcg_access_shared: {
      auto *SharedVar = new GlobalVariable(
          *KernelModule, ArrayTy, false, GlobalValue::InternalLinkage, nullptr,
          Var.name, nullptr, GlobalValue::ThreadLocalMode::NotThreadLocal, 3);
      SharedVar->setAlignment(EleTy->getPrimitiveSizeInBits() / 8);
      SharedVar->setInitializer(Constant::getNullValue(ArrayTy));
      Allocation = SharedVar;
      break;
    }
    case ppcg_access_private:
      Allocation = Builder.CreateAlloca(ArrayTy, nullptr, "private_array");
      break;
    default:
      llvm_unreachable("unknown variable type");
    }

    const ScopArrayInfo *SAI =
        S.getOrCreateScopArrayInfo(Allocation, EleTy, Sizes, MemoryKind::Array);
    isl_id *VarId = isl_id_alloc(S.getIslCtx(), Var.name, nullptr);
    IDToValue[VarId] = Allocation;
    LocalArrays.push_back(Allocation);
    KernelIds.push_back(VarId);
    IDToSAI[VarId] = SAI;
  }
}
|
|
|
|
|
2017-06-26 21:12:06 +08:00
|
|
|
void GPUNodeBuilder::createKernelFunction(
|
|
|
|
ppcg_kernel *Kernel, SetVector<Value *> &SubtreeValues,
|
|
|
|
SetVector<Function *> &SubtreeFunctions) {
|
Prefix the name of the calling host function in the name of callee GPU kernel
Summary:
Provide more context to the name of a GPU kernel by prefixing its name with the host function that calls it. E.g. The first kernel called by `gemm` would be `FUNC_gemm_KERNEL_0`.
Kernels currently follow the "kernel_#" (# = 0,1,2,3,...) nomenclature. This patch makes it easier to map host caller and device callee, especially when there are many kernels produced by Polly-ACC.
Reviewers: grosser, Meinersbur, bollu, philip.pfaffe, kbarton!
Reviewed By: grosser
Subscribers: nemanjai, pollydev
Tags: #polly
Differential Revision: https://reviews.llvm.org/D33985
llvm-svn: 307173
2017-07-06 00:48:21 +08:00
|
|
|
std::string Identifier = getKernelFuncName(Kernel->id);
|
2016-07-19 15:32:38 +08:00
|
|
|
GPUModule.reset(new Module(Identifier, Builder.getContext()));
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
|
|
|
|
switch (Arch) {
|
|
|
|
case GPUArch::NVPTX64:
|
|
|
|
if (Runtime == GPURuntime::CUDA)
|
|
|
|
GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
|
|
|
|
else if (Runtime == GPURuntime::OpenCL)
|
|
|
|
GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-nvcl"));
|
|
|
|
GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
|
|
|
|
break;
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
case GPUArch::SPIR32:
|
|
|
|
GPUModule->setTargetTriple(Triple::normalize("spir-unknown-unknown"));
|
|
|
|
GPUModule->setDataLayout(computeSPIRDataLayout(false /* is64Bit */));
|
|
|
|
break;
|
|
|
|
case GPUArch::SPIR64:
|
|
|
|
GPUModule->setTargetTriple(Triple::normalize("spir64-unknown-unknown"));
|
|
|
|
GPUModule->setDataLayout(computeSPIRDataLayout(true /* is64Bit */));
|
|
|
|
break;
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
}
|
2016-07-19 15:32:38 +08:00
|
|
|
|
2016-07-21 21:15:59 +08:00
|
|
|
Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
|
2016-07-19 15:32:38 +08:00
|
|
|
|
2016-07-19 15:33:11 +08:00
|
|
|
BasicBlock *PrevBlock = Builder.GetInsertBlock();
|
2016-07-19 15:32:38 +08:00
|
|
|
auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);
|
|
|
|
|
2016-07-19 15:33:11 +08:00
|
|
|
DT.addNewBlock(EntryBlock, PrevBlock);
|
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
Builder.SetInsertPoint(EntryBlock);
|
|
|
|
Builder.CreateRetVoid();
|
|
|
|
Builder.SetInsertPoint(EntryBlock, EntryBlock->begin());
|
2016-07-19 15:32:44 +08:00
|
|
|
|
2016-08-03 20:00:07 +08:00
|
|
|
ScopDetection::markFunctionAsInvalid(FN);
|
|
|
|
|
2016-08-04 14:55:59 +08:00
|
|
|
prepareKernelArguments(Kernel, FN);
|
2016-08-04 20:18:14 +08:00
|
|
|
createKernelVariables(Kernel, FN);
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
|
|
|
|
switch (Arch) {
|
|
|
|
case GPUArch::NVPTX64:
|
|
|
|
insertKernelIntrinsics(Kernel);
|
|
|
|
break;
|
|
|
|
case GPUArch::SPIR32:
|
|
|
|
case GPUArch::SPIR64:
|
|
|
|
insertKernelCallsSPIR(Kernel);
|
|
|
|
break;
|
|
|
|
}
|
2016-07-19 15:32:38 +08:00
|
|
|
}
|
|
|
|
|
2016-07-22 15:11:12 +08:00
|
|
|
std::string GPUNodeBuilder::createKernelASM() {
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
llvm::Triple GPUTriple;
|
|
|
|
|
|
|
|
switch (Arch) {
|
|
|
|
case GPUArch::NVPTX64:
|
|
|
|
switch (Runtime) {
|
|
|
|
case GPURuntime::CUDA:
|
|
|
|
GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-cuda"));
|
|
|
|
break;
|
|
|
|
case GPURuntime::OpenCL:
|
|
|
|
GPUTriple = llvm::Triple(Triple::normalize("nvptx64-nvidia-nvcl"));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
case GPUArch::SPIR64:
|
|
|
|
case GPUArch::SPIR32:
|
|
|
|
std::string SPIRAssembly;
|
|
|
|
raw_string_ostream IROstream(SPIRAssembly);
|
|
|
|
IROstream << *GPUModule;
|
|
|
|
IROstream.flush();
|
|
|
|
return SPIRAssembly;
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
}
|
|
|
|
|
2016-07-22 15:11:12 +08:00
|
|
|
std::string ErrMsg;
|
|
|
|
auto GPUTarget = TargetRegistry::lookupTarget(GPUTriple.getTriple(), ErrMsg);
|
|
|
|
|
|
|
|
if (!GPUTarget) {
|
|
|
|
errs() << ErrMsg << "\n";
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
|
|
|
|
TargetOptions Options;
|
|
|
|
Options.UnsafeFPMath = FastMath;
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
|
|
|
|
std::string subtarget;
|
|
|
|
|
|
|
|
switch (Arch) {
|
|
|
|
case GPUArch::NVPTX64:
|
|
|
|
subtarget = CudaVersion;
|
|
|
|
break;
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
case GPUArch::SPIR32:
|
|
|
|
case GPUArch::SPIR64:
|
|
|
|
llvm_unreachable("No subtarget for SPIR architecture");
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<TargetMachine> TargetM(GPUTarget->createTargetMachine(
|
|
|
|
GPUTriple.getTriple(), subtarget, "", Options, Optional<Reloc::Model>()));
|
2016-07-22 15:11:12 +08:00
|
|
|
|
|
|
|
SmallString<0> ASMString;
|
|
|
|
raw_svector_ostream ASMStream(ASMString);
|
|
|
|
llvm::legacy::PassManager PM;
|
|
|
|
|
|
|
|
PM.add(createTargetTransformInfoWrapperPass(TargetM->getTargetIRAnalysis()));
|
|
|
|
|
|
|
|
if (TargetM->addPassesToEmitFile(
|
|
|
|
PM, ASMStream, TargetMachine::CGFT_AssemblyFile, true /* verify */)) {
|
|
|
|
errs() << "The target does not support generation of this file type!\n";
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
|
|
|
|
PM.run(*GPUModule);
|
|
|
|
|
|
|
|
return ASMStream.str();
|
|
|
|
}
|
|
|
|
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
bool GPUNodeBuilder::requiresCUDALibDevice() {
|
2017-08-06 11:04:15 +08:00
|
|
|
bool RequiresLibDevice = false;
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
for (Function &F : GPUModule->functions()) {
|
|
|
|
if (!F.isDeclaration())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
std::string CUDALibDeviceFunc = getCUDALibDeviceFuntion(&F);
|
|
|
|
if (CUDALibDeviceFunc.length() != 0) {
|
|
|
|
F.setName(CUDALibDeviceFunc);
|
2017-08-06 11:04:15 +08:00
|
|
|
RequiresLibDevice = true;
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-08-06 11:04:15 +08:00
|
|
|
return RequiresLibDevice;
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void GPUNodeBuilder::addCUDALibDevice() {
|
|
|
|
if (Arch != GPUArch::NVPTX64)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (requiresCUDALibDevice()) {
|
|
|
|
SMDiagnostic Error;
|
|
|
|
|
|
|
|
errs() << CUDALibDevice << "\n";
|
|
|
|
auto LibDeviceModule =
|
|
|
|
parseIRFile(CUDALibDevice, Error, GPUModule->getContext());
|
|
|
|
|
|
|
|
if (!LibDeviceModule) {
|
|
|
|
BuildSuccessful = false;
|
|
|
|
report_fatal_error("Could not find or load libdevice. Skipping GPU "
|
|
|
|
"kernel generation. Please set -polly-acc-libdevice "
|
|
|
|
"accordingly.\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
Linker L(*GPUModule);
|
|
|
|
|
|
|
|
// Set an nvptx64 target triple to avoid linker warnings. The original
|
|
|
|
// triple of the libdevice files are nvptx-unknown-unknown.
|
|
|
|
LibDeviceModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
|
|
|
|
L.linkInModule(std::move(LibDeviceModule), Linker::LinkOnlyNeeded);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-26 00:31:21 +08:00
|
|
|
std::string GPUNodeBuilder::finalizeKernelFunction() {
|
2017-06-26 22:56:56 +08:00
|
|
|
|
2016-09-12 14:06:31 +08:00
|
|
|
if (verifyModule(*GPUModule)) {
|
2017-06-26 22:56:56 +08:00
|
|
|
DEBUG(dbgs() << "verifyModule failed on module:\n";
|
|
|
|
GPUModule->print(dbgs(), nullptr); dbgs() << "\n";);
|
2017-07-21 19:21:44 +08:00
|
|
|
DEBUG(dbgs() << "verifyModule Error:\n";
|
|
|
|
verifyModule(*GPUModule, &dbgs()););
|
2017-06-26 22:56:56 +08:00
|
|
|
|
|
|
|
if (FailOnVerifyModuleFailure)
|
|
|
|
llvm_unreachable("VerifyModule failed.");
|
|
|
|
|
2016-09-12 14:06:31 +08:00
|
|
|
BuildSuccessful = false;
|
|
|
|
return "";
|
|
|
|
}
|
2016-07-19 15:32:38 +08:00
|
|
|
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
addCUDALibDevice();
|
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
if (DumpKernelIR)
|
|
|
|
outs() << *GPUModule << "\n";
|
|
|
|
|
[Polly][GPGPU] Added SPIR Code Generation and Corresponding Runtime Support for Intel
Summary:
Added SPIR Code Generation to the PPCG Code Generator. This can be invoked using
the polly-gpu-arch flag value 'spir32' or 'spir64' for 32 and 64 bit code respectively.
In addition to that, runtime support has been added to execute said SPIR code on Intel
GPU's, where the system is equipped with Intel's open source driver Beignet (development
version). This requires the cmake flag 'USE_INTEL_OCL' to be turned on, and the polly-gpu-runtime
flag value to be 'libopencl'.
The transformation of LLVM IR to SPIR is currently quite a hack, consisting in part of regex
string transformations.
Has been tested (working) with Polybench 3.2 on an Intel i7-5500U (integrated graphics chip).
Reviewers: bollu, grosser, Meinersbur, singam-sanjay
Reviewed By: grosser, singam-sanjay
Subscribers: pollydev, nemanjai, mgorny, Anastasia, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35185
llvm-svn: 308751
2017-07-22 00:11:06 +08:00
|
|
|
if (Arch != GPUArch::SPIR32 && Arch != GPUArch::SPIR64) {
|
|
|
|
// Optimize module.
|
|
|
|
llvm::legacy::PassManager OptPasses;
|
|
|
|
PassManagerBuilder PassBuilder;
|
|
|
|
PassBuilder.OptLevel = 3;
|
|
|
|
PassBuilder.SizeLevel = 0;
|
|
|
|
PassBuilder.populateModulePassManager(OptPasses);
|
|
|
|
OptPasses.run(*GPUModule);
|
|
|
|
}
|
2016-07-24 14:43:21 +08:00
|
|
|
|
2016-07-22 15:11:12 +08:00
|
|
|
std::string Assembly = createKernelASM();
|
|
|
|
|
|
|
|
if (DumpKernelASM)
|
|
|
|
outs() << Assembly << "\n";
|
|
|
|
|
2016-07-19 15:32:38 +08:00
|
|
|
GPUModule.release();
|
2016-07-19 15:32:44 +08:00
|
|
|
KernelIDs.clear();
|
2016-07-26 00:31:21 +08:00
|
|
|
|
|
|
|
return Assembly;
|
2016-07-19 15:32:38 +08:00
|
|
|
}
|
2017-08-03 20:09:33 +08:00
|
|
|
/// Construct an `isl_pw_aff_list` from a vector of `isl_pw_aff`
|
|
|
|
/// @param PwAffs The list of piecewise affine functions to create an
|
|
|
|
/// `isl_pw_aff_list` from. We expect an rvalue ref because
|
|
|
|
/// all the isl_pw_aff are used up by this function.
|
|
|
|
///
|
|
|
|
/// @returns The `isl_pw_aff_list`.
|
|
|
|
__isl_give isl_pw_aff_list *
|
|
|
|
createPwAffList(isl_ctx *Context,
|
|
|
|
const std::vector<__isl_take isl_pw_aff *> &&PwAffs) {
|
|
|
|
isl_pw_aff_list *List = isl_pw_aff_list_alloc(Context, PwAffs.size());
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < PwAffs.size(); i++) {
|
|
|
|
List = isl_pw_aff_list_insert(List, i, PwAffs[i]);
|
|
|
|
}
|
|
|
|
return List;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Align all the `PwAffs` such that they have the same parameter dimensions.
|
|
|
|
///
|
|
|
|
/// We loop over all `pw_aff` and align all of their spaces together to
|
|
|
|
/// create a common space for all the `pw_aff`. This common space is the
|
|
|
|
/// `AlignSpace`. We then align all the `pw_aff` to this space. We start
|
|
|
|
/// with the given `SeedSpace`.
|
|
|
|
/// @param PwAffs The list of piecewise affine functions we want to align.
|
|
|
|
/// This is an rvalue reference because the entire vector is
|
|
|
|
/// used up by the end of the operation.
|
|
|
|
/// @param SeedSpace The space to start the alignment process with.
|
|
|
|
/// @returns A std::pair, whose first element is the aligned space,
|
|
|
|
/// whose second element is the vector of aligned piecewise
|
|
|
|
/// affines.
|
|
|
|
static std::pair<__isl_give isl_space *, std::vector<__isl_give isl_pw_aff *>>
|
|
|
|
alignPwAffs(const std::vector<__isl_take isl_pw_aff *> &&PwAffs,
|
|
|
|
__isl_take isl_space *SeedSpace) {
|
|
|
|
assert(SeedSpace && "Invalid seed space given.");
|
|
|
|
|
|
|
|
isl_space *AlignSpace = SeedSpace;
|
|
|
|
for (isl_pw_aff *PwAff : PwAffs) {
|
|
|
|
isl_space *PwAffSpace = isl_pw_aff_get_domain_space(PwAff);
|
|
|
|
AlignSpace = isl_space_align_params(AlignSpace, PwAffSpace);
|
|
|
|
}
|
|
|
|
std::vector<isl_pw_aff *> AdjustedPwAffs;
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < PwAffs.size(); i++) {
|
|
|
|
isl_pw_aff *Adjusted = PwAffs[i];
|
|
|
|
assert(Adjusted && "Invalid pw_aff given.");
|
|
|
|
Adjusted = isl_pw_aff_align_params(Adjusted, isl_space_copy(AlignSpace));
|
|
|
|
AdjustedPwAffs.push_back(Adjusted);
|
|
|
|
}
|
|
|
|
return std::make_pair(AlignSpace, AdjustedPwAffs);
|
|
|
|
}
|
2016-07-19 15:32:38 +08:00
|
|
|
|
2016-07-13 23:54:58 +08:00
|
|
|
namespace {
|
|
|
|
class PPCGCodeGeneration : public ScopPass {
|
|
|
|
public:
|
|
|
|
static char ID;
|
|
|
|
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
GPURuntime Runtime = GPURuntime::CUDA;
|
|
|
|
|
|
|
|
GPUArch Architecture = GPUArch::NVPTX64;
|
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
/// The scop that is currently processed.
|
|
|
|
Scop *S;
|
|
|
|
|
2016-07-18 19:56:39 +08:00
|
|
|
LoopInfo *LI;
|
|
|
|
DominatorTree *DT;
|
|
|
|
ScalarEvolution *SE;
|
|
|
|
const DataLayout *DL;
|
|
|
|
RegionInfo *RI;
|
|
|
|
|
2016-07-13 23:54:58 +08:00
|
|
|
PPCGCodeGeneration() : ScopPass(ID) {}
|
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
/// Construct compilation options for PPCG.
|
|
|
|
///
|
|
|
|
/// @returns The compilation options.
|
|
|
|
ppcg_options *createPPCGOptions() {
|
|
|
|
auto DebugOptions =
|
|
|
|
(ppcg_debug_options *)malloc(sizeof(ppcg_debug_options));
|
|
|
|
auto Options = (ppcg_options *)malloc(sizeof(ppcg_options));
|
|
|
|
|
|
|
|
DebugOptions->dump_schedule_constraints = false;
|
|
|
|
DebugOptions->dump_schedule = false;
|
|
|
|
DebugOptions->dump_final_schedule = false;
|
|
|
|
DebugOptions->dump_sizes = false;
|
2016-08-04 20:44:03 +08:00
|
|
|
DebugOptions->verbose = false;
|
2016-07-14 18:22:19 +08:00
|
|
|
|
|
|
|
Options->debug = DebugOptions;
|
|
|
|
|
2017-07-20 23:48:36 +08:00
|
|
|
Options->group_chains = false;
|
2016-07-14 18:22:19 +08:00
|
|
|
Options->reschedule = true;
|
|
|
|
Options->scale_tile_loops = false;
|
|
|
|
Options->wrap = false;
|
|
|
|
|
|
|
|
Options->non_negative_parameters = false;
|
|
|
|
Options->ctx = nullptr;
|
|
|
|
Options->sizes = nullptr;
|
|
|
|
|
2017-07-20 23:48:36 +08:00
|
|
|
Options->tile = true;
|
2016-07-14 22:14:02 +08:00
|
|
|
Options->tile_size = 32;
|
|
|
|
|
2017-07-20 23:48:36 +08:00
|
|
|
Options->isolate_full_tiles = false;
|
|
|
|
|
2016-08-04 20:39:03 +08:00
|
|
|
Options->use_private_memory = PrivateMemory;
|
2016-08-04 20:18:14 +08:00
|
|
|
Options->use_shared_memory = SharedMemory;
|
|
|
|
Options->max_shared_memory = 48 * 1024;
|
2016-07-14 18:22:19 +08:00
|
|
|
|
|
|
|
Options->target = PPCG_TARGET_CUDA;
|
|
|
|
Options->openmp = false;
|
|
|
|
Options->linearize_device_arrays = true;
|
2017-07-20 23:48:36 +08:00
|
|
|
Options->allow_gnu_extensions = false;
|
|
|
|
|
|
|
|
Options->unroll_copy_shared = false;
|
|
|
|
Options->unroll_gpu_tile = false;
|
|
|
|
Options->live_range_reordering = true;
|
2016-07-14 18:22:19 +08:00
|
|
|
|
2017-07-20 23:48:36 +08:00
|
|
|
Options->live_range_reordering = true;
|
|
|
|
Options->hybrid = false;
|
2016-07-14 18:22:19 +08:00
|
|
|
Options->opencl_compiler_options = nullptr;
|
|
|
|
Options->opencl_use_gpu = false;
|
|
|
|
Options->opencl_n_include_file = 0;
|
|
|
|
Options->opencl_include_files = nullptr;
|
|
|
|
Options->opencl_print_kernel_types = false;
|
|
|
|
Options->opencl_embed_kernel_code = false;
|
|
|
|
|
|
|
|
Options->save_schedule_file = nullptr;
|
|
|
|
Options->load_schedule_file = nullptr;
|
|
|
|
|
|
|
|
return Options;
|
|
|
|
}
|
|
|
|
|
2016-07-14 18:22:25 +08:00
|
|
|
/// Get a tagged access relation containing all accesses of type @p AccessTy.
|
|
|
|
///
|
|
|
|
/// Instead of a normal access of the form:
|
|
|
|
///
|
|
|
|
/// Stmt[i,j,k] -> Array[f_0(i,j,k), f_1(i,j,k)]
|
|
|
|
///
|
|
|
|
/// a tagged access has the form
|
|
|
|
///
|
|
|
|
/// [Stmt[i,j,k] -> id[]] -> Array[f_0(i,j,k), f_1(i,j,k)]
|
|
|
|
///
|
|
|
|
/// where 'id' is an additional space that references the memory access that
|
|
|
|
/// triggered the access.
|
|
|
|
///
|
|
|
|
/// @param AccessTy The type of the memory accesses to collect.
|
|
|
|
///
|
|
|
|
/// @return The relation describing all tagged memory accesses.
|
|
|
|
isl_union_map *getTaggedAccesses(enum MemoryAccess::AccessType AccessTy) {
|
2017-08-07 04:11:59 +08:00
|
|
|
isl_union_map *Accesses = isl_union_map_empty(S->getParamSpace().release());
|
2016-07-14 18:22:25 +08:00
|
|
|
|
|
|
|
for (auto &Stmt : *S)
|
|
|
|
for (auto &Acc : Stmt)
|
|
|
|
if (Acc->getType() == AccessTy) {
|
2017-07-23 12:08:38 +08:00
|
|
|
isl_map *Relation = Acc->getAccessRelation().release();
|
2017-08-07 00:39:52 +08:00
|
|
|
Relation =
|
|
|
|
isl_map_intersect_domain(Relation, Stmt.getDomain().release());
|
2016-07-14 18:22:25 +08:00
|
|
|
|
|
|
|
isl_space *Space = isl_map_get_space(Relation);
|
|
|
|
Space = isl_space_range(Space);
|
|
|
|
Space = isl_space_from_range(Space);
|
2017-07-23 12:08:11 +08:00
|
|
|
Space =
|
|
|
|
isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release());
|
2016-07-14 18:22:25 +08:00
|
|
|
isl_map *Universe = isl_map_universe(Space);
|
|
|
|
Relation = isl_map_domain_product(Relation, Universe);
|
|
|
|
Accesses = isl_union_map_add_map(Accesses, Relation);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Accesses;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Get the set of all read accesses, tagged with the access id.
|
|
|
|
///
|
|
|
|
/// @see getTaggedAccesses
|
|
|
|
isl_union_map *getTaggedReads() {
|
|
|
|
return getTaggedAccesses(MemoryAccess::READ);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Get the set of all may (and must) accesses, tagged with the access id.
|
|
|
|
///
|
|
|
|
/// @see getTaggedAccesses
|
|
|
|
isl_union_map *getTaggedMayWrites() {
|
|
|
|
return isl_union_map_union(getTaggedAccesses(MemoryAccess::MAY_WRITE),
|
|
|
|
getTaggedAccesses(MemoryAccess::MUST_WRITE));
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Get the set of all must accesses, tagged with the access id.
|
|
|
|
///
|
|
|
|
/// @see getTaggedAccesses
|
|
|
|
isl_union_map *getTaggedMustWrites() {
|
|
|
|
return getTaggedAccesses(MemoryAccess::MUST_WRITE);
|
|
|
|
}
|
|
|
|
|
2016-07-14 18:51:52 +08:00
|
|
|
/// Collect parameter and array names as isl_ids.
|
|
|
|
///
|
|
|
|
/// To reason about the different parameters and arrays used, ppcg requires
|
|
|
|
/// a list of all isl_ids in use. As PPCG traditionally performs
|
|
|
|
/// source-to-source compilation each of these isl_ids is mapped to the
|
|
|
|
/// expression that represents it. As we do not have a corresponding
|
|
|
|
/// expression in Polly, we just map each id to a 'zero' expression to match
|
|
|
|
/// the data format that ppcg expects.
|
|
|
|
///
|
|
|
|
/// @returns Retun a map from collected ids to 'zero' ast expressions.
|
|
|
|
__isl_give isl_id_to_ast_expr *getNames() {
|
|
|
|
auto *Names = isl_id_to_ast_expr_alloc(
|
2016-07-14 18:53:00 +08:00
|
|
|
S->getIslCtx(),
|
|
|
|
S->getNumParams() + std::distance(S->array_begin(), S->array_end()));
|
2016-07-14 18:51:52 +08:00
|
|
|
auto *Zero = isl_ast_expr_from_val(isl_val_zero(S->getIslCtx()));
|
|
|
|
|
2017-07-28 14:49:44 +08:00
|
|
|
for (const SCEV *P : S->parameters()) {
|
2017-08-07 03:31:27 +08:00
|
|
|
isl_id *Id = S->getIdForParam(P).release();
|
2016-07-14 18:51:52 +08:00
|
|
|
Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto &Array : S->arrays()) {
|
2017-07-22 07:07:56 +08:00
|
|
|
auto Id = Array->getBasePtrId().release();
|
2016-07-14 18:51:52 +08:00
|
|
|
Names = isl_id_to_ast_expr_set(Names, Id, isl_ast_expr_copy(Zero));
|
|
|
|
}
|
|
|
|
|
|
|
|
isl_ast_expr_free(Zero);
|
|
|
|
|
|
|
|
return Names;
|
|
|
|
}
|
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
/// Create a new PPCG scop from the current scop.
|
|
|
|
///
|
2016-07-14 18:22:25 +08:00
|
|
|
/// The PPCG scop is initialized with data from the current polly::Scop. From
|
|
|
|
/// this initial data, the data-dependences in the PPCG scop are initialized.
|
|
|
|
/// We do not use Polly's dependence analysis for now, to ensure we match
|
|
|
|
/// the PPCG default behaviour more closely.
|
2016-07-14 18:22:19 +08:00
|
|
|
///
|
|
|
|
/// @returns A new ppcg scop.
|
|
|
|
ppcg_scop *createPPCGScop() {
|
2017-07-20 23:48:36 +08:00
|
|
|
MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
|
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));
|
|
|
|
|
|
|
|
PPCGScop->options = createPPCGOptions();
|
2017-07-05 22:57:04 +08:00
|
|
|
// enable live range reordering
|
|
|
|
PPCGScop->options->live_range_reordering = 1;
|
2016-07-14 18:22:19 +08:00
|
|
|
|
|
|
|
PPCGScop->start = 0;
|
|
|
|
PPCGScop->end = 0;
|
|
|
|
|
2017-08-07 03:52:38 +08:00
|
|
|
PPCGScop->context = S->getContext().release();
|
2017-08-07 05:42:25 +08:00
|
|
|
PPCGScop->domain = S->getDomains().release();
|
2017-07-20 23:48:36 +08:00
|
|
|
// TODO: investigate this further. PPCG calls collect_call_domains.
|
2017-08-07 03:52:38 +08:00
|
|
|
PPCGScop->call = isl_union_set_from_set(S->getContext().release());
|
2016-07-14 18:22:25 +08:00
|
|
|
PPCGScop->tagged_reads = getTaggedReads();
|
2017-08-07 03:22:27 +08:00
|
|
|
PPCGScop->reads = S->getReads().release();
|
2016-07-14 18:22:19 +08:00
|
|
|
PPCGScop->live_in = nullptr;
|
2016-07-14 18:22:25 +08:00
|
|
|
PPCGScop->tagged_may_writes = getTaggedMayWrites();
|
2017-08-07 03:22:27 +08:00
|
|
|
PPCGScop->may_writes = S->getWrites().release();
|
2016-07-14 18:22:25 +08:00
|
|
|
PPCGScop->tagged_must_writes = getTaggedMustWrites();
|
2017-08-07 03:22:27 +08:00
|
|
|
PPCGScop->must_writes = S->getMustWrites().release();
|
2016-07-14 18:22:19 +08:00
|
|
|
PPCGScop->live_out = nullptr;
|
2017-07-20 23:48:36 +08:00
|
|
|
PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();
|
|
|
|
PPCGScop->must_kills = KillsInfo.MustKills.take();
|
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
PPCGScop->tagger = nullptr;
|
2017-07-05 22:57:04 +08:00
|
|
|
PPCGScop->independence =
|
|
|
|
isl_union_map_empty(isl_set_get_space(PPCGScop->context));
|
2016-07-14 18:22:19 +08:00
|
|
|
PPCGScop->dep_flow = nullptr;
|
|
|
|
PPCGScop->tagged_dep_flow = nullptr;
|
|
|
|
PPCGScop->dep_false = nullptr;
|
|
|
|
PPCGScop->dep_forced = nullptr;
|
|
|
|
PPCGScop->dep_order = nullptr;
|
|
|
|
PPCGScop->tagged_dep_order = nullptr;
|
|
|
|
|
2016-07-14 18:22:25 +08:00
|
|
|
PPCGScop->schedule = S->getScheduleTree();
|
2017-07-05 22:57:04 +08:00
|
|
|
// If we have something non-trivial to kill, add it to the schedule
|
|
|
|
if (KillsInfo.KillsSchedule.get())
|
|
|
|
PPCGScop->schedule = isl_schedule_sequence(
|
|
|
|
PPCGScop->schedule, KillsInfo.KillsSchedule.take());
|
|
|
|
|
|
|
|
PPCGScop->names = getNames();
|
2016-07-14 18:22:19 +08:00
|
|
|
PPCGScop->pet = nullptr;
|
|
|
|
|
2016-07-14 18:22:25 +08:00
|
|
|
compute_tagger(PPCGScop);
|
|
|
|
compute_dependences(PPCGScop);
|
2017-07-20 23:48:36 +08:00
|
|
|
eliminate_dead_code(PPCGScop);
|
2016-07-14 18:22:25 +08:00
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
return PPCGScop;
|
|
|
|
}
|
|
|
|
|
2017-06-08 20:06:15 +08:00
|
|
|
/// Collect the array accesses in a statement.
|
2016-07-15 15:05:54 +08:00
|
|
|
///
|
|
|
|
/// @param Stmt The statement for which to collect the accesses.
|
|
|
|
///
|
|
|
|
/// @returns A list of array accesses.
|
|
|
|
gpu_stmt_access *getStmtAccesses(ScopStmt &Stmt) {
|
|
|
|
gpu_stmt_access *Accesses = nullptr;
|
|
|
|
|
|
|
|
for (MemoryAccess *Acc : Stmt) {
|
|
|
|
auto Access = isl_alloc_type(S->getIslCtx(), struct gpu_stmt_access);
|
|
|
|
Access->read = Acc->isRead();
|
|
|
|
Access->write = Acc->isWrite();
|
2017-07-23 12:08:38 +08:00
|
|
|
Access->access = Acc->getAccessRelation().release();
|
2016-07-15 15:05:54 +08:00
|
|
|
isl_space *Space = isl_map_get_space(Access->access);
|
|
|
|
Space = isl_space_range(Space);
|
|
|
|
Space = isl_space_from_range(Space);
|
2017-07-23 12:08:11 +08:00
|
|
|
Space = isl_space_set_tuple_id(Space, isl_dim_in, Acc->getId().release());
|
2016-07-15 15:05:54 +08:00
|
|
|
isl_map *Universe = isl_map_universe(Space);
|
|
|
|
Access->tagged_access =
|
2017-07-23 12:08:38 +08:00
|
|
|
isl_map_domain_product(Acc->getAccessRelation().release(), Universe);
|
2016-08-04 20:18:14 +08:00
|
|
|
Access->exact_write = !Acc->isMayWrite();
|
2017-07-23 12:08:11 +08:00
|
|
|
Access->ref_id = Acc->getId().release();
|
2016-07-15 15:05:54 +08:00
|
|
|
Access->next = Accesses;
|
2016-08-04 20:18:14 +08:00
|
|
|
Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
|
2016-07-15 15:05:54 +08:00
|
|
|
Accesses = Access;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Accesses;
|
|
|
|
}
|
|
|
|
|
2016-07-14 23:51:37 +08:00
|
|
|
/// Collect the list of GPU statements.
|
|
|
|
///
|
|
|
|
/// Each statement has an id, a pointer to the underlying data structure,
|
|
|
|
/// as well as a list with all memory accesses.
|
|
|
|
///
|
|
|
|
/// TODO: Initialize the list of memory accesses.
|
|
|
|
///
|
|
|
|
/// @returns A linked-list of statements.
|
|
|
|
gpu_stmt *getStatements() {
|
|
|
|
gpu_stmt *Stmts = isl_calloc_array(S->getIslCtx(), struct gpu_stmt,
|
|
|
|
std::distance(S->begin(), S->end()));
|
|
|
|
|
|
|
|
int i = 0;
|
|
|
|
for (auto &Stmt : *S) {
|
|
|
|
gpu_stmt *GPUStmt = &Stmts[i];
|
|
|
|
|
2017-08-07 00:39:52 +08:00
|
|
|
GPUStmt->id = Stmt.getDomainId().release();
|
2016-07-14 23:51:37 +08:00
|
|
|
|
|
|
|
// We use the pet stmt pointer to keep track of the Polly statements.
|
|
|
|
GPUStmt->stmt = (pet_stmt *)&Stmt;
|
2016-07-15 15:05:54 +08:00
|
|
|
GPUStmt->accesses = getStmtAccesses(Stmt);
|
2016-07-14 23:51:37 +08:00
|
|
|
i++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Stmts;
|
|
|
|
}
|
|
|
|
|
2016-07-15 15:05:54 +08:00
|
|
|
/// Derive the extent of an array.
|
|
|
|
///
|
2016-08-10 18:58:19 +08:00
|
|
|
/// The extent of an array is the set of elements that are within the
|
|
|
|
/// accessed array. For the inner dimensions, the extent constraints are
|
|
|
|
/// 0 and the size of the corresponding array dimension. For the first
|
|
|
|
/// (outermost) dimension, the extent constraints are the minimal and maximal
|
|
|
|
/// subscript value for the first dimension.
|
2016-07-15 15:05:54 +08:00
|
|
|
///
|
|
|
|
/// @param Array The array to derive the extent for.
|
|
|
|
///
|
|
|
|
/// @returns An isl_set describing the extent of the array.
|
|
|
|
__isl_give isl_set *getExtent(ScopArrayInfo *Array) {
|
2016-08-10 18:58:19 +08:00
|
|
|
unsigned NumDims = Array->getNumberOfDimensions();
|
2017-08-07 03:22:27 +08:00
|
|
|
isl_union_map *Accesses = S->getAccesses().release();
|
2017-08-07 05:42:25 +08:00
|
|
|
Accesses =
|
|
|
|
isl_union_map_intersect_domain(Accesses, S->getDomains().release());
|
2016-08-10 18:58:19 +08:00
|
|
|
Accesses = isl_union_map_detect_equalities(Accesses);
|
2016-07-15 15:05:54 +08:00
|
|
|
isl_union_set *AccessUSet = isl_union_map_range(Accesses);
|
2016-08-10 18:58:19 +08:00
|
|
|
AccessUSet = isl_union_set_coalesce(AccessUSet);
|
|
|
|
AccessUSet = isl_union_set_detect_equalities(AccessUSet);
|
|
|
|
AccessUSet = isl_union_set_coalesce(AccessUSet);
|
|
|
|
|
|
|
|
if (isl_union_set_is_empty(AccessUSet)) {
|
|
|
|
isl_union_set_free(AccessUSet);
|
2017-07-22 07:07:56 +08:00
|
|
|
return isl_set_empty(Array->getSpace().release());
|
2016-08-10 18:58:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (Array->getNumberOfDimensions() == 0) {
|
|
|
|
isl_union_set_free(AccessUSet);
|
2017-07-22 07:07:56 +08:00
|
|
|
return isl_set_universe(Array->getSpace().release());
|
2016-08-10 18:58:19 +08:00
|
|
|
}
|
|
|
|
|
2016-07-15 15:05:54 +08:00
|
|
|
isl_set *AccessSet =
|
2017-07-22 07:07:56 +08:00
|
|
|
isl_union_set_extract_set(AccessUSet, Array->getSpace().release());
|
2016-08-10 18:58:19 +08:00
|
|
|
|
2016-07-15 15:05:54 +08:00
|
|
|
isl_union_set_free(AccessUSet);
|
2017-07-22 07:07:56 +08:00
|
|
|
isl_local_space *LS =
|
|
|
|
isl_local_space_from_space(Array->getSpace().release());
|
2016-08-10 18:58:19 +08:00
|
|
|
|
|
|
|
isl_pw_aff *Val =
|
|
|
|
isl_pw_aff_from_aff(isl_aff_var_on_domain(LS, isl_dim_set, 0));
|
|
|
|
|
|
|
|
isl_pw_aff *OuterMin = isl_set_dim_min(isl_set_copy(AccessSet), 0);
|
|
|
|
isl_pw_aff *OuterMax = isl_set_dim_max(AccessSet, 0);
|
|
|
|
OuterMin = isl_pw_aff_add_dims(OuterMin, isl_dim_in,
|
|
|
|
isl_pw_aff_dim(Val, isl_dim_in));
|
|
|
|
OuterMax = isl_pw_aff_add_dims(OuterMax, isl_dim_in,
|
|
|
|
isl_pw_aff_dim(Val, isl_dim_in));
|
2017-07-22 07:07:56 +08:00
|
|
|
OuterMin = isl_pw_aff_set_tuple_id(OuterMin, isl_dim_in,
|
|
|
|
Array->getBasePtrId().release());
|
|
|
|
OuterMax = isl_pw_aff_set_tuple_id(OuterMax, isl_dim_in,
|
|
|
|
Array->getBasePtrId().release());
|
2016-08-10 18:58:19 +08:00
|
|
|
|
2017-07-22 07:07:56 +08:00
|
|
|
isl_set *Extent = isl_set_universe(Array->getSpace().release());
|
2016-08-10 18:58:19 +08:00
|
|
|
|
|
|
|
Extent = isl_set_intersect(
|
|
|
|
Extent, isl_pw_aff_le_set(OuterMin, isl_pw_aff_copy(Val)));
|
|
|
|
Extent = isl_set_intersect(Extent, isl_pw_aff_ge_set(OuterMax, Val));
|
|
|
|
|
|
|
|
for (unsigned i = 1; i < NumDims; ++i)
|
|
|
|
Extent = isl_set_lower_bound_si(Extent, isl_dim_set, i, 0);
|
|
|
|
|
2017-05-19 23:07:45 +08:00
|
|
|
for (unsigned i = 0; i < NumDims; ++i) {
|
2016-08-10 18:58:19 +08:00
|
|
|
isl_pw_aff *PwAff =
|
2017-07-22 07:07:56 +08:00
|
|
|
const_cast<isl_pw_aff *>(Array->getDimensionSizePw(i).release());
|
2017-05-19 23:07:45 +08:00
|
|
|
|
|
|
|
// isl_pw_aff can be NULL for zero dimension. Only in the case of a
|
|
|
|
// Fortran array will we have a legitimate dimension.
|
|
|
|
if (!PwAff) {
|
|
|
|
assert(i == 0 && "invalid dimension isl_pw_aff for nonzero dimension");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-08-10 18:58:19 +08:00
|
|
|
isl_pw_aff *Val = isl_pw_aff_from_aff(isl_aff_var_on_domain(
|
2017-07-22 07:07:56 +08:00
|
|
|
isl_local_space_from_space(Array->getSpace().release()), isl_dim_set,
|
|
|
|
i));
|
2016-08-10 18:58:19 +08:00
|
|
|
PwAff = isl_pw_aff_add_dims(PwAff, isl_dim_in,
|
|
|
|
isl_pw_aff_dim(Val, isl_dim_in));
|
|
|
|
PwAff = isl_pw_aff_set_tuple_id(PwAff, isl_dim_in,
|
|
|
|
isl_pw_aff_get_tuple_id(Val, isl_dim_in));
|
|
|
|
auto *Set = isl_pw_aff_gt_set(PwAff, Val);
|
|
|
|
Extent = isl_set_intersect(Set, Extent);
|
|
|
|
}
|
2016-07-15 15:05:54 +08:00
|
|
|
|
2016-08-10 18:58:19 +08:00
|
|
|
return Extent;
|
2016-07-15 15:05:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Derive the bounds of an array.
|
|
|
|
///
|
|
|
|
/// For the first dimension we derive the bound of the array from the extent
|
|
|
|
/// of this dimension. For inner dimensions we obtain their size directly from
|
|
|
|
/// ScopArrayInfo.
|
|
|
|
///
|
|
|
|
/// @param PPCGArray The array to compute bounds for.
|
|
|
|
/// @param Array The polly array from which to take the information.
|
|
|
|
void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
|
2017-08-03 20:09:33 +08:00
|
|
|
std::vector<isl_pw_aff *> Bounds;
|
2017-07-20 23:48:36 +08:00
|
|
|
|
2016-07-15 15:05:54 +08:00
|
|
|
if (PPCGArray.n_index > 0) {
|
2016-09-11 21:30:12 +08:00
|
|
|
if (isl_set_is_empty(PPCGArray.extent)) {
|
|
|
|
isl_set *Dom = isl_set_copy(PPCGArray.extent);
|
|
|
|
isl_local_space *LS = isl_local_space_from_space(
|
|
|
|
isl_space_params(isl_set_get_space(Dom)));
|
|
|
|
isl_set_free(Dom);
|
2017-07-20 23:48:36 +08:00
|
|
|
isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS));
|
2017-08-03 20:09:33 +08:00
|
|
|
Bounds.push_back(Zero);
|
2016-09-11 21:30:12 +08:00
|
|
|
} else {
|
|
|
|
isl_set *Dom = isl_set_copy(PPCGArray.extent);
|
|
|
|
Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
|
|
|
|
isl_pw_aff *Bound = isl_set_dim_max(isl_set_copy(Dom), 0);
|
|
|
|
isl_set_free(Dom);
|
|
|
|
Dom = isl_pw_aff_domain(isl_pw_aff_copy(Bound));
|
|
|
|
isl_local_space *LS =
|
|
|
|
isl_local_space_from_space(isl_set_get_space(Dom));
|
|
|
|
isl_aff *One = isl_aff_zero_on_domain(LS);
|
|
|
|
One = isl_aff_add_constant_si(One, 1);
|
|
|
|
Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
|
2017-08-07 03:52:38 +08:00
|
|
|
Bound = isl_pw_aff_gist(Bound, S->getContext().release());
|
2017-08-03 20:09:33 +08:00
|
|
|
Bounds.push_back(Bound);
|
2016-09-11 21:30:12 +08:00
|
|
|
}
|
2016-07-15 15:05:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
for (unsigned i = 1; i < PPCGArray.n_index; ++i) {
|
2017-07-22 07:07:56 +08:00
|
|
|
isl_pw_aff *Bound = Array->getDimensionSizePw(i).release();
|
2016-07-15 15:05:54 +08:00
|
|
|
auto LS = isl_pw_aff_get_domain_space(Bound);
|
|
|
|
auto Aff = isl_multi_aff_zero(LS);
|
|
|
|
Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
|
2017-08-03 20:09:33 +08:00
|
|
|
Bounds.push_back(Bound);
|
2016-07-15 15:05:54 +08:00
|
|
|
}
|
2017-07-20 23:48:36 +08:00
|
|
|
|
2017-08-03 20:09:33 +08:00
|
|
|
/// To construct a `isl_multi_pw_aff`, we need all the indivisual `pw_aff`
|
|
|
|
/// to have the same parameter dimensions. So, we need to align them to an
|
|
|
|
/// appropriate space.
|
|
|
|
/// Scop::Context is _not_ an appropriate space, because when we have
|
|
|
|
/// `-polly-ignore-parameter-bounds` enabled, the Scop::Context does not
|
|
|
|
/// contain all parameter dimensions.
|
|
|
|
/// So, use the helper `alignPwAffs` to align all the `isl_pw_aff` together.
|
2017-08-07 04:11:59 +08:00
|
|
|
isl_space *SeedAlignSpace = S->getParamSpace().release();
|
2017-08-03 20:09:33 +08:00
|
|
|
SeedAlignSpace = isl_space_add_dims(SeedAlignSpace, isl_dim_set, 1);
|
|
|
|
|
|
|
|
isl_space *AlignSpace = nullptr;
|
|
|
|
std::vector<isl_pw_aff *> AlignedBounds;
|
|
|
|
std::tie(AlignSpace, AlignedBounds) =
|
|
|
|
alignPwAffs(std::move(Bounds), SeedAlignSpace);
|
|
|
|
|
|
|
|
assert(AlignSpace && "alignPwAffs did not initialise AlignSpace");
|
|
|
|
|
|
|
|
isl_pw_aff_list *BoundsList =
|
|
|
|
createPwAffList(S->getIslCtx(), std::move(AlignedBounds));
|
|
|
|
|
2017-07-20 23:48:36 +08:00
|
|
|
isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent);
|
2017-08-03 20:09:33 +08:00
|
|
|
BoundsSpace = isl_space_align_params(BoundsSpace, AlignSpace);
|
2017-07-20 23:48:36 +08:00
|
|
|
|
|
|
|
assert(BoundsSpace && "Unable to access space of array.");
|
|
|
|
assert(BoundsList && "Unable to access list of bounds.");
|
|
|
|
|
|
|
|
PPCGArray.bound =
|
|
|
|
isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList);
|
|
|
|
assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly.");
|
2016-07-15 15:05:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Create the arrays for @p PPCGProg.
|
|
|
|
///
|
|
|
|
/// @param PPCGProg The program to compute the arrays for.
|
2017-07-25 20:35:36 +08:00
|
|
|
void createArrays(gpu_prog *PPCGProg,
|
|
|
|
const SmallVector<ScopArrayInfo *, 4> &ValidSAIs) {
|
2016-07-15 15:05:54 +08:00
|
|
|
int i = 0;
|
2017-07-25 20:35:36 +08:00
|
|
|
for (auto &Array : ValidSAIs) {
|
2016-07-15 15:05:54 +08:00
|
|
|
std::string TypeName;
|
|
|
|
raw_string_ostream OS(TypeName);
|
|
|
|
|
|
|
|
OS << *Array->getElementType();
|
|
|
|
TypeName = OS.str();
|
|
|
|
|
|
|
|
gpu_array_info &PPCGArray = PPCGProg->array[i];
|
|
|
|
|
2017-07-22 07:07:56 +08:00
|
|
|
PPCGArray.space = Array->getSpace().release();
|
2016-07-15 15:05:54 +08:00
|
|
|
PPCGArray.type = strdup(TypeName.c_str());
|
|
|
|
PPCGArray.size = Array->getElementType()->getPrimitiveSizeInBits() / 8;
|
|
|
|
PPCGArray.name = strdup(Array->getName().c_str());
|
|
|
|
PPCGArray.extent = nullptr;
|
|
|
|
PPCGArray.n_index = Array->getNumberOfDimensions();
|
|
|
|
PPCGArray.extent = getExtent(Array);
|
|
|
|
PPCGArray.n_ref = 0;
|
|
|
|
PPCGArray.refs = nullptr;
|
|
|
|
PPCGArray.accessed = true;
|
2016-09-18 03:22:18 +08:00
|
|
|
PPCGArray.read_only_scalar =
|
|
|
|
Array->isReadOnly() && Array->getNumberOfDimensions() == 0;
|
2016-07-15 15:05:54 +08:00
|
|
|
PPCGArray.has_compound_element = false;
|
|
|
|
PPCGArray.local = false;
|
|
|
|
PPCGArray.declare_local = false;
|
|
|
|
PPCGArray.global = false;
|
|
|
|
PPCGArray.linearize = false;
|
|
|
|
PPCGArray.dep_order = nullptr;
|
2016-07-25 20:47:39 +08:00
|
|
|
PPCGArray.user = Array;
|
2016-07-15 15:05:54 +08:00
|
|
|
|
2017-07-20 23:48:36 +08:00
|
|
|
PPCGArray.bound = nullptr;
|
2016-07-15 15:05:54 +08:00
|
|
|
setArrayBounds(PPCGArray, Array);
|
2016-07-15 18:51:14 +08:00
|
|
|
i++;
|
2016-07-18 23:44:32 +08:00
|
|
|
|
|
|
|
collect_references(PPCGProg, &PPCGArray);
|
2016-07-15 15:05:54 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Create an identity map between the arrays in the scop.
|
|
|
|
///
|
|
|
|
/// @returns An identity map between the arrays in the scop.
|
|
|
|
isl_union_map *getArrayIdentity() {
|
2017-08-07 04:11:59 +08:00
|
|
|
isl_union_map *Maps = isl_union_map_empty(S->getParamSpace().release());
|
2016-07-15 15:05:54 +08:00
|
|
|
|
2016-07-30 17:25:51 +08:00
|
|
|
for (auto &Array : S->arrays()) {
|
2017-07-22 07:07:56 +08:00
|
|
|
isl_space *Space = Array->getSpace().release();
|
2016-07-15 15:05:54 +08:00
|
|
|
Space = isl_space_map_from_set(Space);
|
|
|
|
isl_map *Identity = isl_map_identity(Space);
|
|
|
|
Maps = isl_union_map_add_map(Maps, Identity);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Maps;
|
|
|
|
}
|
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
/// Create a default-initialized PPCG GPU program.
|
|
|
|
///
|
2017-06-08 20:06:15 +08:00
|
|
|
/// @returns A new gpu program description.
|
2016-07-14 18:22:19 +08:00
|
|
|
gpu_prog *createPPCGProg(ppcg_scop *PPCGScop) {
|
|
|
|
|
|
|
|
if (!PPCGScop)
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
auto PPCGProg = isl_calloc_type(S->getIslCtx(), struct gpu_prog);
|
|
|
|
|
|
|
|
PPCGProg->ctx = S->getIslCtx();
|
|
|
|
PPCGProg->scop = PPCGScop;
|
2016-07-14 18:51:52 +08:00
|
|
|
PPCGProg->context = isl_set_copy(PPCGScop->context);
|
2016-07-15 15:05:54 +08:00
|
|
|
PPCGProg->read = isl_union_map_copy(PPCGScop->reads);
|
|
|
|
PPCGProg->may_write = isl_union_map_copy(PPCGScop->may_writes);
|
|
|
|
PPCGProg->must_write = isl_union_map_copy(PPCGScop->must_writes);
|
|
|
|
PPCGProg->tagged_must_kill =
|
|
|
|
isl_union_map_copy(PPCGScop->tagged_must_kills);
|
|
|
|
PPCGProg->to_inner = getArrayIdentity();
|
|
|
|
PPCGProg->to_outer = getArrayIdentity();
|
2017-07-20 23:48:36 +08:00
|
|
|
// TODO: verify that this assignment is correct.
|
2016-07-14 18:22:19 +08:00
|
|
|
PPCGProg->any_to_outer = nullptr;
|
2017-07-05 22:57:04 +08:00
|
|
|
|
|
|
|
// this needs to be set when live range reordering is enabled.
|
|
|
|
// NOTE: I believe that is conservatively correct. I'm not sure
|
|
|
|
// what the semantics of this is.
|
|
|
|
// Quoting PPCG/gpu.h: "Order dependences on non-scalars."
|
|
|
|
PPCGProg->array_order =
|
|
|
|
isl_union_map_empty(isl_set_get_space(PPCGScop->context));
|
2016-07-14 23:51:37 +08:00
|
|
|
PPCGProg->n_stmts = std::distance(S->begin(), S->end());
|
|
|
|
PPCGProg->stmts = getStatements();
|
2017-07-25 20:35:36 +08:00
|
|
|
|
|
|
|
// Only consider arrays that have a non-empty extent.
|
|
|
|
// Otherwise, this will cause us to consider the following kinds of
|
|
|
|
// empty arrays:
|
|
|
|
// 1. Invariant loads that are represented by SAI objects.
|
|
|
|
// 2. Arrays with statically known zero size.
|
|
|
|
auto ValidSAIsRange =
|
|
|
|
make_filter_range(S->arrays(), [this](ScopArrayInfo *SAI) -> bool {
|
|
|
|
return !isl::manage(getExtent(SAI)).is_empty();
|
|
|
|
});
|
|
|
|
SmallVector<ScopArrayInfo *, 4> ValidSAIs(ValidSAIsRange.begin(),
|
|
|
|
ValidSAIsRange.end());
|
|
|
|
|
|
|
|
PPCGProg->n_array =
|
|
|
|
ValidSAIs.size(); // std::distance(S->array_begin(), S->array_end());
|
2016-07-15 15:05:54 +08:00
|
|
|
PPCGProg->array = isl_calloc_array(S->getIslCtx(), struct gpu_array_info,
|
|
|
|
PPCGProg->n_array);
|
|
|
|
|
2017-07-25 20:35:36 +08:00
|
|
|
createArrays(PPCGProg, ValidSAIs);
|
2016-07-14 18:22:19 +08:00
|
|
|
|
2016-08-10 18:58:19 +08:00
|
|
|
PPCGProg->may_persist = compute_may_persist(PPCGProg);
|
2016-07-14 18:22:19 +08:00
|
|
|
return PPCGProg;
|
|
|
|
}
|
|
|
|
|
2016-07-14 23:51:37 +08:00
|
|
|
/// User data handed to the host-AST printing callback (printHostUser).
///
/// The pointer members are null-initialized so that a default-constructed
/// instance never carries indeterminate pointers into the printing callbacks,
/// even when a caller only fills in some of the fields.
struct PrintGPUUserData {
  /// CUDA output description; stays null unless the caller sets it.
  struct cuda_info *CudaInfo = nullptr;

  /// The PPCG program whose host AST is being printed.
  struct gpu_prog *PPCGProg = nullptr;

  /// Kernels collected while the host AST is printed.
  std::vector<struct ppcg_kernel *> Kernels;
};
|
|
|
|
|
|
|
|
/// Print a user statement node in the host code.
|
|
|
|
///
|
|
|
|
/// We use ppcg's printing facilities to print the actual statement and
|
|
|
|
/// additionally build up a list of all kernels that are encountered in the
|
|
|
|
/// host ast.
|
|
|
|
///
|
|
|
|
/// @param P The printer to print to
|
|
|
|
/// @param Options The printing options to use
|
|
|
|
/// @param Node The node to print
|
|
|
|
/// @param User A user pointer to carry additional data. This pointer is
|
|
|
|
/// expected to be of type PrintGPUUserData.
|
|
|
|
///
|
|
|
|
/// @returns A printer to which the output has been printed.
|
|
|
|
static __isl_give isl_printer *
|
|
|
|
printHostUser(__isl_take isl_printer *P,
|
|
|
|
__isl_take isl_ast_print_options *Options,
|
|
|
|
__isl_take isl_ast_node *Node, void *User) {
|
|
|
|
auto Data = (struct PrintGPUUserData *)User;
|
|
|
|
auto Id = isl_ast_node_get_annotation(Node);
|
|
|
|
|
|
|
|
if (Id) {
|
2016-07-16 01:12:41 +08:00
|
|
|
bool IsUser = !strcmp(isl_id_get_name(Id), "user");
|
|
|
|
|
|
|
|
// If this is a user statement, format it ourselves as ppcg would
|
|
|
|
// otherwise try to call pet functionality that is not available in
|
|
|
|
// Polly.
|
|
|
|
if (IsUser) {
|
|
|
|
P = isl_printer_start_line(P);
|
|
|
|
P = isl_printer_print_ast_node(P, Node);
|
|
|
|
P = isl_printer_end_line(P);
|
|
|
|
isl_id_free(Id);
|
|
|
|
isl_ast_print_options_free(Options);
|
|
|
|
return P;
|
|
|
|
}
|
|
|
|
|
2016-07-14 23:51:37 +08:00
|
|
|
auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
|
|
|
|
isl_id_free(Id);
|
|
|
|
Data->Kernels.push_back(Kernel);
|
|
|
|
}
|
|
|
|
|
|
|
|
return print_host_user(P, Options, Node, User);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Print C code corresponding to the control flow in @p Kernel.
|
|
|
|
///
|
|
|
|
/// @param Kernel The kernel to print
|
|
|
|
void printKernel(ppcg_kernel *Kernel) {
|
|
|
|
auto *P = isl_printer_to_str(S->getIslCtx());
|
|
|
|
P = isl_printer_set_output_format(P, ISL_FORMAT_C);
|
|
|
|
auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
|
|
|
|
P = isl_ast_node_print(Kernel->tree, P, Options);
|
|
|
|
char *String = isl_printer_get_str(P);
|
|
|
|
printf("%s\n", String);
|
|
|
|
free(String);
|
|
|
|
isl_printer_free(P);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Print C code corresponding to the GPU code described by @p Tree.
|
|
|
|
///
|
|
|
|
/// @param Tree An AST describing GPU code
|
|
|
|
/// @param PPCGProg The PPCG program from which @Tree has been constructed.
|
|
|
|
void printGPUTree(isl_ast_node *Tree, gpu_prog *PPCGProg) {
|
|
|
|
auto *P = isl_printer_to_str(S->getIslCtx());
|
|
|
|
P = isl_printer_set_output_format(P, ISL_FORMAT_C);
|
|
|
|
|
|
|
|
PrintGPUUserData Data;
|
|
|
|
Data.PPCGProg = PPCGProg;
|
|
|
|
|
|
|
|
auto *Options = isl_ast_print_options_alloc(S->getIslCtx());
|
|
|
|
Options =
|
|
|
|
isl_ast_print_options_set_print_user(Options, printHostUser, &Data);
|
|
|
|
P = isl_ast_node_print(Tree, P, Options);
|
|
|
|
char *String = isl_printer_get_str(P);
|
|
|
|
printf("# host\n");
|
|
|
|
printf("%s\n", String);
|
|
|
|
free(String);
|
|
|
|
isl_printer_free(P);
|
|
|
|
|
|
|
|
for (auto Kernel : Data.Kernels) {
|
|
|
|
printf("# kernel%d\n", Kernel->id);
|
|
|
|
printKernel(Kernel);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-14 18:22:25 +08:00
|
|
|
// Generate a GPU program using PPCG.
|
|
|
|
//
|
|
|
|
// GPU mapping consists of multiple steps:
|
|
|
|
//
|
|
|
|
// 1) Compute new schedule for the program.
|
|
|
|
// 2) Map schedule to GPU (TODO)
|
|
|
|
// 3) Generate code for new schedule (TODO)
|
|
|
|
//
|
|
|
|
// We do not use here the Polly ScheduleOptimizer, as the schedule optimizer
|
|
|
|
// is mostly CPU specific. Instead, we use PPCG's GPU code generation
|
|
|
|
// strategy directly from this pass.
|
|
|
|
gpu_gen *generateGPU(ppcg_scop *PPCGScop, gpu_prog *PPCGProg) {
|
|
|
|
|
|
|
|
auto PPCGGen = isl_calloc_type(S->getIslCtx(), struct gpu_gen);
|
|
|
|
|
|
|
|
PPCGGen->ctx = S->getIslCtx();
|
|
|
|
PPCGGen->options = PPCGScop->options;
|
|
|
|
PPCGGen->print = nullptr;
|
|
|
|
PPCGGen->print_user = nullptr;
|
2016-07-14 23:51:32 +08:00
|
|
|
PPCGGen->build_ast_expr = &pollyBuildAstExprForStmt;
|
2016-07-14 18:22:25 +08:00
|
|
|
PPCGGen->prog = PPCGProg;
|
|
|
|
PPCGGen->tree = nullptr;
|
|
|
|
PPCGGen->types.n = 0;
|
|
|
|
PPCGGen->types.name = nullptr;
|
|
|
|
PPCGGen->sizes = nullptr;
|
|
|
|
PPCGGen->used_sizes = nullptr;
|
|
|
|
PPCGGen->kernel_id = 0;
|
|
|
|
|
|
|
|
// Set scheduling strategy to same strategy PPCG is using.
|
|
|
|
isl_options_set_schedule_outer_coincidence(PPCGGen->ctx, true);
|
|
|
|
isl_options_set_schedule_maximize_band_depth(PPCGGen->ctx, true);
|
2016-07-16 00:15:47 +08:00
|
|
|
isl_options_set_schedule_whole_component(PPCGGen->ctx, false);
|
2016-07-14 18:22:25 +08:00
|
|
|
|
|
|
|
isl_schedule *Schedule = get_schedule(PPCGGen);
|
|
|
|
|
2016-07-14 18:51:52 +08:00
|
|
|
int has_permutable = has_any_permutable_node(Schedule);
|
|
|
|
|
Make sure that all parameter dimensions are set in schedule
Summary:
In case the option -polly-ignore-parameter-bounds is set, not all parameters
will be added to context and domains. This is useful to keep the size of the
sets and maps we work with small. Unfortunately, for AST generation it is
necessary to ensure all parameters are part of the schedule tree. Hence,
we modify the GPGPU code generation to make sure this is the case.
To obtain the necessary information we expose a new function
Scop::getFullParamSpace(). We also make a couple of functions const to be
able to make SCoP::getFullParamSpace() const.
Reviewers: Meinersbur, bollu, gareevroman, efriedma, huihuiz, sebpop, simbuerg
Subscribers: nemanjai, kbarton, pollydev, llvm-commits
Tags: #polly
Differential Revision: https://reviews.llvm.org/D36243
llvm-svn: 309939
2017-08-03 21:51:15 +08:00
|
|
|
Schedule =
|
|
|
|
isl_schedule_align_params(Schedule, S->getFullParamSpace().release());
|
|
|
|
|
2016-07-14 23:51:37 +08:00
|
|
|
if (!has_permutable || has_permutable < 0) {
|
2016-07-14 18:51:52 +08:00
|
|
|
Schedule = isl_schedule_free(Schedule);
|
2017-08-05 03:36:40 +08:00
|
|
|
DEBUG(dbgs() << getUniqueScopName(S)
|
|
|
|
<< " does not have permutable bands. Bailing out\n";);
|
2016-07-14 23:51:37 +08:00
|
|
|
} else {
|
2016-07-14 18:51:52 +08:00
|
|
|
Schedule = map_to_device(PPCGGen, Schedule);
|
2016-07-14 23:51:37 +08:00
|
|
|
PPCGGen->tree = generate_code(PPCGGen, isl_schedule_copy(Schedule));
|
|
|
|
}
|
2016-07-14 18:51:52 +08:00
|
|
|
|
2016-07-14 18:22:25 +08:00
|
|
|
if (DumpSchedule) {
|
|
|
|
isl_printer *P = isl_printer_to_str(S->getIslCtx());
|
|
|
|
P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
|
|
|
|
P = isl_printer_print_str(P, "Schedule\n");
|
|
|
|
P = isl_printer_print_str(P, "========\n");
|
|
|
|
if (Schedule)
|
|
|
|
P = isl_printer_print_schedule(P, Schedule);
|
|
|
|
else
|
|
|
|
P = isl_printer_print_str(P, "No schedule found\n");
|
|
|
|
|
|
|
|
printf("%s\n", isl_printer_get_str(P));
|
|
|
|
isl_printer_free(P);
|
|
|
|
}
|
|
|
|
|
2016-07-14 23:51:37 +08:00
|
|
|
if (DumpCode) {
|
|
|
|
printf("Code\n");
|
|
|
|
printf("====\n");
|
|
|
|
if (PPCGGen->tree)
|
|
|
|
printGPUTree(PPCGGen->tree, PPCGProg);
|
|
|
|
else
|
|
|
|
printf("No code generated\n");
|
|
|
|
}
|
|
|
|
|
2016-07-14 18:22:25 +08:00
|
|
|
isl_schedule_free(Schedule);
|
|
|
|
|
|
|
|
return PPCGGen;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Free gpu_gen structure.
|
|
|
|
///
|
|
|
|
/// @param PPCGGen The ppcg_gen object to free.
|
|
|
|
void freePPCGGen(gpu_gen *PPCGGen) {
|
|
|
|
isl_ast_node_free(PPCGGen->tree);
|
|
|
|
isl_union_map_free(PPCGGen->sizes);
|
|
|
|
isl_union_map_free(PPCGGen->used_sizes);
|
|
|
|
free(PPCGGen);
|
|
|
|
}
|
|
|
|
|
2016-07-15 18:32:22 +08:00
|
|
|
/// Free the options in the ppcg scop structure.
|
|
|
|
///
|
|
|
|
/// ppcg is not freeing these options for us. To avoid leaks we do this
|
|
|
|
/// ourselves.
|
|
|
|
///
|
|
|
|
/// @param PPCGScop The scop referencing the options to free.
|
|
|
|
void freeOptions(ppcg_scop *PPCGScop) {
|
|
|
|
free(PPCGScop->options->debug);
|
|
|
|
PPCGScop->options->debug = nullptr;
|
|
|
|
free(PPCGScop->options);
|
|
|
|
PPCGScop->options = nullptr;
|
|
|
|
}
|
|
|
|
|
2016-09-18 14:50:35 +08:00
|
|
|
/// Approximate the number of points in the set.
|
|
|
|
///
|
|
|
|
/// This function returns an ast expression that overapproximates the number
|
|
|
|
/// of points in an isl set through the rectangular hull surrounding this set.
|
|
|
|
///
|
|
|
|
/// @param Set The set to count.
|
|
|
|
/// @param Build The isl ast build object to use for creating the ast
|
|
|
|
/// expression.
|
|
|
|
///
|
|
|
|
/// @returns An approximation of the number of points in the set.
|
|
|
|
__isl_give isl_ast_expr *approxPointsInSet(__isl_take isl_set *Set,
|
|
|
|
__isl_keep isl_ast_build *Build) {
|
|
|
|
|
|
|
|
isl_val *One = isl_val_int_from_si(isl_set_get_ctx(Set), 1);
|
|
|
|
auto *Expr = isl_ast_expr_from_val(isl_val_copy(One));
|
|
|
|
|
|
|
|
isl_space *Space = isl_set_get_space(Set);
|
|
|
|
Space = isl_space_params(Space);
|
|
|
|
auto *Univ = isl_set_universe(Space);
|
|
|
|
isl_pw_aff *OneAff = isl_pw_aff_val_on_domain(Univ, One);
|
|
|
|
|
|
|
|
for (long i = 0; i < isl_set_dim(Set, isl_dim_set); i++) {
|
|
|
|
isl_pw_aff *Max = isl_set_dim_max(isl_set_copy(Set), i);
|
|
|
|
isl_pw_aff *Min = isl_set_dim_min(isl_set_copy(Set), i);
|
|
|
|
isl_pw_aff *DimSize = isl_pw_aff_sub(Max, Min);
|
|
|
|
DimSize = isl_pw_aff_add(DimSize, isl_pw_aff_copy(OneAff));
|
|
|
|
auto DimSizeExpr = isl_ast_build_expr_from_pw_aff(Build, DimSize);
|
|
|
|
Expr = isl_ast_expr_mul(Expr, DimSizeExpr);
|
|
|
|
}
|
|
|
|
|
|
|
|
isl_set_free(Set);
|
|
|
|
isl_pw_aff_free(OneAff);
|
|
|
|
|
|
|
|
return Expr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Approximate a number of dynamic instructions executed by a given
|
|
|
|
/// statement.
|
|
|
|
///
|
|
|
|
/// @param Stmt The statement for which to compute the number of dynamic
|
|
|
|
/// instructions.
|
|
|
|
/// @param Build The isl ast build object to use for creating the ast
|
|
|
|
/// expression.
|
|
|
|
/// @returns An approximation of the number of dynamic instructions executed
|
|
|
|
/// by @p Stmt.
|
|
|
|
__isl_give isl_ast_expr *approxDynamicInst(ScopStmt &Stmt,
|
|
|
|
__isl_keep isl_ast_build *Build) {
|
2017-08-07 00:39:52 +08:00
|
|
|
auto Iterations = approxPointsInSet(Stmt.getDomain().release(), Build);
|
2016-09-18 14:50:35 +08:00
|
|
|
|
|
|
|
long InstCount = 0;
|
|
|
|
|
|
|
|
if (Stmt.isBlockStmt()) {
|
|
|
|
auto *BB = Stmt.getBasicBlock();
|
|
|
|
InstCount = std::distance(BB->begin(), BB->end());
|
|
|
|
} else {
|
|
|
|
auto *R = Stmt.getRegion();
|
|
|
|
|
|
|
|
for (auto *BB : R->blocks()) {
|
|
|
|
InstCount += std::distance(BB->begin(), BB->end());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
isl_val *InstVal = isl_val_int_from_si(S->getIslCtx(), InstCount);
|
|
|
|
auto *InstExpr = isl_ast_expr_from_val(InstVal);
|
|
|
|
return isl_ast_expr_mul(InstExpr, Iterations);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Approximate dynamic instructions executed in scop.
|
|
|
|
///
|
|
|
|
/// @param S The scop for which to approximate dynamic instructions.
|
|
|
|
/// @param Build The isl ast build object to use for creating the ast
|
|
|
|
/// expression.
|
|
|
|
/// @returns An approximation of the number of dynamic instructions executed
|
|
|
|
/// in @p S.
|
|
|
|
__isl_give isl_ast_expr *
|
|
|
|
getNumberOfIterations(Scop &S, __isl_keep isl_ast_build *Build) {
|
|
|
|
isl_ast_expr *Instructions;
|
|
|
|
|
|
|
|
isl_val *Zero = isl_val_int_from_si(S.getIslCtx(), 0);
|
|
|
|
Instructions = isl_ast_expr_from_val(Zero);
|
|
|
|
|
|
|
|
for (ScopStmt &Stmt : S) {
|
|
|
|
isl_ast_expr *StmtInstructions = approxDynamicInst(Stmt, Build);
|
|
|
|
Instructions = isl_ast_expr_add(Instructions, StmtInstructions);
|
|
|
|
}
|
|
|
|
return Instructions;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Create a check that ensures sufficient compute in scop.
|
|
|
|
///
|
|
|
|
/// @param S The scop for which to ensure sufficient compute.
|
|
|
|
/// @param Build The isl ast build object to use for creating the ast
|
|
|
|
/// expression.
|
|
|
|
/// @returns An expression that evaluates to TRUE in case of sufficient
|
|
|
|
/// compute and to FALSE, otherwise.
|
|
|
|
__isl_give isl_ast_expr *
|
|
|
|
createSufficientComputeCheck(Scop &S, __isl_keep isl_ast_build *Build) {
|
|
|
|
auto Iterations = getNumberOfIterations(S, Build);
|
|
|
|
auto *MinComputeVal = isl_val_int_from_si(S.getIslCtx(), MinCompute);
|
|
|
|
auto *MinComputeExpr = isl_ast_expr_from_val(MinComputeVal);
|
|
|
|
return isl_ast_expr_ge(Iterations, MinComputeExpr);
|
|
|
|
}
|
|
|
|
|
2017-06-26 21:12:06 +08:00
|
|
|
/// Check if the basic block contains a function we cannot codegen for GPU
|
|
|
|
/// kernels.
|
|
|
|
///
|
|
|
|
/// If this basic block does something with a `Function` other than calling
|
|
|
|
/// a function that we support in a kernel, return true.
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
bool containsInvalidKernelFunctionInBlock(const BasicBlock *BB,
|
|
|
|
bool AllowCUDALibDevice) {
|
2017-06-26 21:12:06 +08:00
|
|
|
for (const Instruction &Inst : *BB) {
|
|
|
|
const CallInst *Call = dyn_cast<CallInst>(&Inst);
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
if (Call && isValidFunctionInKernel(Call->getCalledFunction(),
|
|
|
|
AllowCUDALibDevice)) {
|
2017-06-26 21:12:06 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
[Polly] [PPCGCodeGeneration] Skip Scops which contain function pointers.
In `PPCGCodeGeneration`, we try to take the references of every `Value`
that is used within a Scop to offload to the kernel. This occurs in
`GPUNodeBuilder::createLaunchParameters`.
This breaks if one of the values is a function pointer, since one of
these cases will trigger:
1. We try to to take the references of an intrinsic function, and this
breaks at `verifyModule`, since it is illegal to take the reference of
an intrinsic.
2. We manage to take the reference to a function, but this fails at
`verifyModule` since the function will not be present in the module that
is created in the kernel.
3. Even if `verifyModule` succeeds (which should not occur), we would
then try to call a *host function* from the *device*, which is
illegal runtime behaviour.
So, we disable this entire range of possibilities by simply not allowing
function references within a `Scop` which corresponds to a kernel.
However, note that this is too conservative. We *can* allow intrinsics
within kernels if the backend can lower the intrinsic correctly. For
example, an intrinsic like `llvm.powi.*` can actually be lowered by the `NVPTX`
backend.
We will now gradually whitelist intrinsics which are known to be safe.
Differential Revision: https://reviews.llvm.org/D33414
llvm-svn: 305185
2017-06-12 19:41:09 +08:00
|
|
|
for (Value *SrcVal : Inst.operands()) {
|
|
|
|
PointerType *p = dyn_cast<PointerType>(SrcVal->getType());
|
|
|
|
if (!p)
|
|
|
|
continue;
|
|
|
|
if (isa<FunctionType>(p->getElementType()))
|
|
|
|
return true;
|
|
|
|
}
|
2017-06-26 21:12:06 +08:00
|
|
|
}
|
[Polly] [PPCGCodeGeneration] Skip Scops which contain function pointers.
In `PPCGCodeGeneration`, we try to take the references of every `Value`
that is used within a Scop to offload to the kernel. This occurs in
`GPUNodeBuilder::createLaunchParameters`.
This breaks if one of the values is a function pointer, since one of
these cases will trigger:
1. We try to to take the references of an intrinsic function, and this
breaks at `verifyModule`, since it is illegal to take the reference of
an intrinsic.
2. We manage to take the reference to a function, but this fails at
`verifyModule` since the function will not be present in the module that
is created in the kernel.
3. Even if `verifyModule` succeeds (which should not occur), we would
then try to call a *host function* from the *device*, which is
illegal runtime behaviour.
So, we disable this entire range of possibilities by simply not allowing
function references within a `Scop` which corresponds to a kernel.
However, note that this is too conservative. We *can* allow intrinsics
within kernels if the backend can lower the intrinsic correctly. For
example, an intrinsic like `llvm.powi.*` can actually be lowered by the `NVPTX`
backend.
We will now gradually whitelist intrinsics which are known to be safe.
Differential Revision: https://reviews.llvm.org/D33414
llvm-svn: 305185
2017-06-12 19:41:09 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-06-26 21:12:06 +08:00
|
|
|
/// Return whether the Scop S uses functions in a way that we do not support.
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
bool containsInvalidKernelFunction(const Scop &S, bool AllowCUDALibDevice) {
|
[Polly] [PPCGCodeGeneration] Skip Scops which contain function pointers.
In `PPCGCodeGeneration`, we try to take the references of every `Value`
that is used within a Scop to offload to the kernel. This occurs in
`GPUNodeBuilder::createLaunchParameters`.
This breaks if one of the values is a function pointer, since one of
these cases will trigger:
1. We try to to take the references of an intrinsic function, and this
breaks at `verifyModule`, since it is illegal to take the reference of
an intrinsic.
2. We manage to take the reference to a function, but this fails at
`verifyModule` since the function will not be present in the module that
is created in the kernel.
3. Even if `verifyModule` succeeds (which should not occur), we would
then try to call a *host function* from the *device*, which is
illegal runtime behaviour.
So, we disable this entire range of possibilities by simply not allowing
function references within a `Scop` which corresponds to a kernel.
However, note that this is too conservative. We *can* allow intrinsics
within kernels if the backend can lower the intrinsic correctly. For
example, an intrinsic like `llvm.powi.*` can actually be lowered by the `NVPTX`
backend.
We will now gradually whitelist intrinsics which are known to be safe.
Differential Revision: https://reviews.llvm.org/D33414
llvm-svn: 305185
2017-06-12 19:41:09 +08:00
|
|
|
for (auto &Stmt : S) {
|
|
|
|
if (Stmt.isBlockStmt()) {
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
if (containsInvalidKernelFunctionInBlock(Stmt.getBasicBlock(),
|
|
|
|
AllowCUDALibDevice))
|
[Polly] [PPCGCodeGeneration] Skip Scops which contain function pointers.
In `PPCGCodeGeneration`, we try to take the references of every `Value`
that is used within a Scop to offload to the kernel. This occurs in
`GPUNodeBuilder::createLaunchParameters`.
This breaks if one of the values is a function pointer, since one of
these cases will trigger:
1. We try to to take the references of an intrinsic function, and this
breaks at `verifyModule`, since it is illegal to take the reference of
an intrinsic.
2. We manage to take the reference to a function, but this fails at
`verifyModule` since the function will not be present in the module that
is created in the kernel.
3. Even if `verifyModule` succeeds (which should not occur), we would
then try to call a *host function* from the *device*, which is
illegal runtime behaviour.
So, we disable this entire range of possibilities by simply not allowing
function references within a `Scop` which corresponds to a kernel.
However, note that this is too conservative. We *can* allow intrinsics
within kernels if the backend can lower the intrinsic correctly. For
example, an intrinsic like `llvm.powi.*` can actually be lowered by the `NVPTX`
backend.
We will now gradually whitelist intrinsics which are known to be safe.
Differential Revision: https://reviews.llvm.org/D33414
llvm-svn: 305185
2017-06-12 19:41:09 +08:00
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
assert(Stmt.isRegionStmt() &&
|
|
|
|
"Stmt was neither block nor region statement");
|
|
|
|
for (const BasicBlock *BB : Stmt.getRegion()->blocks())
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
if (containsInvalidKernelFunctionInBlock(BB, AllowCUDALibDevice))
|
[Polly] [PPCGCodeGeneration] Skip Scops which contain function pointers.
In `PPCGCodeGeneration`, we try to take the references of every `Value`
that is used within a Scop to offload to the kernel. This occurs in
`GPUNodeBuilder::createLaunchParameters`.
This breaks if one of the values is a function pointer, since one of
these cases will trigger:
1. We try to take the references of an intrinsic function, and this
breaks at `verifyModule`, since it is illegal to take the reference of
an intrinsic.
2. We manage to take the reference to a function, but this fails at
`verifyModule` since the function will not be present in the module that
is created in the kernel.
3. Even if `verifyModule` succeeds (which should not occur), we would
then try to call a *host function* from the *device*, which is
illegal runtime behaviour.
So, we disable this entire range of possibilities by simply not allowing
function references within a `Scop` which corresponds to a kernel.
However, note that this is too conservative. We *can* allow intrinsics
within kernels if the backend can lower the intrinsic correctly. For
example, an intrinsic like `llvm.powi.*` can actually be lowered by the `NVPTX`
backend.
We will now gradually whitelist intrinsics which are known to be safe.
Differential Revision: https://reviews.llvm.org/D33414
llvm-svn: 305185
2017-06-12 19:41:09 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-07-18 19:56:39 +08:00
|
|
|
/// Generate code for a given GPU AST described by @p Root.
|
|
|
|
///
|
2016-07-19 15:32:38 +08:00
|
|
|
/// @param Root An isl_ast_node pointing to the root of the GPU AST.
|
|
|
|
/// @param Prog The GPU Program to generate code for.
|
|
|
|
void generateCode(__isl_take isl_ast_node *Root, gpu_prog *Prog) {
|
2016-07-18 19:56:39 +08:00
|
|
|
ScopAnnotator Annotator;
|
|
|
|
Annotator.buildAliasScopes(*S);
|
|
|
|
|
|
|
|
Region *R = &S->getRegion();
|
|
|
|
|
|
|
|
simplifyRegion(R, DT, LI, RI);
|
|
|
|
|
|
|
|
BasicBlock *EnteringBB = R->getEnteringBlock();
|
|
|
|
|
|
|
|
PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);
|
|
|
|
|
|
|
|
// Only build the run-time condition and parameters _after_ having
|
|
|
|
// introduced the conditional branch. This is important as the conditional
|
|
|
|
// branch will guard the original scop from new induction variables that
|
|
|
|
// the SCEVExpander may introduce while code generating the parameters and
|
|
|
|
// which may introduce scalar dependences that prevent us from correctly
|
|
|
|
// code generating this scop.
|
2017-07-14 18:00:25 +08:00
|
|
|
BBPair StartExitBlocks;
|
|
|
|
BranchInst *CondBr = nullptr;
|
|
|
|
std::tie(StartExitBlocks, CondBr) =
|
2017-04-04 18:01:53 +08:00
|
|
|
executeScopConditionally(*S, Builder.getTrue(), *DT, *RI, *LI);
|
2017-06-26 20:17:11 +08:00
|
|
|
BasicBlock *StartBlock = std::get<0>(StartExitBlocks);
|
2016-07-18 19:56:39 +08:00
|
|
|
|
2017-07-14 18:00:25 +08:00
|
|
|
assert(CondBr && "CondBr not initialized by executeScopConditionally");
|
|
|
|
|
2017-04-04 18:01:53 +08:00
|
|
|
GPUNodeBuilder NodeBuilder(Builder, Annotator, *DL, *LI, *SE, *DT, *S,
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
StartBlock, Prog, Runtime, Architecture);
|
2016-11-03 06:32:23 +08:00
|
|
|
|
2016-07-18 19:56:39 +08:00
|
|
|
// TODO: Handle LICM
|
|
|
|
auto SplitBlock = StartBlock->getSinglePredecessor();
|
|
|
|
Builder.SetInsertPoint(SplitBlock->getTerminator());
|
2017-08-07 03:52:38 +08:00
|
|
|
NodeBuilder.addParameters(S->getContext().release());
|
2016-08-09 01:35:55 +08:00
|
|
|
|
|
|
|
isl_ast_build *Build = isl_ast_build_alloc(S->getIslCtx());
|
2017-05-23 18:12:56 +08:00
|
|
|
isl_ast_expr *Condition = IslAst::buildRunCondition(*S, Build);
|
2016-09-18 14:50:35 +08:00
|
|
|
isl_ast_expr *SufficientCompute = createSufficientComputeCheck(*S, Build);
|
|
|
|
Condition = isl_ast_expr_and(Condition, SufficientCompute);
|
2016-08-09 01:35:55 +08:00
|
|
|
isl_ast_build_free(Build);
|
|
|
|
|
2017-07-20 23:48:36 +08:00
|
|
|
// preload invariant loads. Note: This should happen before the RTC
|
|
|
|
// because the RTC may depend on values that are invariant load hoisted.
|
2017-07-28 22:48:32 +08:00
|
|
|
if (!NodeBuilder.preloadInvariantLoads())
|
|
|
|
report_fatal_error("preloading invariant loads failed in function: " +
|
|
|
|
S->getFunction().getName() +
|
|
|
|
" | Scop Region: " + S->getNameStr());
|
2017-07-20 23:48:36 +08:00
|
|
|
|
2016-08-09 01:35:55 +08:00
|
|
|
Value *RTC = NodeBuilder.createRTC(Condition);
|
|
|
|
Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);
|
|
|
|
|
2016-07-18 19:56:39 +08:00
|
|
|
Builder.SetInsertPoint(&*StartBlock->begin());
|
2016-07-25 17:16:01 +08:00
|
|
|
|
2016-07-18 19:56:39 +08:00
|
|
|
NodeBuilder.create(Root);
|
2016-09-12 14:06:31 +08:00
|
|
|
|
2016-09-18 16:31:09 +08:00
|
|
|
/// In case a sequential kernel has more surrounding loops as any parallel
|
|
|
|
/// kernel, the SCoP is probably mostly sequential. Hence, there is no
|
2017-03-12 16:19:01 +08:00
|
|
|
/// point in running it on a GPU.
|
2016-09-18 16:31:09 +08:00
|
|
|
if (NodeBuilder.DeepestSequential > NodeBuilder.DeepestParallel)
|
2017-07-14 18:00:25 +08:00
|
|
|
CondBr->setOperand(0, Builder.getFalse());
|
2016-09-18 16:31:09 +08:00
|
|
|
|
2016-09-12 14:06:31 +08:00
|
|
|
if (!NodeBuilder.BuildSuccessful)
|
2017-07-14 18:00:25 +08:00
|
|
|
CondBr->setOperand(0, Builder.getFalse());
|
2016-07-18 19:56:39 +08:00
|
|
|
}
|
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
bool runOnScop(Scop &CurrentScop) override {
|
|
|
|
S = &CurrentScop;
|
2016-07-18 19:56:39 +08:00
|
|
|
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
|
|
|
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
|
|
|
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
|
2017-04-11 12:59:13 +08:00
|
|
|
DL = &S->getRegion().getEntry()->getModule()->getDataLayout();
|
2016-07-18 19:56:39 +08:00
|
|
|
RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
|
2016-07-14 18:22:19 +08:00
|
|
|
|
2017-06-26 21:12:06 +08:00
|
|
|
// We currently do not support functions other than intrinsics inside
|
|
|
|
// kernels, as code generation will need to offload function calls to the
|
|
|
|
// kernel. This may lead to a kernel trying to call a function on the host.
|
[Polly] [PPCGCodeGeneration] Skip Scops which contain function pointers.
In `PPCGCodeGeneration`, we try to take the references of every `Value`
that is used within a Scop to offload to the kernel. This occurs in
`GPUNodeBuilder::createLaunchParameters`.
This breaks if one of the values is a function pointer, since one of
these cases will trigger:
1. We try to to take the references of an intrinsic function, and this
breaks at `verifyModule`, since it is illegal to take the reference of
an intrinsic.
2. We manage to take the reference to a function, but this fails at
`verifyModule` since the function will not be present in the module that
is created in the kernel.
3. Even if `verifyModule` succeeds (which should not occur), we would
then try to call a *host function* from the *device*, which is
illegal runtime behaviour.
So, we disable this entire range of possibilities by simply not allowing
function references within a `Scop` which corresponds to a kernel.
However, note that this is too conservative. We *can* allow intrinsics
within kernels if the backend can lower the intrinsic correctly. For
example, an intrinsic like `llvm.powi.*` can actually be lowered by the `NVPTX`
backend.
We will now gradually whitelist intrinsics which are known to be safe.
Differential Revision: https://reviews.llvm.org/D33414
llvm-svn: 305185
2017-06-12 19:41:09 +08:00
|
|
|
// This also allows us to prevent codegen from trying to take the
|
|
|
|
// address of an intrinsic function to send to the kernel.
|
[GPGPU] Add support for NVIDIA libdevice
Summary:
This allows us to map functions such as exp, expf, expl, for which no
LLVM intrinsics exist. Instead, we link to NVIDIA's libdevice which provides
high-performance implementations of a wide range of (math) functions. We
currently link only a small subset, the exp, cos and copysign functions. Other
functions will be enabled as needed.
Reviewers: bollu, singam-sanjay
Reviewed By: bollu
Subscribers: tstellar, tra, nemanjai, pollydev, mgorny, llvm-commits, kbarton
Tags: #polly
Differential Revision: https://reviews.llvm.org/D35703
llvm-svn: 309560
2017-07-31 22:03:16 +08:00
|
|
|
if (containsInvalidKernelFunction(CurrentScop,
|
|
|
|
Architecture == GPUArch::NVPTX64)) {
|
2017-06-26 21:12:06 +08:00
|
|
|
DEBUG(
|
2017-08-05 03:36:40 +08:00
|
|
|
dbgs() << getUniqueScopName(S)
|
|
|
|
<< " contains function which cannot be materialised in a GPU "
|
|
|
|
"kernel. Bailing out.\n";);
|
[Polly] [PPCGCodeGeneration] Skip Scops which contain function pointers.
In `PPCGCodeGeneration`, we try to take the references of every `Value`
that is used within a Scop to offload to the kernel. This occurs in
`GPUNodeBuilder::createLaunchParameters`.
This breaks if one of the values is a function pointer, since one of
these cases will trigger:
1. We try to to take the references of an intrinsic function, and this
breaks at `verifyModule`, since it is illegal to take the reference of
an intrinsic.
2. We manage to take the reference to a function, but this fails at
`verifyModule` since the function will not be present in the module that
is created in the kernel.
3. Even if `verifyModule` succeeds (which should not occur), we would
then try to call a *host function* from the *device*, which is
illegal runtime behaviour.
So, we disable this entire range of possibilities by simply not allowing
function references within a `Scop` which corresponds to a kernel.
However, note that this is too conservative. We *can* allow intrinsics
within kernels if the backend can lower the intrinsic correctly. For
example, an intrinsic like `llvm.powi.*` can actually be lowered by the `NVPTX`
backend.
We will now gradually whitelist intrinsics which are known to be safe.
Differential Revision: https://reviews.llvm.org/D33414
llvm-svn: 305185
2017-06-12 19:41:09 +08:00
|
|
|
return false;
|
2017-06-26 21:12:06 +08:00
|
|
|
}
|
[Polly] [PPCGCodeGeneration] Skip Scops which contain function pointers.
In `PPCGCodeGeneration`, we try to take the references of every `Value`
that is used within a Scop to offload to the kernel. This occurs in
`GPUNodeBuilder::createLaunchParameters`.
This breaks if one of the values is a function pointer, since one of
these cases will trigger:
1. We try to to take the references of an intrinsic function, and this
breaks at `verifyModule`, since it is illegal to take the reference of
an intrinsic.
2. We manage to take the reference to a function, but this fails at
`verifyModule` since the function will not be present in the module that
is created in the kernel.
3. Even if `verifyModule` succeeds (which should not occur), we would
then try to call a *host function* from the *device*, which is
illegal runtime behaviour.
So, we disable this entire range of possibilities by simply not allowing
function references within a `Scop` which corresponds to a kernel.
However, note that this is too conservative. We *can* allow intrinsics
within kernels if the backend can lower the intrinsic correctly. For
example, an intrinsic like `llvm.powi.*` can actually be lowered by the `NVPTX`
backend.
We will now gradually whitelist intrinsics which are known to be safe.
Differential Revision: https://reviews.llvm.org/D33414
llvm-svn: 305185
2017-06-12 19:41:09 +08:00
|
|
|
|
2016-07-14 18:22:19 +08:00
|
|
|
auto PPCGScop = createPPCGScop();
|
|
|
|
auto PPCGProg = createPPCGProg(PPCGScop);
|
2016-07-14 18:22:25 +08:00
|
|
|
auto PPCGGen = generateGPU(PPCGScop, PPCGProg);
|
2016-07-18 19:56:39 +08:00
|
|
|
|
2017-07-01 03:42:21 +08:00
|
|
|
if (PPCGGen->tree) {
|
2016-07-19 15:32:38 +08:00
|
|
|
generateCode(isl_ast_node_copy(PPCGGen->tree), PPCGProg);
|
2017-07-01 03:42:21 +08:00
|
|
|
CurrentScop.markAsToBeSkipped();
|
2017-08-05 03:36:40 +08:00
|
|
|
} else {
|
|
|
|
DEBUG(dbgs() << getUniqueScopName(S)
|
|
|
|
<< " has empty PPCGGen->tree. Bailing out.\n");
|
2017-07-01 03:42:21 +08:00
|
|
|
}
|
2016-07-18 19:56:39 +08:00
|
|
|
|
2016-07-15 18:32:22 +08:00
|
|
|
freeOptions(PPCGScop);
|
2016-07-14 18:22:25 +08:00
|
|
|
freePPCGGen(PPCGGen);
|
2016-07-14 18:22:19 +08:00
|
|
|
gpu_prog_free(PPCGProg);
|
|
|
|
ppcg_scop_free(PPCGScop);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
2016-07-13 23:54:58 +08:00
|
|
|
|
|
|
|
/// No-op override: this pass emits no output when asked to print a scop.
void printScop(raw_ostream & /*OS*/, Scop & /*S*/) const override {
}
|
|
|
|
|
|
|
|
/// Declare the analyses this pass requires and the ones it preserves.
///
/// @param AU The analysis-usage set to fill in.
void getAnalysisUsage(AnalysisUsage &AU) const override {
  // Analyses that must be available when the pass runs.
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<RegionInfoPass>();
  AU.addRequired<ScalarEvolutionWrapperPass>();
  AU.addRequired<ScopDetectionWrapperPass>();
  AU.addRequired<ScopInfoRegionPass>();
  AU.addRequired<LoopInfoWrapperPass>();

  // Analyses declared as preserved by this pass.
  AU.addPreserved<AAResultsWrapperPass>();
  AU.addPreserved<BasicAAWrapperPass>();
  AU.addPreserved<LoopInfoWrapperPass>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addPreserved<GlobalsAAWrapperPass>();
  AU.addPreserved<ScopDetectionWrapperPass>();
  AU.addPreserved<ScalarEvolutionWrapperPass>();
  AU.addPreserved<SCEVAAWrapperPass>();

  // FIXME: We do not yet add regions for the newly generated code to the
  //        region tree.
  AU.addPreserved<RegionInfoPass>();
  AU.addPreserved<ScopInfoRegionPass>();
}
|
|
|
|
};
|
2017-03-01 23:54:27 +08:00
|
|
|
} // namespace
|
2016-07-13 23:54:58 +08:00
|
|
|
|
|
|
|
// Out-of-class definition of the pass's static ID member.
char PPCGCodeGeneration::ID = 1;
|
|
|
|
|
[Polly] Added OpenCL Runtime to GPURuntime Library for GPGPU CodeGen
Summary:
When compiling for GPU, one can now choose to compile for OpenCL or CUDA,
with the corresponding polly-gpu-runtime flag (libopencl / libcudart). The
GPURuntime library (GPUJIT) has been extended with the OpenCL Runtime library
for that purpose, correctly choosing the corresponding library calls to the
option chosen when compiling (via different initialization calls).
Additionally, a specific GPU Target architecture can now be chosen with -polly-gpu-arch (only nvptx64 implemented thus far).
Reviewers: grosser, bollu, Meinersbur, etherzhhb, singam-sanjay
Reviewed By: grosser, Meinersbur
Subscribers: singam-sanjay, llvm-commits, pollydev, nemanjai, mgorny, yaxunl, Anastasia
Tags: #polly
Differential Revision: https://reviews.llvm.org/D32431
llvm-svn: 302379
2017-05-08 05:03:46 +08:00
|
|
|
/// Create a PPCGCodeGeneration pass configured for the given target.
///
/// @param Arch    Value stored in the new pass's Architecture member.
/// @param Runtime Value stored in the new pass's Runtime member.
/// @returns The newly allocated pass.
Pass *polly::createPPCGCodeGenerationPass(GPUArch Arch, GPURuntime Runtime) {
  auto *CodeGen = new PPCGCodeGeneration();
  CodeGen->Architecture = Arch;
  CodeGen->Runtime = Runtime;
  return CodeGen;
}
|
2016-07-13 23:54:58 +08:00
|
|
|
|
|
|
|
// Pass initialization boilerplate for PPCGCodeGeneration (argument string
// "polly-codegen-ppcg"), listing the passes it declares a dependency on.
INITIALIZE_PASS_BEGIN(PPCGCodeGeneration, "polly-codegen-ppcg",
                      "Polly - Apply PPCG translation to SCOP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
INITIALIZE_PASS_END(PPCGCodeGeneration, "polly-codegen-ppcg",
                    "Polly - Apply PPCG translation to SCOP", false, false)
|