forked from OSchip/llvm-project
Add preliminary implementation for GPGPU code generation.
Translate the selected parallel loop body into a PTX string and run it with the CUDA driver API. We limit this preliminary implementation to the following special test cases: - Support only 2-dimensional parallel loops, optionally containing exactly one innermost non-parallel loop. - Support write memory accesses to only one array in a SCoP. Contributed by: Yabin Hu <yabin.hwu@gmail.com> llvm-svn: 160164
This commit is contained in:
parent
a9c373e49d
commit
b299d28181
|
@ -0,0 +1,193 @@
|
||||||
|
//===- PTXGenerator.h - IR helper to create GPGPU LLVM-IR -------*- C++ -*-===//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// This file contains functions to create GPGPU parallel loops as LLVM-IR.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
#ifndef POLLY_CODEGEN_PTXGENERATOR_H
|
||||||
|
#define POLLY_CODEGEN_PTXGENERATOR_H
|
||||||
|
|
||||||
|
#include "llvm/IRBuilder.h"
|
||||||
|
#include "llvm/ADT/SetVector.h"
|
||||||
|
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
class Value;
|
||||||
|
class Pass;
|
||||||
|
class BasicBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace polly {
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
class PTXGenerator {
|
||||||
|
public:
|
||||||
|
typedef std::map<Value*, Value*> ValueToValueMapTy;
|
||||||
|
|
||||||
|
PTXGenerator(IRBuilder<> &Builder, Pass *P, const std::string &Triple);
|
||||||
|
|
||||||
|
/// @brief Create a GPGPU parallel loop.
|
||||||
|
///
|
||||||
|
/// @param UsedValues A set of LLVM-IR Values that should be available to
|
||||||
|
/// the new loop body.
|
||||||
|
/// @param OriginalIVS The new values of the original induction variables.
|
||||||
|
/// @param VMap This map is filled by createParallelLoop(). It
|
||||||
|
/// maps the values in UsedValues to Values through which
|
||||||
|
/// their content is available within the loop body.
|
||||||
|
/// @param LoopBody A pointer to an iterator that is set to point to the
|
||||||
|
/// body of the created loop. It should be used to insert
|
||||||
|
/// instructions that form the actual loop body.
|
||||||
|
void startGeneration(SetVector<Value*> &UsedValues,
|
||||||
|
SetVector<Value*> &OriginalIVS, ValueToValueMapTy &VMap,
|
||||||
|
BasicBlock::iterator *LoopBody);
|
||||||
|
|
||||||
|
/// @brief Execute the post-operations to build a GPGPU parallel loop.
|
||||||
|
///
|
||||||
|
void finishGeneration(Function *SubFunction);
|
||||||
|
|
||||||
|
/// @brief Set the parameters for launching PTX kernel.
|
||||||
|
///
|
||||||
|
/// @param GridW A value of the width of a GPU grid.
|
||||||
|
/// @param GridH A value of the height of a GPU grid.
|
||||||
|
/// @param BlockW A value of the width of a GPU block.
|
||||||
|
/// @param BlockH A value of the height of a GPU block.
|
||||||
|
void setLaunchingParameters(int GridW, int GridH, int BlockW, int BlockH) {
|
||||||
|
GridWidth = GridW;
|
||||||
|
GridHeight = GridH;
|
||||||
|
BlockWidth = BlockW;
|
||||||
|
BlockHeight = BlockH;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// @brief Set the size of the output array.
|
||||||
|
///
|
||||||
|
/// This size is used to allocate memory on the device and the host.
|
||||||
|
///
|
||||||
|
/// @param Bytes Output array size in bytes.
|
||||||
|
void setOutputBytes(unsigned Bytes) {
|
||||||
|
OutputBytes = Bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
IRBuilder<> &Builder;
|
||||||
|
Pass *P;
|
||||||
|
|
||||||
|
/// @brief The target triple of the device.
|
||||||
|
const std::string &GPUTriple;
|
||||||
|
|
||||||
|
/// @brief Parameters used for launching PTX kernel.
|
||||||
|
int GridWidth, GridHeight, BlockWidth, BlockHeight;
|
||||||
|
|
||||||
|
/// @brief Size of the output array in bytes.
|
||||||
|
unsigned OutputBytes;
|
||||||
|
|
||||||
|
/// @brief Polly's GPU data types.
|
||||||
|
StructType *ContextTy, *ModuleTy, *KernelTy, *DeviceTy, *DevDataTy, *EventTy;
|
||||||
|
|
||||||
|
void InitializeGPUDataTypes();
|
||||||
|
IntegerType *getInt64Type(); // i64
|
||||||
|
PointerType *getI8PtrType(); // char *
|
||||||
|
PointerType *getPtrI8PtrType(); // char **
|
||||||
|
PointerType *getFloatPtrType(); // float *
|
||||||
|
PointerType *getGPUContextPtrType(); // %struct.PollyGPUContextT *
|
||||||
|
PointerType *getGPUModulePtrType(); // %struct.PollyGPUModuleT *
|
||||||
|
PointerType *getGPUDevicePtrType(); // %struct.PollyGPUDeviceT *
|
||||||
|
PointerType *getPtrGPUDevicePtrType(); // %struct.PollyGPUDevicePtrT *
|
||||||
|
PointerType *getGPUFunctionPtrType(); // %struct.PollyGPUFunctionT *
|
||||||
|
PointerType *getGPUEventPtrType(); // %struct.PollyGPUEventT *
|
||||||
|
|
||||||
|
Module *getModule();
|
||||||
|
|
||||||
|
/// @brief Create the kernel string containing LLVM IR.
|
||||||
|
///
|
||||||
|
/// @param SubFunction A pointer to the device code function.
|
||||||
|
/// @return A global string variable containing the LLVM IR codes
|
||||||
|
// of the SubFunction.
|
||||||
|
Value *createPTXKernelFunction(Function *SubFunction);
|
||||||
|
|
||||||
|
/// @brief Get the entry name of the device kernel function.
|
||||||
|
///
|
||||||
|
/// @param SubFunction A pointer to the device code function.
|
||||||
|
/// @return A global string variable containing the entry name of
|
||||||
|
/// the SubFunction.
|
||||||
|
Value *getPTXKernelEntryName(Function *SubFunction);
|
||||||
|
|
||||||
|
void createCallInitDevice(Value *Context, Value *Device);
|
||||||
|
void createCallGetPTXModule(Value *Buffer, Value *Module);
|
||||||
|
void createCallGetPTXKernelEntry(Value *Entry, Value *Module,
|
||||||
|
Value *Kernel);
|
||||||
|
void createCallAllocateMemoryForHostAndDevice(Value *HostData,
|
||||||
|
Value *DeviceData,
|
||||||
|
Value *Size);
|
||||||
|
void createCallCopyFromHostToDevice(Value *DeviceData, Value *HostData,
|
||||||
|
Value *Size);
|
||||||
|
void createCallCopyFromDeviceToHost(Value *HostData, Value *DeviceData,
|
||||||
|
Value *Size);
|
||||||
|
void createCallSetKernelParameters(Value *Kernel, Value *BlockWidth,
|
||||||
|
Value *BlockHeight, Value *DeviceData);
|
||||||
|
void createCallLaunchKernel(Value *Kernel, Value *GridWidth,
|
||||||
|
Value *GridHeight);
|
||||||
|
void createCallStartTimerByCudaEvent(Value *StartEvent,
|
||||||
|
Value *StopEvent);
|
||||||
|
void createCallStopTimerByCudaEvent(Value *StartEvent, Value *StopEvent,
|
||||||
|
Value *Timer);
|
||||||
|
void createCallCleanupGPGPUResources(Value *HostData, Value *DeviceData,
|
||||||
|
Value *Module, Value *Context,
|
||||||
|
Value *Kernel);
|
||||||
|
|
||||||
|
/// @brief Create the CUDA subfunction.
|
||||||
|
///
|
||||||
|
/// @param UsedValues A set of LLVM-IR Values that should be available to
|
||||||
|
/// the new loop body.
|
||||||
|
/// @param VMap This map that is filled by createSubfunction(). It
|
||||||
|
/// maps the values in UsedValues to Values through which
|
||||||
|
/// their content is available within the loop body.
|
||||||
|
/// @param OriginalIVS The new values of the original induction variables.
|
||||||
|
/// @param SubFunction The newly created SubFunction is returned here.
|
||||||
|
void createSubfunction(SetVector<Value*> &UsedValues,
|
||||||
|
SetVector<Value*> &OriginalIVS,
|
||||||
|
ValueToValueMapTy &VMap,
|
||||||
|
Function **SubFunction);
|
||||||
|
|
||||||
|
/// @brief Create the definition of the CUDA subfunction.
|
||||||
|
///
|
||||||
|
/// @param NumArgs The number of parameters of this subfunction. This is
|
||||||
|
/// usually set to the number of memory accesses which
|
||||||
|
/// will be copied from host to device.
|
||||||
|
Function *createSubfunctionDefinition(int NumArgs);
|
||||||
|
|
||||||
|
/// @brief Extract all the ptx related subfunctions into a new module.
|
||||||
|
///
|
||||||
|
/// @param M Current module.
|
||||||
|
/// @return The generated module containing only gpu related
|
||||||
|
/// subfunctions.
|
||||||
|
Module *extractPTXFunctionsFromModule(const Module *M);
|
||||||
|
|
||||||
|
/// @brief Get the Value of CUDA block width.
|
||||||
|
Value *getCUDABlockWidth();
|
||||||
|
|
||||||
|
/// @brief Get the Value of CUDA block height.
|
||||||
|
Value *getCUDABlockHeight();
|
||||||
|
|
||||||
|
/// @brief Get the Value of CUDA Gird width.
|
||||||
|
Value *getCUDAGridWidth();
|
||||||
|
|
||||||
|
/// @brief Get the Value of CUDA grid height.
|
||||||
|
Value *getCUDAGridHeight();
|
||||||
|
|
||||||
|
/// @brief Get the Value of the bytes of the output array.
|
||||||
|
Value *getOutputArraySizeInBytes();
|
||||||
|
|
||||||
|
/// @brief Erase the ptx-related subfunctions and declarations.
|
||||||
|
///
|
||||||
|
/// @param SubFunction A pointer to the device code function.
|
||||||
|
void eraseUnusedFunctions(Function *SubFunction);
|
||||||
|
};
|
||||||
|
} // end namespace polly
|
||||||
|
#endif /* POLLY_CODEGEN_PTXGENERATOR_H */
|
|
@ -125,6 +125,9 @@ public:
|
||||||
/// @brief Is this a read memory access?
|
/// @brief Is this a read memory access?
|
||||||
bool isRead() const { return Type == MemoryAccess::Read; }
|
bool isRead() const { return Type == MemoryAccess::Read; }
|
||||||
|
|
||||||
|
/// @brief Is this a write memory access?
|
||||||
|
bool isWrite() const { return Type == MemoryAccess::Write; }
|
||||||
|
|
||||||
isl_map *getAccessRelation() const;
|
isl_map *getAccessRelation() const;
|
||||||
|
|
||||||
/// @brief Get an isl string representing this access function.
|
/// @brief Get an isl string representing this access function.
|
||||||
|
|
|
@ -15,4 +15,5 @@ add_polly_library(PollyCodeGen
|
||||||
${ISL_CODEGEN_FILES}
|
${ISL_CODEGEN_FILES}
|
||||||
LoopGenerators.cpp
|
LoopGenerators.cpp
|
||||||
Utils.cpp
|
Utils.cpp
|
||||||
|
PTXGenerator.cpp
|
||||||
)
|
)
|
||||||
|
|
|
@ -31,6 +31,7 @@
|
||||||
#include "polly/CodeGen/CodeGeneration.h"
|
#include "polly/CodeGen/CodeGeneration.h"
|
||||||
#include "polly/CodeGen/BlockGenerators.h"
|
#include "polly/CodeGen/BlockGenerators.h"
|
||||||
#include "polly/CodeGen/LoopGenerators.h"
|
#include "polly/CodeGen/LoopGenerators.h"
|
||||||
|
#include "polly/CodeGen/PTXGenerator.h"
|
||||||
#include "polly/CodeGen/Utils.h"
|
#include "polly/CodeGen/Utils.h"
|
||||||
#include "polly/Support/GICHelper.h"
|
#include "polly/Support/GICHelper.h"
|
||||||
|
|
||||||
|
@ -65,6 +66,17 @@ OpenMP("enable-polly-openmp",
|
||||||
cl::value_desc("OpenMP code generation enabled if true"),
|
cl::value_desc("OpenMP code generation enabled if true"),
|
||||||
cl::init(false), cl::ZeroOrMore);
|
cl::init(false), cl::ZeroOrMore);
|
||||||
|
|
||||||
|
// Hidden switch that enables the GPGPU code generation path (off by default).
static cl::opt<bool> GPGPU(
    "enable-polly-gpgpu",
    cl::desc("Generate GPU parallel code"),
    cl::Hidden,
    cl::value_desc("GPGPU code generation enabled if true"),
    cl::init(false),
    cl::ZeroOrMore);

// Hidden option carrying the target triple used for the device code. It
// defaults to the empty string.
static cl::opt<std::string> GPUTriple(
    "polly-gpgpu-triple",
    cl::desc("Target triple for GPU code generation"),
    cl::Hidden,
    cl::init(""));
|
||||||
|
|
||||||
static cl::opt<bool>
|
static cl::opt<bool>
|
||||||
AtLeastOnce("enable-polly-atLeastOnce",
|
AtLeastOnce("enable-polly-atLeastOnce",
|
||||||
cl::desc("Give polly the hint, that every loop is executed at least"
|
cl::desc("Give polly the hint, that every loop is executed at least"
|
||||||
|
@ -284,6 +296,25 @@ private:
|
||||||
/// statement.
|
/// statement.
|
||||||
void codegenForOpenMP(const clast_for *f);
|
void codegenForOpenMP(const clast_for *f);
|
||||||
|
|
||||||
|
/// @brief Create GPGPU device memory access values.
|
||||||
|
///
|
||||||
|
/// Create a list of values that will be set to be parameters of the GPGPU
|
||||||
|
/// subfunction. These parameters represent device memory base addresses
|
||||||
|
/// and the size in bytes.
|
||||||
|
SetVector<Value*> getGPUValues(unsigned &OutputBytes);
|
||||||
|
|
||||||
|
/// @brief Create a GPU parallel for loop.
|
||||||
|
///
|
||||||
|
/// This loop reflects a loop as if it would have been created by a GPU
|
||||||
|
/// statement.
|
||||||
|
void codegenForGPGPU(const clast_for *F);
|
||||||
|
|
||||||
|
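/// @brief Collect the schedule information of a tiled loop nest.
///
/// Walks the loop nest rooted at F. The iteration count of each parallel
/// loop found before the first non-parallel loop is appended to NumIters,
/// LoopDepth receives the number of loops visited and NonPLoopDepth the
/// difference between LoopDepth and the size of NumIters.
///
/// @return The body of the last parallel loop recorded in NumIters.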
|
||||||
|
const clast_stmt *getScheduleInfo(const clast_for *F,
|
||||||
|
std::vector<int> &NumIters,
|
||||||
|
unsigned &LoopDepth,
|
||||||
|
unsigned &NonPLoopDepth);
|
||||||
|
|
||||||
/// @brief Check if a loop is parallel
|
/// @brief Check if a loop is parallel
|
||||||
///
|
///
|
||||||
/// Detect if a clast_for loop can be executed in parallel.
|
/// Detect if a clast_for loop can be executed in parallel.
|
||||||
|
@ -530,6 +561,161 @@ void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) {
|
||||||
Builder.SetInsertPoint(AfterLoop);
|
Builder.SetInsertPoint(AfterLoop);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static unsigned getArraySizeInBytes(const ArrayType *AT) {
|
||||||
|
unsigned Bytes = AT->getNumElements();
|
||||||
|
if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
|
||||||
|
Bytes *= getArraySizeInBytes(T);
|
||||||
|
else
|
||||||
|
Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;
|
||||||
|
|
||||||
|
return Bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// @brief Collect the base addresses of all memory accesses in the SCoP.
///
/// @param OutputBytes Set to the size in bytes of the array that is written
///                    in the SCoP, or to 0 if no write access is found.
/// @return            The set of base addresses of all memory accesses.
SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
  SetVector<Value*> Values;
  OutputBytes = 0;

  // FIXME: we assume that there is one and only one array to be written
  //        in a SCoP.
  // The written array is tracked across all statements and accesses. A
  // counter declared inside the loop would be reset for every access and
  // could never detect a second written array.
  Value *WrittenArray = 0;

  // Record the memory reference base addresses.
  for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
    ScopStmt *Stmt = *SI;
    for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
         E = Stmt->memacc_end(); I != E; ++I) {
      Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
      Values.insert(BaseAddr);

      if (!(*I)->isWrite())
        continue;

      // Several write accesses to the same array are fine, a second array
      // is not.
      assert((!WrittenArray || WrittenArray == BaseAddr) &&
             "We support at most one array to be written in a SCoP.");
      WrittenArray = BaseAddr;

      if (const PointerType * PT =
          dyn_cast<PointerType>(BaseAddr->getType())) {
        Type *T = PT->getArrayElementType();
        const ArrayType *ATy = dyn_cast<ArrayType>(T);
        assert(ATy && "The written base address should point to an array.");
        // Guard the release build as well: dyn_cast yields null for a
        // pointer to a non-array type.
        if (ATy)
          OutputBytes = getArraySizeInBytes(ATy);
      }
    }
  }

  return Values;
}
|
||||||
|
|
||||||
|
/// @brief Collect the schedule information of a tiled loop nest.
///
/// @param F             The outermost loop of the nest.
/// @param NumIters      Receives the iteration count of every parallel loop
///                      found before the first non-parallel loop. Entries are
///                      appended; the vector is not cleared first.
/// @param LoopDepth     Set to the number of loops visited.
/// @param NonPLoopDepth Set to LoopDepth minus the size of NumIters.
/// @return              The body of the last parallel loop recorded in
///                      NumIters.
const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
                                                    std::vector<int> &NumIters,
                                                    unsigned &LoopDepth,
                                                    unsigned &NonPLoopDepth) {
  clast_stmt *Stmt = (clast_stmt *)F;
  // Start from a null pointer so that a nest without a leading parallel loop
  // is detected instead of dereferencing an indeterminate pointer.
  const clast_for *Result = 0;
  bool NonParaFlag = false;
  LoopDepth = 0;
  NonPLoopDepth = 0;

  while (Stmt) {
    if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
      const clast_for *T = (clast_for *) Stmt;
      if (isParallelFor(T)) {
        // Only parallel loops that precede the first non-parallel loop are
        // recorded.
        if (!NonParaFlag) {
          NumIters.push_back(getNumberOfIterations(T));
          Result = T;
        }
      } else
        NonParaFlag = true;

      // Descend into the loop body.
      Stmt = T->body;
      LoopDepth++;
      continue;
    }
    Stmt = Stmt->next;
  }

  assert(NumIters.size() == 4 &&
         "The loops should be tiled into 4-depth parallel loops and an "
         "innermost non-parallel one (if exist).");
  assert(Result && "No parallel loop found in the loop nest.");
  NonPLoopDepth = LoopDepth - NumIters.size();
  assert(NonPLoopDepth <= 1
         && "We support only one innermost non-parallel loop currently.");
  return (const clast_stmt *)Result->body;
}
|
||||||
|
|
||||||
|
void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
|
||||||
|
BasicBlock::iterator LoopBody;
|
||||||
|
SetVector<Value *> Values;
|
||||||
|
SetVector<Value *> IVS;
|
||||||
|
std::vector<int> NumIterations;
|
||||||
|
PTXGenerator::ValueToValueMapTy VMap;
|
||||||
|
|
||||||
|
assert(!GPUTriple.empty()
|
||||||
|
&& "Target triple should be set properly for GPGPU code generation.");
|
||||||
|
PTXGenerator PTXGen(Builder, P, GPUTriple);
|
||||||
|
|
||||||
|
// Get original IVS and ScopStmt
|
||||||
|
unsigned TiledLoopDepth, NonPLoopDepth;
|
||||||
|
const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations,
|
||||||
|
TiledLoopDepth, NonPLoopDepth);
|
||||||
|
const clast_stmt *TmpStmt;
|
||||||
|
const clast_user_stmt *U;
|
||||||
|
const clast_for *InnerFor;
|
||||||
|
if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
|
||||||
|
InnerFor = (const clast_for *)InnerStmt;
|
||||||
|
TmpStmt = InnerFor->body;
|
||||||
|
} else
|
||||||
|
TmpStmt = InnerStmt;
|
||||||
|
U = (const clast_user_stmt *) TmpStmt;
|
||||||
|
ScopStmt *Statement = (ScopStmt *) U->statement->usr;
|
||||||
|
for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
|
||||||
|
const Value* IV = Statement->getInductionVariableForDimension(i);
|
||||||
|
IVS.insert(const_cast<Value *>(IV));
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned OutBytes;
|
||||||
|
Values = getGPUValues(OutBytes);
|
||||||
|
PTXGen.setOutputBytes(OutBytes);
|
||||||
|
PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);
|
||||||
|
|
||||||
|
BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
|
||||||
|
Builder.SetInsertPoint(LoopBody);
|
||||||
|
|
||||||
|
BasicBlock *AfterBB = 0;
|
||||||
|
if (NonPLoopDepth) {
|
||||||
|
Value *LowerBound, *UpperBound, *IV, *Stride;
|
||||||
|
Type *IntPtrTy = getIntPtrTy();
|
||||||
|
LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
|
||||||
|
UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
|
||||||
|
Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
|
||||||
|
IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB);
|
||||||
|
const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
|
||||||
|
Value *OldIV = const_cast<Value *>(OldIV_);
|
||||||
|
VMap.insert(std::make_pair<Value*, Value*>(OldIV, IV));
|
||||||
|
}
|
||||||
|
|
||||||
|
updateWithValueMap(VMap, /* reverse */ false);
|
||||||
|
BlockGenerator::generate(Builder, *Statement, ValueMap, P);
|
||||||
|
updateWithValueMap(VMap, /* reverse */ true);
|
||||||
|
|
||||||
|
if (AfterBB)
|
||||||
|
Builder.SetInsertPoint(AfterBB->begin());
|
||||||
|
|
||||||
|
// FIXME: The replacement of the host base address with the parameter of ptx
|
||||||
|
// subfunction should have been done by updateWithValueMap. We use the
|
||||||
|
// following codes to avoid affecting other parts of Polly. This should be
|
||||||
|
// fixed later.
|
||||||
|
Function *FN = Builder.GetInsertBlock()->getParent();
|
||||||
|
for (unsigned j = 0; j < Values.size(); j++) {
|
||||||
|
Value *baseAddr = Values[j];
|
||||||
|
for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
|
||||||
|
for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
|
||||||
|
I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Builder.SetInsertPoint(AfterLoop);
|
||||||
|
PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
|
||||||
|
NumIterations[2], NumIterations[3]);
|
||||||
|
PTXGen.finishGeneration(FN);
|
||||||
|
}
|
||||||
|
|
||||||
bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
|
bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
|
||||||
const clast_stmt *stmt = f->body;
|
const clast_stmt *stmt = f->body;
|
||||||
|
|
||||||
|
@ -647,6 +833,16 @@ void ClastStmtCodeGen::codegen(const clast_for *f) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (GPGPU && isParallelFor(f)) {
|
||||||
|
if (!parallelCodeGeneration) {
|
||||||
|
parallelCodeGeneration = true;
|
||||||
|
parallelLoops.push_back(f->iterator);
|
||||||
|
codegenForGPGPU(f);
|
||||||
|
parallelCodeGeneration = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
codegenForSequential(f);
|
codegenForSequential(f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,652 @@
|
||||||
|
//===------ PTXGenerator.cpp - IR helper to create loops -----------------===//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// This file contains functions to create GPU parallel codes as LLVM-IR.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "polly/CodeGen/PTXGenerator.h"
|
||||||
|
#include "polly/ScopDetection.h"
|
||||||
|
#include "polly/ScopInfo.h"
|
||||||
|
|
||||||
|
#include "llvm/Intrinsics.h"
|
||||||
|
#include "llvm/Module.h"
|
||||||
|
#include "llvm/PassManager.h"
|
||||||
|
#include "llvm/ADT/SetVector.h"
|
||||||
|
#include "llvm/Analysis/Dominators.h"
|
||||||
|
#include "llvm/Support/Debug.h"
|
||||||
|
#include "llvm/Support/FormattedStream.h"
|
||||||
|
#include "llvm/Support/TargetRegistry.h"
|
||||||
|
#include "llvm/Target/TargetData.h"
|
||||||
|
#include "llvm/Target/TargetMachine.h"
|
||||||
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||||
|
#include "llvm/Transforms/Utils/Cloning.h"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
using namespace polly;
|
||||||
|
|
||||||
|
/// @brief Create a PTX generator.
///
/// All launch parameters start at 1 and the output size at 0; the GPU struct
/// types are set up through InitializeGPUDataTypes().
PTXGenerator::PTXGenerator(IRBuilder<> &Builder, Pass *P,
                           const std::string &Triple)
  : Builder(Builder),
    P(P),
    GPUTriple(Triple),
    GridWidth(1),
    GridHeight(1),
    BlockWidth(1),
    BlockHeight(1),
    OutputBytes(0) {
  InitializeGPUDataTypes();
}
|
||||||
|
|
||||||
|
/// @brief Get the module that contains the builder's current insert block.
Module *PTXGenerator::getModule() {
  BasicBlock *CurrentBB = Builder.GetInsertBlock();
  Function *CurrentFn = CurrentBB->getParent();
  return CurrentFn->getParent();
}
|
||||||
|
|
||||||
|
/// @brief Create the (still empty) PTX kernel function.
///
/// The kernel is named after the function that contains the current insert
/// block, with the suffix "_ptx_subfn". It returns void, takes NumArgs i8
/// pointers named "ptx.Array", has internal linkage and uses the PTX kernel
/// calling convention.
///
/// @param NumArgs The number of parameters; only 1 is supported right now.
Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) {
  assert(NumArgs == 1 && "we support only one array access now.");

  Module *M = getModule();
  Function *HostFn = Builder.GetInsertBlock()->getParent();

  // Every kernel parameter is passed as an i8 pointer.
  std::vector<Type*> ParamTys;
  for (int Remaining = NumArgs; Remaining > 0; --Remaining)
    ParamTys.push_back(Builder.getInt8PtrTy());

  FunctionType *KernelTy =
    FunctionType::get(Builder.getVoidTy(), ParamTys, false);
  Function *Kernel = Function::Create(KernelTy, Function::InternalLinkage,
                                      HostFn->getName() + "_ptx_subfn", M);
  Kernel->setCallingConv(CallingConv::PTX_Kernel);

  // Do not run any optimization pass on the new function.
  P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(Kernel);

  for (Function::arg_iterator Arg = Kernel->arg_begin(),
       ArgEnd = Kernel->arg_end(); Arg != ArgEnd; ++Arg)
    Arg->setName("ptx.Array");

  return Kernel;
}
|
||||||
|
|
||||||
|
/// @brief Create the PTX subfunction and prepare it for the loop body.
///
/// The subfunction consists of a setup block that reads the grid, block and
/// thread IDs, a body block and an exit block. On return the builder's insert
/// point is located in the body block, in front of the branch to the exit
/// block.
///
/// @param UsedValues  Host base addresses; each one is mapped in VMap to the
///                    matching kernel argument, cast to the host type.
/// @param OriginalIVS The original induction variables (1 to 4 of them); each
///                    one is mapped in VMap to a value derived from the block
///                    and thread IDs.
/// @param VMap        Receives the mappings described above.
/// @param SubFunction The newly created subfunction is returned here.
void PTXGenerator::createSubfunction(SetVector<Value*> &UsedValues,
                                     SetVector<Value*> &OriginalIVS,
                                     PTXGenerator::ValueToValueMapTy &VMap,
                                     Function **SubFunction) {
  Function *FN = createSubfunctionDefinition(UsedValues.size());
  // Publish the result right away so that the out-parameter is also set on
  // the early return for an unsupported loop depth.
  *SubFunction = FN;

  Module *M = getModule();
  LLVMContext &Context = FN->getContext();
  IntegerType *Ty = Builder.getInt64Ty();

  // Store the previous basic block.
  BasicBlock *PrevBB = Builder.GetInsertBlock();

  // Create basic blocks.
  BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN);
  BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN);
  BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN);

  DominatorTree &DT = P->getAnalysis<DominatorTree>();
  DT.addNewBlock(HeaderBB, PrevBB);
  DT.addNewBlock(ExitBB, HeaderBB);
  DT.addNewBlock(BodyBB, HeaderBB);

  Builder.SetInsertPoint(HeaderBB);

  // Insert VMap items with maps of array base address on the host to base
  // address on the device.
  Function::arg_iterator AI = FN->arg_begin();
  for (unsigned j = 0; j < UsedValues.size(); j++) {
    Value *BaseAddr = UsedValues[j];
    Type *ArrayTy = BaseAddr->getType();
    Value *Param = Builder.CreateBitCast(AI, ArrayTy);
    // Let make_pair deduce its types: explicit template arguments do not
    // compile with lvalue arguments under C++11.
    VMap.insert(std::make_pair(BaseAddr, Param));
    ++AI;
  }

  // FIXME: These intrinsics should be inserted on-demand. However, we insert
  //        them all currently for simplicity.
  Function *GetNctaidX =
    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x);
  Function *GetNctaidY =
    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y);
  Function *GetCtaidX =
    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x);
  Function *GetCtaidY =
    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y);
  Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x);
  Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y);
  Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x);
  Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y);

  // Read all IDs and sizes and widen them (unsigned) to i64.
  Value *GridWidth = Builder.CreateCall(GetNctaidX);
  GridWidth = Builder.CreateIntCast(GridWidth, Ty, false);
  Value *GridHeight = Builder.CreateCall(GetNctaidY);
  GridHeight = Builder.CreateIntCast(GridHeight, Ty, false);
  Value *BlockWidth = Builder.CreateCall(GetNtidX);
  BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false);
  Value *BlockHeight = Builder.CreateCall(GetNtidY);
  BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false);
  Value *BIDx = Builder.CreateCall(GetCtaidX);
  BIDx = Builder.CreateIntCast(BIDx, Ty, false);
  Value *BIDy = Builder.CreateCall(GetCtaidY);
  BIDy = Builder.CreateIntCast(BIDy, Ty, false);
  Value *TIDx = Builder.CreateCall(GetTidX);
  TIDx = Builder.CreateIntCast(TIDx, Ty, false);
  Value *TIDy = Builder.CreateCall(GetTidY);
  TIDy = Builder.CreateIntCast(TIDy, Ty, false);

  Builder.CreateBr(BodyBB);
  Builder.SetInsertPoint(BodyBB);

  // Compute one substitution per original induction variable.
  unsigned NumDims = OriginalIVS.size();
  std::vector<Value *> Substitutions;
  Value *BlockID, *ThreadID;
  switch (NumDims) {
  case 1: {
    // A single linear index over all threads of all blocks.
    Value *BlockSize = Builder.CreateMul(BlockWidth, BlockHeight,
                                         "p_gpu_blocksize");
    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
    BlockID = Builder.CreateAdd(BlockID, BIDx);
    BlockID = Builder.CreateMul(BlockID, BlockSize);
    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
    ThreadID = Builder.CreateAdd(ThreadID, BlockID);
    Substitutions.push_back(ThreadID);
    break;
  }
  case 2: {
    // Linear block index and linear thread index within the block.
    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
    BlockID = Builder.CreateAdd(BlockID, BIDx);
    Substitutions.push_back(BlockID);
    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
    Substitutions.push_back(ThreadID);
    break;
  }
  case 3: {
    // Linear block index plus the two thread coordinates.
    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
    BlockID = Builder.CreateAdd(BlockID, BIDx);
    Substitutions.push_back(BlockID);
    Substitutions.push_back(TIDy);
    Substitutions.push_back(TIDx);
    break;
  }
  case 4: {
    // Block and thread coordinates are used directly.
    Substitutions.push_back(BIDy);
    Substitutions.push_back(BIDx);
    Substitutions.push_back(TIDy);
    Substitutions.push_back(TIDx);
    break;
  }
  default:
    // The condition must be false for the assertion to fire at all. Note
    // that a depth of 0 ends up here as well.
    assert(false &&
           "We cannot transform parallel loops whose depth is larger than 4.");
    return;
  }

  assert(OriginalIVS.size() == Substitutions.size()
         && "The size of IVS should be equal to the size of substitutions.");
  for (unsigned i = 0; i < OriginalIVS.size(); ++i) {
    VMap.insert(std::make_pair(OriginalIVS[i], Substitutions[i]));
  }

  // Terminate the body block and step back in front of the new branch, which
  // is where the loop body has to be inserted.
  Builder.CreateBr(ExitBB);
  Builder.SetInsertPoint(--Builder.GetInsertPoint());
  BasicBlock::iterator LoopBody = Builder.GetInsertPoint();

  // Add the termination of the ptx-device subfunction.
  Builder.SetInsertPoint(ExitBB);
  Builder.CreateRetVoid();

  Builder.SetInsertPoint(LoopBody);
}
|
||||||
|
|
||||||
|
/// @brief Create the PTX subfunction and report where its body goes.
///
/// The builder's insert point is the same after the call as before it; the
/// position for the loop body is handed back through LoopBody.
void PTXGenerator::startGeneration(SetVector<Value*> &UsedValues,
                                   SetVector<Value*> &OriginalIVS,
                                   ValueToValueMapTy &VMap,
                                   BasicBlock::iterator *LoopBody) {
  // Remember where code generation has to continue afterwards.
  BasicBlock::iterator HostInsertPoint = Builder.GetInsertPoint();

  Function *Kernel;
  createSubfunction(UsedValues, OriginalIVS, VMap, &Kernel);

  // createSubfunction() leaves the builder at the loop body position.
  *LoopBody = Builder.GetInsertPoint();

  Builder.SetInsertPoint(HostInsertPoint);
}
|
||||||
|
|
||||||
|
/// @brief Get the 64-bit integer type (i64).
IntegerType *PTXGenerator::getInt64Type() {
  IntegerType *Int64Ty = Builder.getInt64Ty();
  return Int64Ty;
}
|
||||||
|
|
||||||
|
/// @brief Get the type of a pointer to i8 (char *).
PointerType *PTXGenerator::getI8PtrType() {
  IntegerType *Int8Ty = Builder.getInt8Ty();
  // Address space 0 is what getUnqual() yields.
  return PointerType::get(Int8Ty, 0);
}
|
||||||
|
|
||||||
|
/// @brief Get the type of a pointer to a pointer to i8 (char **).
PointerType *PTXGenerator::getPtrI8PtrType() {
  PointerType *CharPtrTy = getI8PtrType();
  // Address space 0 is what getUnqual() yields.
  return PointerType::get(CharPtrTy, 0);
}
|
||||||
|
|
||||||
|
/// @brief Get the type of a pointer to float (float *).
PointerType *PTXGenerator::getFloatPtrType() {
  LLVMContext &Ctx = getModule()->getContext();
  return llvm::Type::getFloatPtrTy(Ctx);
}
|
||||||
|
|
||||||
|
/// @brief Get the type of a pointer to the GPU context struct.
PointerType *PTXGenerator::getGPUContextPtrType() {
  // Address space 0 is what getUnqual() yields.
  PointerType *PtrTy = PointerType::get(ContextTy, 0);
  return PtrTy;
}
|
||||||
|
|
||||||
|
/// @brief Get the type of a pointer to the GPU module struct.
PointerType *PTXGenerator::getGPUModulePtrType() {
  // Address space 0 is what getUnqual() yields.
  PointerType *PtrTy = PointerType::get(ModuleTy, 0);
  return PtrTy;
}
|
||||||
|
|
||||||
|
/// @brief Get the type of a pointer to the GPU device struct.
PointerType *PTXGenerator::getGPUDevicePtrType() {
  // Address space 0 is what getUnqual() yields.
  PointerType *PtrTy = PointerType::get(DeviceTy, 0);
  return PtrTy;
}
|
||||||
|
|
||||||
|
/// @brief Get the type of a pointer to the GPU device-pointer struct
///        (DevDataTy).
PointerType *PTXGenerator::getPtrGPUDevicePtrType() {
  // Address space 0 is what getUnqual() yields.
  PointerType *PtrTy = PointerType::get(DevDataTy, 0);
  return PtrTy;
}
|
||||||
|
|
||||||
|
/// @brief Get the type of a pointer to the GPU function struct (KernelTy).
PointerType *PTXGenerator::getGPUFunctionPtrType() {
  // Address space 0 is what getUnqual() yields.
  PointerType *PtrTy = PointerType::get(KernelTy, 0);
  return PtrTy;
}
|
||||||
|
|
||||||
|
/// @brief Get the type of a pointer to the GPU event struct.
PointerType *PTXGenerator::getGPUEventPtrType() {
  // Address space 0 is what getUnqual() yields.
  PointerType *PtrTy = PointerType::get(EventTy, 0);
  return PtrTy;
}
|
||||||
|
|
||||||
|
void PTXGenerator::InitializeGPUDataTypes() {
|
||||||
|
LLVMContext &Context = getModule()->getContext();
|
||||||
|
|
||||||
|
ContextTy = StructType::create(Context, "struct.PollyGPUContextT");
|
||||||
|
ModuleTy = StructType::create(Context, "struct.PollyGPUModuleT");
|
||||||
|
KernelTy = StructType::create(Context, "struct.PollyGPUFunctionT");
|
||||||
|
DeviceTy = StructType::create(Context, "struct.PollyGPUDeviceT");
|
||||||
|
DevDataTy = StructType::create(Context,"struct.PollyGPUDevicePtrT");
|
||||||
|
EventTy = StructType::create(Context, "struct.PollyGPUEventT");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_initDevice.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param Context First call argument; the declared parameter type is a
///                pointer to the GPU context pointer type.
/// @param Device  Second call argument; the declared parameter type is a
///                pointer to the GPU device pointer type.
void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) {
  const char *Name = "polly_initDevice";
  Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      PointerType::getUnqual(getGPUContextPtrType()),
      PointerType::getUnqual(getGPUDevicePtrType())
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall2(Callee, Context, Device);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_getPTXModule.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param Buffer First call argument; declared as i8*.
/// @param Module Second call argument; declared as a pointer to the GPU
///               module pointer type.
void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) {
  const char *Name = "polly_getPTXModule";
  // The parameter 'Module' hides the type name, hence the qualification.
  llvm::Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      getI8PtrType(),
      PointerType::getUnqual(getGPUModulePtrType())
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall2(Callee, Buffer, Module);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_getPTXKernelEntry.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param Entry  First call argument; declared as i8*.
/// @param Module Second call argument; declared as the GPU module pointer
///               type.
/// @param Kernel Third call argument; declared as a pointer to the GPU
///               function pointer type.
void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module,
                                               Value *Kernel) {
  const char *Name = "polly_getPTXKernelEntry";
  // The parameter 'Module' hides the type name, hence the qualification.
  llvm::Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      getI8PtrType(),
      getGPUModulePtrType(),
      PointerType::getUnqual(getGPUFunctionPtrType())
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall3(Callee, Entry, Module, Kernel);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function
///        polly_allocateMemoryForHostAndDevice.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param HostData   First call argument; declared as i8**.
/// @param DeviceData Second call argument; declared as a pointer to the type
///                   returned by getPtrGPUDevicePtrType().
/// @param Size       Third call argument; declared as i64.
void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData,
                                                            Value *DeviceData,
                                                            Value *Size) {
  const char *Name = "polly_allocateMemoryForHostAndDevice";
  Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      getPtrI8PtrType(),
      PointerType::getUnqual(getPtrGPUDevicePtrType()),
      getInt64Type()
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_copyFromHostToDevice.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param DeviceData First call argument; declared as the type returned by
///                   getPtrGPUDevicePtrType().
/// @param HostData   Second call argument; declared as i8*.
/// @param Size       Third call argument; declared as i64.
void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData,
                                                  Value *HostData,
                                                  Value *Size) {
  const char *Name = "polly_copyFromHostToDevice";
  Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      getPtrGPUDevicePtrType(),
      getI8PtrType(),
      getInt64Type()
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall3(Callee, DeviceData, HostData, Size);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_copyFromDeviceToHost.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param HostData   First call argument; declared as i8*.
/// @param DeviceData Second call argument; declared as the type returned by
///                   getPtrGPUDevicePtrType().
/// @param Size       Third call argument; declared as i64.
void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData,
                                                  Value *DeviceData,
                                                  Value *Size) {
  const char *Name = "polly_copyFromDeviceToHost";
  Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      getI8PtrType(),
      getPtrGPUDevicePtrType(),
      getInt64Type()
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_setKernelParameters.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param Kernel      First call argument; declared as the GPU function
///                    pointer type.
/// @param BlockWidth  Second call argument; declared as i64.
/// @param BlockHeight Third call argument; declared as i64.
/// @param DeviceData  Fourth call argument; declared as the type returned by
///                    getPtrGPUDevicePtrType().
void PTXGenerator::createCallSetKernelParameters(Value *Kernel,
                                                 Value *BlockWidth,
                                                 Value *BlockHeight,
                                                 Value *DeviceData) {
  const char *Name = "polly_setKernelParameters";
  Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      getGPUFunctionPtrType(),
      getInt64Type(),
      getInt64Type(),
      getPtrGPUDevicePtrType()
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall4(Callee, Kernel, BlockWidth, BlockHeight, DeviceData);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_launchKernel.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param Kernel     First call argument; declared as the GPU function
///                   pointer type.
/// @param GridWidth  Second call argument; declared as i64.
/// @param GridHeight Third call argument; declared as i64.
void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth,
                                          Value *GridHeight) {
  const char *Name = "polly_launchKernel";
  Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      getGPUFunctionPtrType(),
      getInt64Type(),
      getInt64Type()
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall3(Callee, Kernel, GridWidth, GridHeight);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_startTimerByCudaEvent.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param StartEvent First call argument; declared as a pointer to the GPU
///                   event pointer type.
/// @param StopEvent  Second call argument; declared as a pointer to the GPU
///                   event pointer type.
void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent,
                                                   Value *StopEvent) {
  const char *Name = "polly_startTimerByCudaEvent";
  Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      PointerType::getUnqual(getGPUEventPtrType()),
      PointerType::getUnqual(getGPUEventPtrType())
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall2(Callee, StartEvent, StopEvent);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_stopTimerByCudaEvent.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param StartEvent First call argument; declared as the GPU event pointer
///                   type.
/// @param StopEvent  Second call argument; declared as the GPU event pointer
///                   type.
/// @param Timer      Third call argument; declared as float*.
void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent,
                                                  Value *StopEvent,
                                                  Value *Timer) {
  const char *Name = "polly_stopTimerByCudaEvent";
  Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      getGPUEventPtrType(),
      getGPUEventPtrType(),
      getFloatPtrType()
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall3(Callee, StartEvent, StopEvent, Timer);
}
|
||||||
|
|
||||||
|
/// @brief Emit a call to the external function polly_cleanupGPGPUResources.
///
/// The function is declared in the current module (void return, external
/// linkage) if no function of that name exists yet.
///
/// @param HostData   First call argument; declared as i8*.
/// @param DeviceData Second call argument; declared as the type returned by
///                   getPtrGPUDevicePtrType().
/// @param Module     Third call argument; declared as the GPU module pointer
///                   type.
/// @param Context    Fourth call argument; declared as the GPU context
///                   pointer type.
/// @param Kernel     Fifth call argument; declared as the GPU function
///                   pointer type.
void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData,
                                                   Value *DeviceData,
                                                   Value *Module,
                                                   Value *Context,
                                                   Value *Kernel) {
  const char *Name = "polly_cleanupGPGPUResources";
  // The parameter 'Module' hides the type name, hence the qualification.
  llvm::Module *M = getModule();
  Function *Callee = M->getFunction(Name);

  // Declare the function on first use.
  if (!Callee) {
    Type *ParamTys[] = {
      getI8PtrType(),
      getPtrGPUDevicePtrType(),
      getGPUModulePtrType(),
      getGPUContextPtrType(),
      getGPUFunctionPtrType()
    };
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee = Function::Create(FnTy, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall5(Callee, HostData, DeviceData, Module, Context, Kernel);
}
|
||||||
|
|
||||||
|
/// @brief Get the member GridWidth as an i64 constant.
Value *PTXGenerator::getCUDAGridWidth() {
  IntegerType *Int64Ty = getInt64Type();
  return ConstantInt::get(Int64Ty, GridWidth);
}
|
||||||
|
|
||||||
|
/// @brief Get the member GridHeight as an i64 constant.
Value *PTXGenerator::getCUDAGridHeight() {
  IntegerType *Int64Ty = getInt64Type();
  return ConstantInt::get(Int64Ty, GridHeight);
}
|
||||||
|
|
||||||
|
/// @brief Get the member BlockWidth as an i64 constant.
Value *PTXGenerator::getCUDABlockWidth() {
  IntegerType *Int64Ty = getInt64Type();
  return ConstantInt::get(Int64Ty, BlockWidth);
}
|
||||||
|
|
||||||
|
/// @brief Get the member BlockHeight as an i64 constant.
Value *PTXGenerator::getCUDABlockHeight() {
  IntegerType *Int64Ty = getInt64Type();
  return ConstantInt::get(Int64Ty, BlockHeight);
}
|
||||||
|
|
||||||
|
/// @brief Get the member OutputBytes as an i64 constant.
Value *PTXGenerator::getOutputArraySizeInBytes() {
  IntegerType *Int64Ty = getInt64Type();
  return ConstantInt::get(Int64Ty, OutputBytes);
}
|
||||||
|
|
||||||
|
/// @brief Build the textual LLVM-IR of the kernel and emit the call that
///        passes it to the Intrinsic::codegen intrinsic.
///
/// The kernel text consists of a "target triple" line for GPUTriple, the
/// printed SubFunction, and the printed declarations of the ptx_read_*
/// intrinsics (x and y variants of nctaid, ctaid, ntid and tid) that are
/// declared in the current module.
///
/// @param SubFunction The function whose printed IR forms the kernel body.
/// @return The call to Intrinsic::codegen, taking the kernel text, the string
///         "sm_10" and an empty feature string as arguments.
Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) {
  Module *M = getModule();
  std::string LLVMKernelStr;
  raw_string_ostream NameROS(LLVMKernelStr);
  formatted_raw_ostream FOS(NameROS);
  FOS << "target triple = \"" << GPUTriple <<"\"\n";
  SubFunction->print(FOS);

  // Insert ptx intrinsics into the kernel string.
  for (Module::iterator I = M->begin(), E = M->end(); I != E; ) {
    Function *F = I++;
    // Only declarations of intrinsics are candidates.
    if (!F->isDeclaration() || !F->isIntrinsic())
      continue;

    switch (F->getIntrinsicID()) {
    case Intrinsic::ptx_read_nctaid_x:
    case Intrinsic::ptx_read_nctaid_y:
    case Intrinsic::ptx_read_ctaid_x:
    case Intrinsic::ptx_read_ctaid_y:
    case Intrinsic::ptx_read_ntid_x:
    case Intrinsic::ptx_read_ntid_y:
    case Intrinsic::ptx_read_tid_x:
    case Intrinsic::ptx_read_tid_y:
      F->print(FOS);
      break;
    default:
      break;
    }
  }

  // Both stream layers may buffer their output. Flush them explicitly so
  // that LLVMKernelStr is complete before it is read below, instead of
  // relying on flushes that happen as a side effect of the print() calls.
  FOS.flush();
  NameROS.flush();

  Value *LLVMKernel = Builder.CreateGlobalStringPtr(LLVMKernelStr,
                                                    "llvm_kernel");
  Value *MCPU = Builder.CreateGlobalStringPtr("sm_10", "mcpu");
  Value *Features = Builder.CreateGlobalStringPtr("", "cpu_features");

  Function *GetDeviceKernel = Intrinsic::getDeclaration(M,
                                                        Intrinsic::codegen);

  return Builder.CreateCall3(GetDeviceKernel, LLVMKernel, MCPU, Features);
}
|
||||||
|
|
||||||
|
/// @brief Create a global string holding the name of SubFunction.
///
/// @param SubFunction The function whose name is used as the string content.
/// @return A pointer to the global string, which is named "ptx_entry".
Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) {
  return Builder.CreateGlobalStringPtr(SubFunction->getName(), "ptx_entry");
}
|
||||||
|
|
||||||
|
/// @brief Erase SubFunction and the PTX read intrinsics from the module.
///
/// SubFunction is erased first. Afterwards each of the llvm.ptx.read.*
/// functions listed below is looked up by name and erased if it is present.
///
/// @param SubFunction The function to remove from its parent module.
void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) {
  Module *M = getModule();
  SubFunction->eraseFromParent();

  // Processed in this order.
  static const char *const PTXReadIntrinsics[] = {
    "llvm.ptx.read.nctaid.x",
    "llvm.ptx.read.nctaid.y",
    "llvm.ptx.read.ctaid.x",
    "llvm.ptx.read.ctaid.y",
    "llvm.ptx.read.ntid.x",
    "llvm.ptx.read.ntid.y",
    "llvm.ptx.read.tid.x",
    "llvm.ptx.read.tid.y"
  };
  const unsigned NumIntrinsics =
      sizeof(PTXReadIntrinsics) / sizeof(PTXReadIntrinsics[0]);

  for (unsigned Idx = 0; Idx != NumIntrinsics; ++Idx)
    if (Function *Decl = M->getFunction(PTXReadIntrinsics[Idx]))
      Decl->eraseFromParent();
}
|
||||||
|
|
||||||
|
/// @brief Emit the host-side call sequence that drives the GPU runtime
///        functions for the kernel function F.
///
/// The emitted sequence is: stack slots for the runtime handles, device
/// initialization, creation of the kernel text and module, lookup of the
/// kernel entry, memory allocation, kernel parameter setup, timer start,
/// kernel launch, copy from device to host, timer stop and resource cleanup.
/// Finally F and the PTX read intrinsics are erased from the module.
///
/// @param F The kernel function; it is erased from the module at the end.
void PTXGenerator::finishGeneration(Function *F) {
  // Stack slots for the handles passed to the GPURuntime library calls.
  AllocaInst *ContextSlot =
      Builder.CreateAlloca(getGPUContextPtrType(), 0, "phcontext");
  AllocaInst *DeviceSlot =
      Builder.CreateAlloca(getGPUDevicePtrType(), 0, "phdevice");
  AllocaInst *ModuleSlot =
      Builder.CreateAlloca(getGPUModulePtrType(), 0, "phmodule");
  AllocaInst *KernelSlot =
      Builder.CreateAlloca(getGPUFunctionPtrType(), 0, "phkernel");
  AllocaInst *StartEventSlot =
      Builder.CreateAlloca(getGPUEventPtrType(), 0, "pstart_timer");
  AllocaInst *StopEventSlot =
      Builder.CreateAlloca(getGPUEventPtrType(), 0, "pstop_timer");
  AllocaInst *DevDataSlot =
      Builder.CreateAlloca(getPtrGPUDevicePtrType(), 0, "pdevice_data");
  AllocaInst *HostDataSlot =
      Builder.CreateAlloca(getI8PtrType(), 0, "phost_data");
  Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext());
  AllocaInst *ElapsedSlot = Builder.CreateAlloca(FloatTy, 0, "ptimer");

  // Device initialization.
  createCallInitDevice(ContextSlot, DeviceSlot);

  // Kernel text, GPU module and kernel entry.
  Value *KernelText = createPTXKernelFunction(F);
  Value *EntryName = getPTXKernelEntryName(F);
  createCallGetPTXModule(KernelText, ModuleSlot);
  LoadInst *ModuleHandle = Builder.CreateLoad(ModuleSlot, "cumodule");
  createCallGetPTXKernelEntry(EntryName, ModuleHandle, KernelSlot);

  // Host and device memory for the output array.
  createCallAllocateMemoryForHostAndDevice(HostDataSlot, DevDataSlot,
                                           getOutputArraySizeInBytes());

  // Load the device data and kernel handles and set the block dimensions.
  LoadInst *DeviceData = Builder.CreateLoad(DevDataSlot, "device_data");
  LoadInst *KernelHandle = Builder.CreateLoad(KernelSlot, "cukernel");
  createCallSetKernelParameters(KernelHandle, getCUDABlockWidth(),
                                getCUDABlockHeight(), DeviceData);

  // Start timing.
  createCallStartTimerByCudaEvent(StartEventSlot, StopEventSlot);

  // Kernel launch with the grid dimensions.
  createCallLaunchKernel(KernelHandle, getCUDAGridWidth(),
                         getCUDAGridHeight());

  // Copy from the device buffer to the host buffer.
  LoadInst *HostData = Builder.CreateLoad(HostDataSlot, "host_data");
  createCallCopyFromDeviceToHost(HostData, DeviceData,
                                 getOutputArraySizeInBytes());

  // Stop timing; the elapsed time slot is handed to the runtime call.
  LoadInst *StartEvent = Builder.CreateLoad(StartEventSlot, "start_timer");
  LoadInst *StopEvent = Builder.CreateLoad(StopEventSlot, "stop_timer");
  createCallStopTimerByCudaEvent(StartEvent, StopEvent, ElapsedSlot);

  // Release the resources.
  LoadInst *ContextHandle = Builder.CreateLoad(ContextSlot, "cucontext");
  createCallCleanupGPGPUResources(HostData, DeviceData, ModuleHandle,
                                  ContextHandle, KernelHandle);

  // Remove the kernel function and the ptx intrinsics from the current
  // module.
  eraseUnusedFunctions(F);
}
|
|
@ -0,0 +1,16 @@
|
||||||
|
/* Output array written by gpu_pure(). */
int A[128][128];

/* Store the value row * 128 + col into every element A[row][col] of the
 * 128x128 array. Always returns 0. */
int gpu_pure() {
  int row = 0;

  while (row < 128) {
    int col = 0;

    while (col < 128) {
      A[row][col] = row * 128 + col;
      col++;
    }
    row++;
  }

  return 0;
}
|
||||||
|
|
||||||
|
/* Run gpu_pure() once; its return value is stored but not used. */
int main() {
  int b;

  b = gpu_pure();
  return 0;
}
|
|
@ -0,0 +1,65 @@
|
||||||
|
; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
|
||||||
|
; ModuleID = '2d_innermost_parallel.s'
|
||||||
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||||
|
target triple = "x86_64-unknown-linux-gnu"
|
||||||
|
|
||||||
|
@A = common global [128 x [128 x i32]] zeroinitializer, align 16
|
||||||
|
|
||||||
|
; Two nested loops over i and j (each running from 0 while != 128) that store
; the truncated value (i << 7) + j into @A[i][j]. Returns 0.
define i32 @gpu_pure() nounwind uwtable {
entry:
  br label %for.cond

; Outer loop header (induction variable %i).
for.cond:                                         ; preds = %for.inc6, %entry
  %i = phi i64 [ %i.next, %for.inc6 ], [ 0, %entry ]
  %i.i32 = trunc i64 %i to i32
  %i.cont = icmp ne i32 %i.i32, 128
  br i1 %i.cont, label %for.body, label %for.end8

for.body:                                         ; preds = %for.cond
  br label %for.cond1

; Inner loop header (induction variable %j).
for.cond1:                                        ; preds = %for.inc, %for.body
  %j = phi i64 [ %j.next, %for.inc ], [ 0, %for.body ]
  %j.i32 = trunc i64 %j to i32
  %j.cont = icmp ne i32 %j.i32, 128
  br i1 %j.cont, label %for.body3, label %for.end

; Loop body: A[i][j] = (i << 7) + j.
for.body3:                                        ; preds = %for.cond1
  %row.base = shl nsw i64 %i, 7
  %linear = add nsw i64 %row.base, %j
  %elem.ptr = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %i, i64 %j
  %linear.i32 = trunc i64 %linear to i32
  store i32 %linear.i32, i32* %elem.ptr, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body3
  %j.next = add i64 %j, 1
  br label %for.cond1

for.end:                                          ; preds = %for.cond1
  br label %for.inc6

for.inc6:                                         ; preds = %for.end
  %i.next = add i64 %i, 1
  br label %for.cond

for.end8:                                         ; preds = %for.cond
  ret i32 0
}
|
||||||
|
|
||||||
|
; Calls @gpu_pure once, ignores its result and returns 0.
define i32 @main() nounwind uwtable {
entry:
  %ignored = call i32 @gpu_pure()
  ret i32 0
}
|
||||||
|
|
||||||
|
; CHECK: call void @polly_initDevice
|
||||||
|
; CHECK: call void @polly_getPTXModule
|
||||||
|
; CHECK: call void @polly_getPTXKernelEntry
|
||||||
|
; CHECK: call void @polly_allocateMemoryForHostAndDevice
|
||||||
|
; CHECK: call void @polly_setKernelParameters
|
||||||
|
; CHECK: call void @polly_startTimerByCudaEvent
|
||||||
|
; CHECK: call void @polly_launchKernel
|
||||||
|
; CHECK: call void @polly_copyFromDeviceToHost
|
||||||
|
; CHECK: call void @polly_stopTimerByCudaEvent
|
||||||
|
; CHECK: call void @polly_cleanupGPGPUResources
|
|
@ -0,0 +1,17 @@
|
||||||
|
/* Array updated in place by gpu_no_pure(). */
int A[128][128];

/* For every element A[row][col] of the 128x128 array, add the term
 * row*123/(step+1) + 5 - col*step - 123 for each step in 0..255.
 * Always returns 0. */
int gpu_no_pure() {
  int row = 0;

  while (row < 128) {
    int col = 0;

    while (col < 128) {
      int step = 0;

      while (step < 256) {
        A[row][col] += row * 123 / (step + 1) + 5 - col * step - 123;
        step++;
      }
      col++;
    }
    row++;
  }

  return 0;
}
|
||||||
|
|
||||||
|
/* Run gpu_no_pure() once; its return value is stored but not used. */
int main() {
  int b;

  b = gpu_no_pure();
  return 0;
}
|
|
@ -0,0 +1,88 @@
|
||||||
|
; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
|
||||||
|
; ModuleID = '3d_innermost_non_parallel.s'
|
||||||
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
||||||
|
target triple = "x86_64-unknown-linux-gnu"
|
||||||
|
|
||||||
|
@A = common global [128 x [128 x i32]] zeroinitializer, align 16
|
||||||
|
|
||||||
|
; Three nested loops: i and j run from 0 while != 128, k runs from 0 while
; != 256. Each innermost iteration adds
;   trunc(i * 123) / (k + 1) + 5 - trunc(j) * k - 123
; to @A[i][j]. Returns 0.
define i32 @gpu_no_pure() nounwind uwtable {
entry:
  br label %for.cond

; Outermost loop header (induction variable %i).
for.cond:                                         ; preds = %for.inc16, %entry
  %i = phi i64 [ %i.next, %for.inc16 ], [ 0, %entry ]
  %i.i32 = trunc i64 %i to i32
  %i.cont = icmp ne i32 %i.i32, 128
  br i1 %i.cont, label %for.body, label %for.end18

for.body:                                         ; preds = %for.cond
  br label %for.cond1

; Middle loop header (induction variable %j).
for.cond1:                                        ; preds = %for.inc13, %for.body
  %j = phi i64 [ %j.next, %for.inc13 ], [ 0, %for.body ]
  %j.i32 = trunc i64 %j to i32
  %j.cont = icmp ne i32 %j.i32, 128
  br i1 %j.cont, label %for.body3, label %for.end15

for.body3:                                        ; preds = %for.cond1
  br label %for.cond4

; Innermost loop header (induction variable %k).
for.cond4:                                        ; preds = %for.inc, %for.body3
  %k = phi i32 [ 0, %for.body3 ], [ %k.next, %for.inc ]
  %k.cont = icmp ne i32 %k, 256
  br i1 %k.cont, label %for.body6, label %for.end

; Loop body: read-modify-write of A[i][j].
for.body6:                                        ; preds = %for.cond4
  %i.scaled = mul nsw i64 %i, 123
  %k.plus1 = add nsw i32 %k, 1
  %i.scaled.i32 = trunc i64 %i.scaled to i32
  %quot = sdiv i32 %i.scaled.i32, %k.plus1
  %quot.plus5 = add nsw i32 %quot, 5
  %j.narrow = trunc i64 %j to i32
  %jk = mul nsw i32 %j.narrow, %k
  %diff = sub nsw i32 %quot.plus5, %jk
  %delta = add nsw i32 %diff, -123
  %elem.ptr = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %i, i64 %j
  %old = load i32* %elem.ptr, align 4
  %new = add nsw i32 %old, %delta
  store i32 %new, i32* %elem.ptr, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body6
  %k.next = add nsw i32 %k, 1
  br label %for.cond4

for.end:                                          ; preds = %for.cond4
  br label %for.inc13

for.inc13:                                        ; preds = %for.end
  %j.next = add i64 %j, 1
  br label %for.cond1

for.end15:                                        ; preds = %for.cond1
  br label %for.inc16

for.inc16:                                        ; preds = %for.end15
  %i.next = add i64 %i, 1
  br label %for.cond

for.end18:                                        ; preds = %for.cond
  ret i32 0
}
|
||||||
|
|
||||||
|
; Calls @gpu_no_pure once, ignores its result and returns 0.
define i32 @main() nounwind uwtable {
entry:
  %ignored = call i32 @gpu_no_pure()
  ret i32 0
}
|
||||||
|
|
||||||
|
; CHECK: call void @polly_initDevice
|
||||||
|
; CHECK: call void @polly_getPTXModule
|
||||||
|
; CHECK: call void @polly_getPTXKernelEntry
|
||||||
|
; CHECK: call void @polly_allocateMemoryForHostAndDevice
|
||||||
|
; CHECK: call void @polly_setKernelParameters
|
||||||
|
; CHECK: call void @polly_startTimerByCudaEvent
|
||||||
|
; CHECK: call void @polly_launchKernel
|
||||||
|
; CHECK: call void @polly_copyFromDeviceToHost
|
||||||
|
; CHECK: call void @polly_stopTimerByCudaEvent
|
||||||
|
; CHECK: call void @polly_cleanupGPGPUResources
|
|
@ -0,0 +1,21 @@
|
||||||
|
{
|
||||||
|
"context" : "{ : }",
|
||||||
|
"name" : "for.cond => for.end18",
|
||||||
|
"statements" : [
|
||||||
|
{
|
||||||
|
"accesses" : [
|
||||||
|
{
|
||||||
|
"kind" : "read",
|
||||||
|
"relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind" : "write",
|
||||||
|
"relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
|
||||||
|
"name" : "Stmt_for_body6",
|
||||||
|
"schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
|
@ -0,0 +1,21 @@
|
||||||
|
{
|
||||||
|
"context" : "{ : }",
|
||||||
|
"name" : "for.cond => for.end18",
|
||||||
|
"statements" : [
|
||||||
|
{
|
||||||
|
"accesses" : [
|
||||||
|
{
|
||||||
|
"kind" : "read",
|
||||||
|
"relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind" : "write",
|
||||||
|
"relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
|
||||||
|
"name" : "Stmt_for_body6",
|
||||||
|
"schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
|
@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
"context" : "{ : }",
|
||||||
|
"name" : "for.cond => for.end8",
|
||||||
|
"statements" : [
|
||||||
|
{
|
||||||
|
"accesses" : [
|
||||||
|
{
|
||||||
|
"kind" : "write",
|
||||||
|
"relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
|
||||||
|
"name" : "Stmt_for_body3",
|
||||||
|
"schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
|
@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
"context" : "{ : }",
|
||||||
|
"name" : "for.cond => for.end8",
|
||||||
|
"statements" : [
|
||||||
|
{
|
||||||
|
"accesses" : [
|
||||||
|
{
|
||||||
|
"kind" : "write",
|
||||||
|
"relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
|
||||||
|
"name" : "Stmt_for_body3",
|
||||||
|
"schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
Loading…
Reference in New Issue