Add preliminary implementation for GPGPU code generation.

Translate the selected parallel loop body into a ptx string and run it with cuda driver API. We limit this preliminary implementation to target the following special test cases: - Support only 2-dimensional parallel loops with or without only one innermost non-parallel loop. - Support write memory access to only one array in a SCoP. Contributed by: Yabin Hu <yabin.hwu@gmail.com> llvm-svn: 160164
2012-07-13 07:21:00 +00:00 · 2012-07-13 07:21:00 +00:00 · b299d28181
parent a9c373e49d
commit b299d28181
13 changed files with 1307 additions and 0 deletions
--- a/polly/include/polly/CodeGen/PTXGenerator.h
+++ b/polly/include/polly/CodeGen/PTXGenerator.h
@ -0,0 +1,193 @@
+//===- PTXGenerator.h - IR helper to create GPGPU LLVM-IR -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains functions to create GPGPU parallel loops as LLVM-IR.
+//
+//===----------------------------------------------------------------------===//
+#ifndef POLLY_CODEGEN_PTXGENERATOR_H
+#define POLLY_CODEGEN_PTXGENERATOR_H
+
+#include "llvm/IRBuilder.h"
+#include "llvm/ADT/SetVector.h"
+
+#include <map>
+
+namespace llvm {
+  class Value;
+  class Pass;
+  class BasicBlock;
+}
+
+namespace polly {
+using namespace llvm;
+
+/// @brief Helper that creates the LLVM-IR for a GPGPU parallel loop.
+///
+/// startGeneration() creates the PTX subfunction that receives the loop body,
+/// finishGeneration() executes the post-operations that complete the GPGPU
+/// parallel loop.
+class PTXGenerator {
+public:
+  typedef std::map<Value*, Value*> ValueToValueMapTy;
+
+  /// @brief Create a generator that inserts code through @p Builder.
+  ///
+  /// @param Builder  The IRBuilder used to emit instructions.
+  /// @param P        The pass that provides the required analyses.
+  /// @param Triple   The target triple of the device. A copy is stored.
+  PTXGenerator(IRBuilder<> &Builder, Pass *P, const std::string &Triple);
+
+  /// @brief Create a GPGPU parallel loop.
+  ///
+  /// @param UsedValues   A set of LLVM-IR Values that should be available to
+  ///                     the new loop body.
+  /// @param OriginalIVS  The original induction variables. Each of them is
+  ///                     mapped in VMap to the value that replaces it inside
+  ///                     the loop body.
+  /// @param VMap         This map is filled by startGeneration(). It
+  ///                     maps the values in UsedValues to Values through which
+  ///                     their content is available within the loop body.
+  /// @param LoopBody     A pointer to an iterator that is set to point to the
+  ///                     body of the created loop. It should be used to insert
+  ///                     instructions that form the actual loop body.
+  void startGeneration(SetVector<Value*> &UsedValues,
+                       SetVector<Value*> &OriginalIVS, ValueToValueMapTy &VMap,
+                       BasicBlock::iterator *LoopBody);
+
+  /// @brief Execute the post-operations to build a GPGPU parallel loop.
+  ///
+  /// @param SubFunction  A pointer to the device code function.
+  void finishGeneration(Function *SubFunction);
+
+  /// @brief Set the parameters for launching PTX kernel.
+  ///
+  /// @param GridW     A value of the width of a GPU grid.
+  /// @param GridH     A value of the height of a GPU grid.
+  /// @param BlockW    A value of the width of a GPU block.
+  /// @param BlockH    A value of the height of a GPU block.
+  void setLaunchingParameters(int GridW, int GridH, int BlockW, int BlockH) {
+    GridWidth = GridW;
+    GridHeight = GridH;
+    BlockWidth = BlockW;
+    BlockHeight = BlockH;
+  }
+
+  /// @brief Set the size of the output array.
+  ///
+  /// This size is used to allocate memory on the device and the host.
+  ///
+  /// @param Bytes        Output array size in bytes.
+  void setOutputBytes(unsigned Bytes) {
+    OutputBytes = Bytes;
+  }
+
+private:
+  IRBuilder<> &Builder;
+  Pass *P;
+
+  /// @brief The target triple of the device.
+  ///
+  /// Stored by value so that the generator never refers to a string whose
+  /// lifetime ended after the constructor returned.
+  std::string GPUTriple;
+
+  /// @brief Parameters used for launching PTX kernel.
+  int GridWidth, GridHeight, BlockWidth, BlockHeight;
+
+  /// @brief Size of the output array in bytes.
+  unsigned OutputBytes;
+
+  /// @brief Polly's GPU data types.
+  StructType *ContextTy, *ModuleTy, *KernelTy, *DeviceTy, *DevDataTy, *EventTy;
+
+  void InitializeGPUDataTypes();
+  IntegerType *getInt64Type();            // i64
+  PointerType *getI8PtrType();            // char *
+  PointerType *getPtrI8PtrType();         // char **
+  PointerType *getFloatPtrType();         // float *
+  PointerType *getGPUContextPtrType();    // %struct.PollyGPUContextT *
+  PointerType *getGPUModulePtrType();     // %struct.PollyGPUModuleT *
+  PointerType *getGPUDevicePtrType();     // %struct.PollyGPUDeviceT *
+  PointerType *getPtrGPUDevicePtrType();  // %struct.PollyGPUDevicePtrT *
+  PointerType *getGPUFunctionPtrType();   // %struct.PollyGPUFunctionT *
+  PointerType *getGPUEventPtrType();      // %struct.PollyGPUEventT *
+
+  /// @brief Get the module that contains the current insert point.
+  Module *getModule();
+
+  /// @brief Create the kernel string containing LLVM IR.
+  ///
+  /// @param SubFunction  A pointer to the device code function.
+  /// @return             A global string variable containing the LLVM IR codes
+  ///                     of the SubFunction.
+  Value *createPTXKernelFunction(Function *SubFunction);
+
+  /// @brief Get the entry name of the device kernel function.
+  ///
+  /// @param SubFunction  A pointer to the device code function.
+  /// @return             A global string variable containing the entry name of
+  ///                     the SubFunction.
+  Value *getPTXKernelEntryName(Function *SubFunction);
+
+  // Each of the following helpers emits a call to the runtime function named
+  // after it (polly_initDevice, polly_getPTXModule, ...).
+  void createCallInitDevice(Value *Context, Value *Device);
+  void createCallGetPTXModule(Value *Buffer, Value *Module);
+  void createCallGetPTXKernelEntry(Value *Entry, Value *Module,
+                                   Value *Kernel);
+  void createCallAllocateMemoryForHostAndDevice(Value *HostData,
+                                                Value *DeviceData,
+                                                Value *Size);
+  void createCallCopyFromHostToDevice(Value *DeviceData, Value *HostData,
+                                      Value *Size);
+  void createCallCopyFromDeviceToHost(Value *HostData, Value *DeviceData,
+                                      Value *Size);
+  void createCallSetKernelParameters(Value *Kernel, Value *BlockWidth,
+                                     Value *BlockHeight, Value *DeviceData);
+  void createCallLaunchKernel(Value *Kernel, Value *GridWidth,
+                              Value *GridHeight);
+  void createCallStartTimerByCudaEvent(Value *StartEvent,
+                                       Value *StopEvent);
+  void createCallStopTimerByCudaEvent(Value *StartEvent, Value *StopEvent,
+                                      Value *Timer);
+  void createCallCleanupGPGPUResources(Value *HostData, Value *DeviceData,
+                                       Value *Module, Value *Context,
+                                       Value *Kernel);
+
+  /// @brief Create the CUDA subfunction.
+  ///
+  /// @param UsedValues   A set of LLVM-IR Values that should be available to
+  ///                     the new loop body.
+  /// @param OriginalIVS  The original induction variables that are replaced
+  ///                     inside the subfunction.
+  /// @param VMap         This map is filled by createSubfunction(). It
+  ///                     maps the values in UsedValues to Values through which
+  ///                     their content is available within the loop body.
+  /// @param SubFunction  The newly created SubFunction is returned here.
+  void createSubfunction(SetVector<Value*> &UsedValues,
+                         SetVector<Value*> &OriginalIVS,
+                         ValueToValueMapTy &VMap,
+                         Function **SubFunction);
+
+  /// @brief Create the definition of the CUDA subfunction.
+  ///
+  /// @param NumArgs      The number of parameters of this subfunction. This is
+  ///                     usually set to the number of memory accesses which
+  ///                     will be copied from host to device.
+  Function *createSubfunctionDefinition(int NumArgs);
+
+  /// @brief Extract all the ptx related subfunctions into a new module.
+  ///
+  /// @param M            Current module.
+  /// @return             The generated module containing only gpu related
+  ///                     subfunctions.
+  Module *extractPTXFunctionsFromModule(const Module *M);
+
+  /// @brief Get the Value of CUDA block width.
+  Value *getCUDABlockWidth();
+
+  /// @brief Get the Value of CUDA block height.
+  Value *getCUDABlockHeight();
+
+  /// @brief Get the Value of CUDA grid width.
+  Value *getCUDAGridWidth();
+
+  /// @brief Get the Value of CUDA grid height.
+  Value *getCUDAGridHeight();
+
+  /// @brief Get the Value of the bytes of the output array.
+  Value *getOutputArraySizeInBytes();
+
+  /// @brief Erase the ptx-related subfunctions and declarations.
+  ///
+  /// @param SubFunction  A pointer to the device code function.
+  void eraseUnusedFunctions(Function *SubFunction);
+};
+} // end namespace polly
+#endif /* POLLY_CODEGEN_PTXGENERATOR_H */
--- a/polly/include/polly/ScopInfo.h
+++ b/polly/include/polly/ScopInfo.h
@ -125,6 +125,9 @@ public:
  /// @brief Is this a read memory access?
  bool isRead() const { return Type == MemoryAccess::Read; }

+  /// @brief Check whether this memory access writes to memory.
+  bool isWrite() const { return MemoryAccess::Write == Type; }
+
  isl_map *getAccessRelation() const;

  /// @brief Get an isl string representing this access function.
--- a/polly/lib/CodeGen/CMakeLists.txt
+++ b/polly/lib/CodeGen/CMakeLists.txt
@ -15,4 +15,5 @@ add_polly_library(PollyCodeGen
  ${ISL_CODEGEN_FILES}
  LoopGenerators.cpp
  Utils.cpp
+  PTXGenerator.cpp
 )
--- a/polly/lib/CodeGen/CodeGeneration.cpp
+++ b/polly/lib/CodeGen/CodeGeneration.cpp
@ -31,6 +31,7 @@
 #include "polly/CodeGen/CodeGeneration.h"
 #include "polly/CodeGen/BlockGenerators.h"
 #include "polly/CodeGen/LoopGenerators.h"
+#include "polly/CodeGen/PTXGenerator.h"
 #include "polly/CodeGen/Utils.h"
 #include "polly/Support/GICHelper.h"

@ -65,6 +66,17 @@ OpenMP("enable-polly-openmp",
       cl::value_desc("OpenMP code generation enabled if true"),
       cl::init(false), cl::ZeroOrMore);

+// Hidden switch that enables the GPGPU code generation path (off by default).
+static cl::opt<bool>
+GPGPU("enable-polly-gpgpu", cl::desc("Generate GPU parallel code"),
+      cl::Hidden, cl::value_desc("GPGPU code generation enabled if true"),
+      cl::init(false), cl::ZeroOrMore);
+
+// Hidden option naming the target triple of the GPU; empty by default.
+static cl::opt<std::string>
+GPUTriple("polly-gpgpu-triple",
+          cl::desc("Target triple for GPU code generation"), cl::Hidden,
+          cl::init(""));
+
 static cl::opt<bool>
 AtLeastOnce("enable-polly-atLeastOnce",
       cl::desc("Give polly the hint, that every loop is executed at least"
@ -284,6 +296,25 @@ private:
  /// statement.
  void codegenForOpenMP(const clast_for *f);

+  /// @brief Collect the base addresses of all arrays accessed in the SCoP.
+  ///
+  /// The returned values will be set to be parameters of the GPGPU
+  /// subfunction. @p OutputBytes is set to the size in bytes of the array
+  /// that is written, or to 0 if no such size could be determined.
+  SetVector<Value*> getGPUValues(unsigned &OutputBytes);
+
+  /// @brief Generate GPGPU code for the parallel loop nest rooted at @p F.
+  ///
+  /// The statement inside the loop nest is emitted into a PTX subfunction
+  /// created by the PTXGenerator instead of into a host loop.
+  void codegenForGPGPU(const clast_for *F);
+
+  /// @brief Collect loop counts; return the innermost parallel loop's body.
+  const clast_stmt *getScheduleInfo(const clast_for *F,
+                                    std::vector<int> &NumIters,
+                                    unsigned &LoopDepth,
+                                    unsigned &NonPLoopDepth);
+
  /// @brief Check if a loop is parallel
  ///
  /// Detect if a clast_for loop can be executed in parallel.
@ -530,6 +561,161 @@ void ClastStmtCodeGen::codegenForOpenMP(const clast_for *For) {
  Builder.SetInsertPoint(AfterLoop);
 }

+/// @brief Compute the size in bytes of a (possibly nested) array type.
+///
+/// The element counts of all nested array dimensions are multiplied with the
+/// primitive size (in bytes) of the innermost element type.
+static unsigned getArraySizeInBytes(const ArrayType *AT) {
+  unsigned Bytes = 1;
+  const ArrayType *Current = AT;
+
+  // Walk from the outermost to the innermost array dimension.
+  for (;;) {
+    Bytes *= Current->getNumElements();
+    Type *ElementTy = Current->getElementType();
+    const ArrayType *Nested = dyn_cast<ArrayType>(ElementTy);
+    if (!Nested) {
+      Bytes *= ElementTy->getPrimitiveSizeInBits() / 8;
+      return Bytes;
+    }
+    Current = Nested;
+  }
+}
+
+/// @brief Collect the base addresses of all memory accesses of the SCoP.
+///
+/// @param OutputBytes  Reset to 0 and then set to the size in bytes of the
+///                     array that is written, if that array is accessed
+///                     through a pointer to an array type.
+/// @return             The set of base addresses of all memory accesses.
+SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
+  SetVector<Value*> Values;
+  OutputBytes = 0;
+
+  // FIXME: we assume that there is one and only one array to be written
+  // in a SCoP. The base address of that array is remembered across all
+  // statements and accesses, such that a second written array is detected.
+  Value *WrittenBaseAddr = 0;
+
+  // Record the memory reference base addresses.
+  for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
+    ScopStmt *Stmt = *SI;
+    for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
+         E = Stmt->memacc_end(); I != E; ++I) {
+      Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
+      Values.insert(BaseAddr);
+
+      if (!(*I)->isWrite())
+        continue;
+
+      assert((!WrittenBaseAddr || WrittenBaseAddr == BaseAddr) &&
+             "We support at most one array to be written in a SCoP.");
+      WrittenBaseAddr = BaseAddr;
+
+      if (const PointerType * PT =
+          dyn_cast<PointerType>(BaseAddr->getType())) {
+        Type *T = PT->getArrayElementType();
+        const ArrayType *ATy = dyn_cast<ArrayType>(T);
+        assert(ATy && "The written base address must point to an array.");
+        // Never hand a null type to getArraySizeInBytes (it dereferences it).
+        if (ATy)
+          OutputBytes = getArraySizeInBytes(ATy);
+      }
+    }
+  }
+
+  return Values;
+}
+
+/// @brief Walk down the loop nest rooted at @p F and collect schedule info.
+///
+/// @param F              The outermost loop of the nest.
+/// @param NumIters       The number of iterations of every parallel loop that
+///                       is found before the first non-parallel loop is
+///                       appended here. Expected to be empty on entry.
+/// @param LoopDepth      Set to the number of loops found on the way down.
+/// @param NonPLoopDepth  Set to LoopDepth minus the size of NumIters.
+/// @return               The body of the last parallel loop that was recorded
+///                       in NumIters.
+const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
+                                                    std::vector<int> &NumIters,
+                                                    unsigned &LoopDepth,
+                                                    unsigned &NonPLoopDepth) {
+  clast_stmt *Stmt = (clast_stmt *)F;
+  // Initialized to null so that a nest without a leading parallel loop is
+  // detected instead of reading an indeterminate pointer.
+  const clast_for *Result = 0;
+  bool NonParaFlag = false;
+  LoopDepth = 0;
+  NonPLoopDepth = 0;
+
+  while (Stmt) {
+    if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
+      const clast_for *T = (clast_for *) Stmt;
+      if (isParallelFor(T)) {
+        // Parallel loops nested below a non-parallel loop are not recorded.
+        if (!NonParaFlag) {
+          NumIters.push_back(getNumberOfIterations(T));
+          Result = T;
+        }
+      } else
+        NonParaFlag = true;
+
+      // Descend into the loop body.
+      Stmt = T->body;
+      LoopDepth++;
+      continue;
+    }
+    Stmt = Stmt->next;
+  }
+
+  assert(NumIters.size() == 4 &&
+         "The loops should be tiled into 4-depth parallel loops and an "
+         "innermost non-parallel one (if exist).");
+  assert(Result && "No parallel loop found in the loop nest.");
+  NonPLoopDepth = LoopDepth - NumIters.size();
+  assert(NonPLoopDepth <= 1
+         && "We support only one innermost non-parallel loop currently.");
+  return (const clast_stmt *)Result->body;
+}
+
+/// @brief Generate GPGPU code for the parallel loop nest rooted at @p F.
+///
+/// The statement inside the loop nest is generated into the PTX subfunction
+/// created by PTXGenerator::startGeneration(). If the nest contains a
+/// non-parallel loop, that loop is generated inside the subfunction as well.
+void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
+  BasicBlock::iterator LoopBody;
+  SetVector<Value *> Values;
+  SetVector<Value *> IVS;
+  std::vector<int> NumIterations;
+  PTXGenerator::ValueToValueMapTy VMap;
+
+  assert(!GPUTriple.empty()
+         && "Target triple should be set properly for GPGPU code generation.");
+  PTXGenerator PTXGen(Builder, P, GPUTriple);
+
+  // Get original IVS and ScopStmt
+  unsigned TiledLoopDepth, NonPLoopDepth;
+  const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations,
+                                                TiledLoopDepth, NonPLoopDepth);
+  const clast_stmt *TmpStmt;
+  const clast_user_stmt *U;
+  // Only set if the innermost statement is itself a (non-parallel) loop.
+  const clast_for *InnerFor = 0;
+  if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
+    InnerFor = (const clast_for *)InnerStmt;
+    TmpStmt = InnerFor->body;
+  } else
+    TmpStmt = InnerStmt;
+  U = (const clast_user_stmt *) TmpStmt;
+  ScopStmt *Statement = (ScopStmt *) U->statement->usr;
+  // The induction variables of the parallel dimensions are replaced by GPU
+  // indices; the dimension of the non-parallel loop (if any) is excluded.
+  for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
+    const Value* IV = Statement->getInductionVariableForDimension(i);
+    IVS.insert(const_cast<Value *>(IV));
+  }
+
+  unsigned OutBytes;
+  Values = getGPUValues(OutBytes);
+  PTXGen.setOutputBytes(OutBytes);
+  PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);
+
+  // Remember the host insert point and continue inside the subfunction.
+  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
+  Builder.SetInsertPoint(LoopBody);
+
+  BasicBlock *AfterBB = 0;
+  if (NonPLoopDepth) {
+    assert(InnerFor && "Expected an innermost non-parallel loop.");
+    Value *LowerBound, *UpperBound, *IV, *Stride;
+    Type *IntPtrTy = getIntPtrTy();
+    LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
+    UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
+    Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
+    IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB);
+    // NOTE(review): dimension 2 is hard-coded here; this presumably relies on
+    // the "2-dimensional parallel loop plus one inner loop" restriction.
+    const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
+    Value *OldIV = const_cast<Value *>(OldIV_);
+    // Let the template arguments be deduced: spelling them out explicitly
+    // does not compile with lvalue arguments under C++11.
+    VMap.insert(std::make_pair(OldIV, IV));
+  }
+
+  updateWithValueMap(VMap, /* reverse */ false);
+  BlockGenerator::generate(Builder, *Statement, ValueMap, P);
+  updateWithValueMap(VMap, /* reverse */ true);
+
+  if (AfterBB)
+    Builder.SetInsertPoint(AfterBB->begin());
+
+  // FIXME: The replacement of the host base address with the parameter of ptx
+  // subfunction should have been done by updateWithValueMap. We use the
+  // following codes to avoid affecting other parts of Polly. This should be
+  // fixed later.
+  Function *FN = Builder.GetInsertBlock()->getParent();
+  for (unsigned j = 0; j < Values.size(); j++) {
+    Value *baseAddr = Values[j];
+    for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
+      for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
+        I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
+    }
+  }
+  // Return to the host code and emit the kernel launch.
+  Builder.SetInsertPoint(AfterLoop);
+  PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
+                                NumIterations[2], NumIterations[3]);
+  PTXGen.finishGeneration(FN);
+}
+
 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
  const clast_stmt *stmt = f->body;

@ -647,6 +833,16 @@ void ClastStmtCodeGen::codegen(const clast_for *f) {
    }
  }

+  if (GPGPU && isParallelFor(f)) {
+    if (!parallelCodeGeneration) {
+      parallelCodeGeneration = true;
+      parallelLoops.push_back(f->iterator);
+      codegenForGPGPU(f);
+      parallelCodeGeneration = false;
+      return;
+    }
+  }
+
  codegenForSequential(f);
 }

--- a/polly/lib/CodeGen/PTXGenerator.cpp
+++ b/polly/lib/CodeGen/PTXGenerator.cpp
@ -0,0 +1,652 @@
+//===- PTXGenerator.cpp - IR helper to create GPGPU LLVM-IR ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains functions to create GPU parallel codes as LLVM-IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/CodeGen/PTXGenerator.h"
+#include "polly/ScopDetection.h"
+#include "polly/ScopInfo.h"
+
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+using namespace polly;
+
+/// Set up a generator with 1x1 launch dimensions and an output size of zero,
+/// then create the GPU data types.
+PTXGenerator::PTXGenerator(IRBuilder<> &Builder, Pass *P,
+                           const std::string &Triple)
+  : Builder(Builder),
+    P(P),
+    GPUTriple(Triple),
+    GridWidth(1),
+    GridHeight(1),
+    BlockWidth(1),
+    BlockHeight(1),
+    OutputBytes(0) {
+  InitializeGPUDataTypes();
+}
+
+/// Return the module that owns the function the builder currently inserts
+/// into.
+Module *PTXGenerator::getModule() {
+  BasicBlock *InsertBB = Builder.GetInsertBlock();
+  Function *CurrentFn = InsertBB->getParent();
+  return CurrentFn->getParent();
+}
+
+/// Create an empty, internal PTX kernel function named after the current
+/// function with the suffix "_ptx_subfn". It returns void and takes NumArgs
+/// parameters of type i8*, each named "ptx.Array".
+Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) {
+  assert(NumArgs == 1 && "we support only one array access now.");
+
+  Function *HostFn = Builder.GetInsertBlock()->getParent();
+
+  // All kernel parameters are passed as i8 pointers.
+  std::vector<Type*> ParamTys;
+  for (int Idx = 0; Idx < NumArgs; ++Idx)
+    ParamTys.push_back(Builder.getInt8PtrTy());
+
+  FunctionType *SubFnTy =
+    FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+  Function *SubFn = Function::Create(SubFnTy, Function::InternalLinkage,
+                                     HostFn->getName() + "_ptx_subfn",
+                                     getModule());
+  SubFn->setCallingConv(CallingConv::PTX_Kernel);
+
+  // Do not run any optimization pass on the new function.
+  P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(SubFn);
+
+  Function::arg_iterator Arg = SubFn->arg_begin();
+  while (Arg != SubFn->arg_end()) {
+    Arg->setName("ptx.Array");
+    ++Arg;
+  }
+
+  return SubFn;
+}
+
+/// @brief Create the PTX subfunction and prepare it for the loop body.
+///
+/// The subfunction consists of the blocks "ptx.setup", "ptx.loop_body" and
+/// "ptx.exit". On return the builder is positioned in front of the branch
+/// that terminates "ptx.loop_body".
+///
+/// @param UsedValues   Base addresses used by the loop body. Each of them is
+///                     mapped in VMap to a bitcast of a subfunction argument.
+/// @param OriginalIVS  The original induction variables (1 to 4). Each of
+///                     them is mapped in VMap to a value computed from the
+///                     PTX block and thread index intrinsics.
+/// @param VMap         Filled with the mappings described above.
+/// @param SubFunction  The newly created subfunction is returned here. It is
+///                     left untouched if the number of induction variables is
+///                     not supported.
+void PTXGenerator::createSubfunction(SetVector<Value*> &UsedValues,
+                                     SetVector<Value*> &OriginalIVS,
+                                     PTXGenerator::ValueToValueMapTy &VMap,
+                                     Function **SubFunction) {
+  Function *FN = createSubfunctionDefinition(UsedValues.size());
+  Module *M = getModule();
+  LLVMContext &Context = FN->getContext();
+  IntegerType *Ty = Builder.getInt64Ty();
+
+  // Store the previous basic block.
+  BasicBlock *PrevBB = Builder.GetInsertBlock();
+
+  // Create basic blocks.
+  BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN);
+  BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN);
+  BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN);
+
+  DominatorTree &DT = P->getAnalysis<DominatorTree>();
+  DT.addNewBlock(HeaderBB, PrevBB);
+  DT.addNewBlock(ExitBB, HeaderBB);
+  DT.addNewBlock(BodyBB, HeaderBB);
+
+  Builder.SetInsertPoint(HeaderBB);
+
+  // Insert VMap items with maps of array base address on the host to base
+  // address on the device.
+  Function::arg_iterator AI = FN->arg_begin();
+  for (unsigned j = 0; j < UsedValues.size(); j++) {
+    Value *BaseAddr = UsedValues[j];
+    Type *ArrayTy = BaseAddr->getType();
+    Value *Param = Builder.CreateBitCast(AI, ArrayTy);
+    // The template arguments are deduced; spelling them out explicitly does
+    // not compile with lvalue arguments under C++11.
+    VMap.insert(std::make_pair(BaseAddr, Param));
+    ++AI;
+  }
+
+  // FIXME: These intrinsics should be inserted on-demand. However, we insert
+  // them all currently for simplicity.
+  Function *GetNctaidX =
+    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x);
+  Function *GetNctaidY =
+    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y);
+  Function *GetCtaidX =
+    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x);
+  Function *GetCtaidY =
+    Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y);
+  Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x);
+  Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y);
+  Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x);
+  Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y);
+
+  // Read all grid/block dimensions and indices and extend them to i64.
+  Value *GridWidth = Builder.CreateCall(GetNctaidX);
+  GridWidth = Builder.CreateIntCast(GridWidth, Ty, false);
+  Value *GridHeight = Builder.CreateCall(GetNctaidY);
+  GridHeight = Builder.CreateIntCast(GridHeight, Ty, false);
+  Value *BlockWidth = Builder.CreateCall(GetNtidX);
+  BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false);
+  Value *BlockHeight = Builder.CreateCall(GetNtidY);
+  BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false);
+  Value *BIDx = Builder.CreateCall(GetCtaidX);
+  BIDx = Builder.CreateIntCast(BIDx, Ty, false);
+  Value *BIDy = Builder.CreateCall(GetCtaidY);
+  BIDy = Builder.CreateIntCast(BIDy, Ty, false);
+  Value *TIDx = Builder.CreateCall(GetTidX);
+  TIDx = Builder.CreateIntCast(TIDx, Ty, false);
+  Value *TIDy = Builder.CreateCall(GetTidY);
+  TIDy = Builder.CreateIntCast(TIDy, Ty, false);
+
+  Builder.CreateBr(BodyBB);
+  Builder.SetInsertPoint(BodyBB);
+
+  // Compute one substitution per original induction variable. The fewer
+  // induction variables there are, the more indices are linearized into one.
+  unsigned NumDims = OriginalIVS.size();
+  std::vector<Value *> Substitutions;
+  Value *BlockID, *ThreadID;
+  switch (NumDims) {
+  case 1: {
+    Value *BlockSize = Builder.CreateMul(BlockWidth, BlockHeight,
+                                         "p_gpu_blocksize");
+    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
+    BlockID = Builder.CreateAdd(BlockID, BIDx);
+    BlockID = Builder.CreateMul(BlockID, BlockSize);
+    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
+    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
+    ThreadID = Builder.CreateAdd(ThreadID, BlockID);
+    Substitutions.push_back(ThreadID);
+    break;
+  }
+  case 2: {
+    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
+    BlockID = Builder.CreateAdd(BlockID, BIDx);
+    Substitutions.push_back(BlockID);
+    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
+    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
+    Substitutions.push_back(ThreadID);
+    break;
+  }
+  case 3: {
+    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
+    BlockID = Builder.CreateAdd(BlockID, BIDx);
+    Substitutions.push_back(BlockID);
+    Substitutions.push_back(TIDy);
+    Substitutions.push_back(TIDx);
+    break;
+  }
+  case 4: {
+    Substitutions.push_back(BIDy);
+    Substitutions.push_back(BIDx);
+    Substitutions.push_back(TIDy);
+    Substitutions.push_back(TIDx);
+    break;
+  }
+  default:
+    // The condition must be false for the assertion to fire at all.
+    assert(false &&
+           "We cannot transform parallel loops whose depth is larger than 4.");
+    return;
+  }
+
+  assert(OriginalIVS.size() == Substitutions.size()
+         && "The size of IVS should be equal to the size of substitutions.");
+  for (unsigned i = 0; i < OriginalIVS.size(); ++i)
+    VMap.insert(std::make_pair(OriginalIVS[i], Substitutions[i]));
+
+  // Terminate the body block and step back in front of the new branch, so
+  // that the loop body is inserted before it.
+  Builder.CreateBr(ExitBB);
+  Builder.SetInsertPoint(--Builder.GetInsertPoint());
+  BasicBlock::iterator LoopBody = Builder.GetInsertPoint();
+
+  // Add the termination of the ptx-device subfunction.
+  Builder.SetInsertPoint(ExitBB);
+  Builder.CreateRetVoid();
+
+  Builder.SetInsertPoint(LoopBody);
+  *SubFunction = FN;
+}
+
+/// Create the PTX subfunction, report the position of its loop body through
+/// LoopBody and restore the insert point the builder had on entry.
+void PTXGenerator::startGeneration(SetVector<Value*> &UsedValues,
+                                   SetVector<Value*> &OriginalIVS,
+                                   ValueToValueMapTy &VMap,
+                                   BasicBlock::iterator *LoopBody) {
+  Function *Kernel;
+  BasicBlock::iterator HostInsertPoint = Builder.GetInsertPoint();
+
+  // createSubfunction() leaves the builder at the start of the loop body.
+  createSubfunction(UsedValues, OriginalIVS, VMap, &Kernel);
+  *LoopBody = Builder.GetInsertPoint();
+
+  Builder.SetInsertPoint(HostInsertPoint);
+}
+
+/// Return the 64 bit integer type (i64).
+IntegerType *PTXGenerator::getInt64Type() {
+  IntegerType *Int64Ty = Builder.getInt64Ty();
+  return Int64Ty;
+}
+
+/// Return the type i8* in the default address space.
+PointerType *PTXGenerator::getI8PtrType() {
+  Type *Int8Ty = Builder.getInt8Ty();
+  return PointerType::getUnqual(Int8Ty);
+}
+
+/// Return the type i8**.
+PointerType *PTXGenerator::getPtrI8PtrType() {
+  PointerType *Int8PtrTy = getI8PtrType();
+  return PointerType::getUnqual(Int8PtrTy);
+}
+
+/// Return the type float*, created in the context of the current module.
+PointerType *PTXGenerator::getFloatPtrType() {
+  LLVMContext &Ctx = getModule()->getContext();
+  return llvm::Type::getFloatPtrTy(Ctx);
+}
+
+/// Return a pointer type to %struct.PollyGPUContextT.
+PointerType *PTXGenerator::getGPUContextPtrType() {
+  PointerType *ContextPtrTy = PointerType::getUnqual(ContextTy);
+  return ContextPtrTy;
+}
+
+/// Return a pointer type to %struct.PollyGPUModuleT.
+PointerType *PTXGenerator::getGPUModulePtrType() {
+  PointerType *ModulePtrTy = PointerType::getUnqual(ModuleTy);
+  return ModulePtrTy;
+}
+
+/// Return a pointer type to %struct.PollyGPUDeviceT.
+PointerType *PTXGenerator::getGPUDevicePtrType() {
+  PointerType *DevicePtrTy = PointerType::getUnqual(DeviceTy);
+  return DevicePtrTy;
+}
+
+/// Return a pointer type to %struct.PollyGPUDevicePtrT.
+PointerType *PTXGenerator::getPtrGPUDevicePtrType() {
+  PointerType *DevDataPtrTy = PointerType::getUnqual(DevDataTy);
+  return DevDataPtrTy;
+}
+
+/// Return a pointer type to %struct.PollyGPUFunctionT.
+PointerType *PTXGenerator::getGPUFunctionPtrType() {
+  PointerType *KernelPtrTy = PointerType::getUnqual(KernelTy);
+  return KernelPtrTy;
+}
+
+/// Return a pointer type to %struct.PollyGPUEventT.
+PointerType *PTXGenerator::getGPUEventPtrType() {
+  PointerType *EventPtrTy = PointerType::getUnqual(EventTy);
+  return EventPtrTy;
+}
+
+/// Create the named struct types that stand for the GPU runtime's handle
+/// types. No struct body is set for any of them here.
+void PTXGenerator::InitializeGPUDataTypes() {
+  LLVMContext &Ctx = getModule()->getContext();
+
+  ContextTy = StructType::create(Ctx, "struct.PollyGPUContextT");
+  ModuleTy  = StructType::create(Ctx, "struct.PollyGPUModuleT");
+  KernelTy  = StructType::create(Ctx, "struct.PollyGPUFunctionT");
+  DeviceTy  = StructType::create(Ctx, "struct.PollyGPUDeviceT");
+  DevDataTy = StructType::create(Ctx, "struct.PollyGPUDevicePtrT");
+  EventTy   = StructType::create(Ctx, "struct.PollyGPUEventT");
+}
+
+/// Emit a call to polly_initDevice(Context, Device). The function is declared
+/// in the current module if it is not available yet.
+void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) {
+  const char *FnName = "polly_initDevice";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(FnName);
+
+  // Declare the function on first use.
+  if (!Callee) {
+    Type *ParamTys[] = {
+      PointerType::getUnqual(getGPUContextPtrType()),
+      PointerType::getUnqual(getGPUDevicePtrType())
+    };
+    FunctionType *FnTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, FnName, M);
+  }
+
+  Builder.CreateCall2(Callee, Context, Device);
+}
+
+/// Emit a call to polly_getPTXModule(Buffer, Module). The function is
+/// declared in the current module if it is not available yet.
+void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) {
+  const char *FnName = "polly_getPTXModule";
+  // The parameter 'Module' hides the type name, hence the qualification.
+  llvm::Module *M = getModule();
+  Function *Callee = M->getFunction(FnName);
+
+  // Declare the function on first use.
+  if (!Callee) {
+    Type *ParamTys[] = {
+      getI8PtrType(),
+      PointerType::getUnqual(getGPUModulePtrType())
+    };
+    FunctionType *FnTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, FnName, M);
+  }
+
+  Builder.CreateCall2(Callee, Buffer, Module);
+}
+
+/// Emit a call to polly_getPTXKernelEntry(Entry, Module, Kernel). The
+/// function is declared in the current module if it is not available yet.
+void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module,
+                                               Value *Kernel) {
+  const char *FnName = "polly_getPTXKernelEntry";
+  // The parameter 'Module' hides the type name, hence the qualification.
+  llvm::Module *M = getModule();
+  Function *Callee = M->getFunction(FnName);
+
+  // Declare the function on first use.
+  if (!Callee) {
+    Type *ParamTys[] = {
+      getI8PtrType(),
+      getGPUModulePtrType(),
+      PointerType::getUnqual(getGPUFunctionPtrType())
+    };
+    FunctionType *FnTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, FnName, M);
+  }
+
+  Builder.CreateCall3(Callee, Entry, Module, Kernel);
+}
+
+/// Emit a call to polly_allocateMemoryForHostAndDevice(HostData, DeviceData,
+/// Size). The function is declared in the current module if it is not
+/// available yet.
+void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData,
+                                                            Value *DeviceData,
+                                                            Value *Size) {
+  const char *FnName = "polly_allocateMemoryForHostAndDevice";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(FnName);
+
+  // Declare the function on first use.
+  if (!Callee) {
+    Type *ParamTys[] = {
+      getPtrI8PtrType(),
+      PointerType::getUnqual(getPtrGPUDevicePtrType()),
+      getInt64Type()
+    };
+    FunctionType *FnTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, FnName, M);
+  }
+
+  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
+}
+
+/// Emit a call to polly_copyFromHostToDevice(DeviceData, HostData, Size). The
+/// function is declared in the current module if it is not available yet.
+void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData,
+                                                  Value *HostData,
+                                                  Value *Size) {
+  const char *FnName = "polly_copyFromHostToDevice";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(FnName);
+
+  // Declare the function on first use.
+  if (!Callee) {
+    Type *ParamTys[] = {
+      getPtrGPUDevicePtrType(),
+      getI8PtrType(),
+      getInt64Type()
+    };
+    FunctionType *FnTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, FnName, M);
+  }
+
+  Builder.CreateCall3(Callee, DeviceData, HostData, Size);
+}
+
+/// Emit a call to polly_copyFromDeviceToHost(HostData, DeviceData, Size). The
+/// function is declared in the current module if it is not available yet.
+void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData,
+                                                  Value *DeviceData,
+                                                  Value *Size) {
+  const char *FnName = "polly_copyFromDeviceToHost";
+  Module *M = getModule();
+  Function *Callee = M->getFunction(FnName);
+
+  // Declare the function on first use.
+  if (!Callee) {
+    Type *ParamTys[] = {
+      getI8PtrType(),
+      getPtrGPUDevicePtrType(),
+      getInt64Type()
+    };
+    FunctionType *FnTy =
+      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, FnName, M);
+  }
+
+  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
+}
+
+/// @brief Emit a call to the runtime function "polly_setKernelParameters".
+///
+/// The callee is looked up by name in the current module and declared there
+/// (void return, external linkage, non-variadic) if it is not yet present.
+///
+/// @param Kernel       First call argument; the declared parameter type is
+///                     getGPUFunctionPtrType().
+/// @param BlockWidth   Second call argument; declared as getInt64Type().
+/// @param BlockHeight  Third call argument; declared as getInt64Type().
+/// @param DeviceData   Fourth call argument; the declared parameter type is
+///                     getPtrGPUDevicePtrType().
+void PTXGenerator::createCallSetKernelParameters(Value *Kernel,
+                                                 Value *BlockWidth,
+                                                 Value *BlockHeight,
+                                                 Value *DeviceData) {
+  const char *RuntimeName = "polly_setKernelParameters";
+  Module *Mod = getModule();
+  Function *Callee = Mod->getFunction(RuntimeName);
+
+  // Add the prototype on first use so that the call below can refer to it.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getGPUFunctionPtrType());
+    ParamTys.push_back(getInt64Type());
+    ParamTys.push_back(getInt64Type());
+    ParamTys.push_back(getPtrGPUDevicePtrType());
+    FunctionType *FnTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
+                                           false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, RuntimeName,
+                              Mod);
+  }
+
+  Builder.CreateCall4(Callee, Kernel, BlockWidth, BlockHeight, DeviceData);
+}
+
+/// @brief Emit a call to the runtime function "polly_launchKernel".
+///
+/// The callee is looked up by name in the current module and declared there
+/// (void return, external linkage, non-variadic) if it is not yet present.
+///
+/// @param Kernel      First call argument; the declared parameter type is
+///                    getGPUFunctionPtrType().
+/// @param GridWidth   Second call argument; declared as getInt64Type().
+/// @param GridHeight  Third call argument; declared as getInt64Type().
+void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth,
+                                          Value *GridHeight) {
+  const char *RuntimeName = "polly_launchKernel";
+  Module *Mod = getModule();
+  Function *Callee = Mod->getFunction(RuntimeName);
+
+  // Add the prototype on first use so that the call below can refer to it.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getGPUFunctionPtrType());
+    ParamTys.push_back(getInt64Type());
+    ParamTys.push_back(getInt64Type());
+    FunctionType *FnTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
+                                           false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, RuntimeName,
+                              Mod);
+  }
+
+  Builder.CreateCall3(Callee, Kernel, GridWidth, GridHeight);
+}
+
+/// @brief Emit a call to the runtime function "polly_startTimerByCudaEvent".
+///
+/// The callee is looked up by name in the current module and declared there
+/// (void return, external linkage, non-variadic) if it is not yet present.
+/// Both parameters are declared as pointers to getGPUEventPtrType().
+///
+/// @param StartEvent  First call argument.
+/// @param StopEvent   Second call argument.
+void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent,
+                                                   Value *StopEvent) {
+  const char *RuntimeName = "polly_startTimerByCudaEvent";
+  Module *Mod = getModule();
+  Function *Callee = Mod->getFunction(RuntimeName);
+
+  // Add the prototype on first use so that the call below can refer to it.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(PointerType::getUnqual(getGPUEventPtrType()));
+    ParamTys.push_back(PointerType::getUnqual(getGPUEventPtrType()));
+    FunctionType *FnTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
+                                           false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, RuntimeName,
+                              Mod);
+  }
+
+  Builder.CreateCall2(Callee, StartEvent, StopEvent);
+}
+
+/// @brief Emit a call to the runtime function "polly_stopTimerByCudaEvent".
+///
+/// The callee is looked up by name in the current module and declared there
+/// (void return, external linkage, non-variadic) if it is not yet present.
+///
+/// @param StartEvent  First call argument; declared as getGPUEventPtrType().
+/// @param StopEvent   Second call argument; declared as getGPUEventPtrType().
+/// @param Timer       Third call argument; declared as getFloatPtrType().
+void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent,
+                                                  Value *StopEvent,
+                                                  Value *Timer) {
+  const char *RuntimeName = "polly_stopTimerByCudaEvent";
+  Module *Mod = getModule();
+  Function *Callee = Mod->getFunction(RuntimeName);
+
+  // Add the prototype on first use so that the call below can refer to it.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getGPUEventPtrType());
+    ParamTys.push_back(getGPUEventPtrType());
+    ParamTys.push_back(getFloatPtrType());
+    FunctionType *FnTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
+                                           false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, RuntimeName,
+                              Mod);
+  }
+
+  Builder.CreateCall3(Callee, StartEvent, StopEvent, Timer);
+}
+
+/// @brief Emit a call to the runtime function "polly_cleanupGPGPUResources".
+///
+/// The callee is looked up by name in the current module and declared there
+/// (void return, external linkage, non-variadic) if it is not yet present.
+///
+/// @param HostData    First call argument; declared as getI8PtrType().
+/// @param DeviceData  Second call argument; declared as
+///                    getPtrGPUDevicePtrType().
+/// @param Module      Third call argument; declared as getGPUModulePtrType().
+/// @param Context     Fourth call argument; declared as
+///                    getGPUContextPtrType().
+/// @param Kernel      Fifth call argument; declared as
+///                    getGPUFunctionPtrType().
+void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData,
+                                                   Value *DeviceData,
+                                                   Value *Module,
+                                                   Value *Context,
+                                                   Value *Kernel) {
+  const char *RuntimeName = "polly_cleanupGPGPUResources";
+  // The parameter named 'Module' hides the type name, so qualify the type.
+  llvm::Module *Mod = getModule();
+  Function *Callee = Mod->getFunction(RuntimeName);
+
+  // Add the prototype on first use so that the call below can refer to it.
+  if (!Callee) {
+    std::vector<Type*> ParamTys;
+    ParamTys.push_back(getI8PtrType());
+    ParamTys.push_back(getPtrGPUDevicePtrType());
+    ParamTys.push_back(getGPUModulePtrType());
+    ParamTys.push_back(getGPUContextPtrType());
+    ParamTys.push_back(getGPUFunctionPtrType());
+    FunctionType *FnTy = FunctionType::get(Builder.getVoidTy(), ParamTys,
+                                           false);
+    Callee = Function::Create(FnTy, Function::ExternalLinkage, RuntimeName,
+                              Mod);
+  }
+
+  Builder.CreateCall5(Callee, HostData, DeviceData, Module, Context, Kernel);
+}
+
+/// @brief Return the member GridWidth as a constant of type getInt64Type().
+Value *PTXGenerator::getCUDAGridWidth() {
+  Value *Width = ConstantInt::get(getInt64Type(), GridWidth);
+  return Width;
+}
+
+/// @brief Return the member GridHeight as a constant of type getInt64Type().
+Value *PTXGenerator::getCUDAGridHeight() {
+  Value *Height = ConstantInt::get(getInt64Type(), GridHeight);
+  return Height;
+}
+
+/// @brief Return the member BlockWidth as a constant of type getInt64Type().
+Value *PTXGenerator::getCUDABlockWidth() {
+  Value *Width = ConstantInt::get(getInt64Type(), BlockWidth);
+  return Width;
+}
+
+/// @brief Return the member BlockHeight as a constant of type getInt64Type().
+Value *PTXGenerator::getCUDABlockHeight() {
+  Value *Height = ConstantInt::get(getInt64Type(), BlockHeight);
+  return Height;
+}
+
+/// @brief Return the member OutputBytes as a constant of type getInt64Type().
+Value *PTXGenerator::getOutputArraySizeInBytes() {
+  Value *Bytes = ConstantInt::get(getInt64Type(), OutputBytes);
+  return Bytes;
+}
+
+/// @brief Embed the textual LLVM-IR of the kernel and emit the code
+///        generation intrinsic call for it.
+///
+/// The kernel text consists of a "target triple" line built from GPUTriple,
+/// the printed form of @p SubFunction and the printed declarations of the ptx
+/// read_{nctaid,ctaid,ntid,tid}_{x,y} intrinsics that are declared in the
+/// current module. The text is stored as the global string "llvm_kernel" and
+/// passed, together with the strings "sm_10" ("mcpu") and "" ("cpu_features"),
+/// to a call of the Intrinsic::codegen declaration.
+///
+/// @param SubFunction  The function whose printed form becomes the kernel.
+/// @return The call instruction to the Intrinsic::codegen declaration.
+Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) {
+  Module *M = getModule();
+  std::string LLVMKernelStr;
+  raw_string_ostream NameROS(LLVMKernelStr);
+  formatted_raw_ostream FOS(NameROS);
+  FOS << "target triple = \"" << GPUTriple <<"\"\n";
+  SubFunction->print(FOS);
+
+  // Insert ptx intrinsics into the kernel string.
+  for (Module::iterator I = M->begin(), E = M->end(); I != E; ) {
+    Function *F = I++;
+    // Function must be a prototype and unused.
+    if (F->isDeclaration() && F->isIntrinsic()) {
+      switch (F->getIntrinsicID()) {
+      case Intrinsic::ptx_read_nctaid_x:
+      case Intrinsic::ptx_read_nctaid_y:
+      case Intrinsic::ptx_read_ctaid_x:
+      case Intrinsic::ptx_read_ctaid_y:
+      case Intrinsic::ptx_read_ntid_x:
+      case Intrinsic::ptx_read_ntid_y:
+      case Intrinsic::ptx_read_tid_x:
+      case Intrinsic::ptx_read_tid_y:
+        F->print(FOS);
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  // Both streams may buffer their output. Push everything through to the
+  // backing string before reading it; otherwise the embedded kernel text can
+  // be empty or truncated.
+  FOS.flush();
+  Value *LLVMKernel = Builder.CreateGlobalStringPtr(NameROS.str(),
+                                                    "llvm_kernel");
+  Value *MCPU = Builder.CreateGlobalStringPtr("sm_10", "mcpu");
+  Value *Features = Builder.CreateGlobalStringPtr("", "cpu_features");
+
+  Function *GetDeviceKernel = Intrinsic::getDeclaration(M,
+                                                        Intrinsic::codegen);
+
+  return Builder.CreateCall3(GetDeviceKernel, LLVMKernel, MCPU, Features);
+}
+
+/// @brief Create the global string "ptx_entry" holding the name of
+///        @p SubFunction and return a pointer to it.
+Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) {
+  return Builder.CreateGlobalStringPtr(SubFunction->getName(), "ptx_entry");
+}
+
+/// @brief Remove @p SubFunction and the ptx read intrinsics from the module.
+///
+/// @p SubFunction is erased first. Afterwards each of the eight
+/// llvm.ptx.read.{nctaid,ctaid,ntid,tid}.{x,y} functions that is present in
+/// the current module is erased as well.
+///
+/// @param SubFunction  The function to erase from its parent module.
+void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) {
+  // Names are visited in exactly this order.
+  static const char *const PTXReadIntrinsics[] = {
+    "llvm.ptx.read.nctaid.x",
+    "llvm.ptx.read.nctaid.y",
+    "llvm.ptx.read.ctaid.x",
+    "llvm.ptx.read.ctaid.y",
+    "llvm.ptx.read.ntid.x",
+    "llvm.ptx.read.ntid.y",
+    "llvm.ptx.read.tid.x",
+    "llvm.ptx.read.tid.y"
+  };
+  const unsigned NumIntrinsics =
+    sizeof(PTXReadIntrinsics) / sizeof(PTXReadIntrinsics[0]);
+
+  Module *Mod = getModule();
+  SubFunction->eraseFromParent();
+
+  for (unsigned Idx = 0; Idx != NumIntrinsics; ++Idx) {
+    Function *Intr = Mod->getFunction(PTXReadIntrinsics[Idx]);
+    if (Intr)
+      Intr->eraseFromParent();
+  }
+}
+
+/// @brief Emit the host-side sequence of GPURuntime calls for kernel @p F.
+///
+/// In emission order: allocate the stack slots handed to the runtime,
+/// initialize the device, create the PTX module and look up the kernel entry,
+/// allocate host and device memory, set the kernel parameters, start the
+/// timer, launch the kernel, copy the result from the device to the host,
+/// stop the timer and clean up all resources. Finally @p F and the ptx
+/// intrinsics are erased from the current module.
+///
+/// @param F  The kernel function; it no longer exists once this returns.
+void PTXGenerator::finishGeneration(Function *F) {
+  // Stack slots that are handed to the GPURuntime library.
+  AllocaInst *ContextSlot =
+    Builder.CreateAlloca(getGPUContextPtrType(), 0, "phcontext");
+  AllocaInst *DeviceSlot =
+    Builder.CreateAlloca(getGPUDevicePtrType(), 0, "phdevice");
+  AllocaInst *ModuleSlot =
+    Builder.CreateAlloca(getGPUModulePtrType(), 0, "phmodule");
+  AllocaInst *KernelSlot =
+    Builder.CreateAlloca(getGPUFunctionPtrType(), 0, "phkernel");
+  AllocaInst *StartEventSlot =
+    Builder.CreateAlloca(getGPUEventPtrType(), 0, "pstart_timer");
+  AllocaInst *StopEventSlot =
+    Builder.CreateAlloca(getGPUEventPtrType(), 0, "pstop_timer");
+  AllocaInst *DevDataSlot =
+    Builder.CreateAlloca(getPtrGPUDevicePtrType(), 0, "pdevice_data");
+  AllocaInst *HostDataSlot =
+    Builder.CreateAlloca(getI8PtrType(), 0, "phost_data");
+  Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext());
+  AllocaInst *ElapsedSlot = Builder.CreateAlloca(FloatTy, 0, "ptimer");
+
+  // Bring up the GPU device.
+  createCallInitDevice(ContextSlot, DeviceSlot);
+
+  // Build the kernel module and resolve its entry function.
+  Value *KernelText = createPTXKernelFunction(F);
+  Value *EntryName = getPTXKernelEntryName(F);
+  createCallGetPTXModule(KernelText, ModuleSlot);
+  LoadInst *ModuleHandle = Builder.CreateLoad(ModuleSlot, "cumodule");
+  createCallGetPTXKernelEntry(EntryName, ModuleHandle, KernelSlot);
+
+  // Reserve device memory together with the matching host memory.
+  createCallAllocateMemoryForHostAndDevice(HostDataSlot, DevDataSlot,
+                                           getOutputArraySizeInBytes());
+
+  // Fetch the device memory pointer and configure the kernel execution.
+  LoadInst *DevData = Builder.CreateLoad(DevDataSlot, "device_data");
+  LoadInst *KernelHandle = Builder.CreateLoad(KernelSlot, "cukernel");
+  createCallSetKernelParameters(KernelHandle, getCUDABlockWidth(),
+                                getCUDABlockHeight(), DevData);
+
+  // Set up both timer events and record the start time.
+  createCallStartTimerByCudaEvent(StartEventSlot, StopEventSlot);
+
+  // Run the kernel on the GPU.
+  createCallLaunchKernel(KernelHandle, getCUDAGridWidth(),
+                         getCUDAGridHeight());
+
+  // Transfer the results from the GPU back to the host.
+  LoadInst *HostData = Builder.CreateLoad(HostDataSlot, "host_data");
+  createCallCopyFromDeviceToHost(HostData, DevData,
+                                 getOutputArraySizeInBytes());
+
+  // Record the end time.
+  LoadInst *StartEvent = Builder.CreateLoad(StartEventSlot, "start_timer");
+  LoadInst *StopEvent = Builder.CreateLoad(StopEventSlot, "stop_timer");
+  createCallStopTimerByCudaEvent(StartEvent, StopEvent, ElapsedSlot);
+
+  // Release every resource that was acquired above.
+  LoadInst *ContextHandle = Builder.CreateLoad(ContextSlot, "cucontext");
+  createCallCleanupGPGPUResources(HostData, DevData, ModuleHandle,
+                                  ContextHandle, KernelHandle);
+
+  // Drop the kernel function and the ptx intrinsics from the current module.
+  eraseUnusedFunctions(F);
+}
--- a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c
+++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.c
@ -0,0 +1,16 @@
+int A[128][128];
+
+/* Store the value row * 128 + col into every element A[row][col] of the
+   128x128 array and return 0. */
+int gpu_pure() {
+  int row, col;
+
+  for (row = 0; row < 128; ++row) {
+    for (col = 0; col < 128; ++col) {
+      A[row][col] = row * 128 + col;
+    }
+  }
+
+  return 0;
+}
+
+/* Run gpu_pure() once; its result is stored but never inspected. */
+int main() {
+  int status = gpu_pure();
+  (void)status;
+  return 0;
+}
--- a/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll
+++ b/polly/test/CodeGen/GPGPU/2d_innermost_parallel.ll
@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
+; ModuleID = '2d_innermost_parallel.s'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [128 x [128 x i32]] zeroinitializer, align 16
+
+define i32 @gpu_pure() nounwind uwtable { ; stores i*128 + j to @A[i][j] in a 128x128 loop nest, returns 0
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc6, %entry
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc6 ], [ 0, %entry ] ; outer induction variable, starts at 0
+  %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
+  %exitcond6 = icmp ne i32 %lftr.wideiv5, 128 ; outer loop runs while the counter != 128
+  br i1 %exitcond6, label %for.body, label %for.end8
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body ] ; inner induction variable, starts at 0
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128 ; inner loop runs while the counter != 128
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %tmp = shl nsw i64 %indvars.iv2, 7 ; outer * 128
+  %tmp7 = add nsw i64 %tmp, %indvars.iv ; outer * 128 + inner
+  %arrayidx5 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
+  %tmp8 = trunc i64 %tmp7 to i32
+  store i32 %tmp8, i32* %arrayidx5, align 4 ; the only memory access of the loop nest
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc6
+
+for.inc6:                                         ; preds = %for.end
+  %indvars.iv.next3 = add i64 %indvars.iv2, 1
+  br label %for.cond
+
+for.end8:                                         ; preds = %for.cond
+  ret i32 0
+}
+
+define i32 @main() nounwind uwtable { ; calls @gpu_pure once and returns 0
+entry:
+  %call = call i32 @gpu_pure() ; result is not used
+  ret i32 0
+}
+
+; CHECK:  call void @polly_initDevice
+; CHECK:  call void @polly_getPTXModule
+; CHECK:  call void @polly_getPTXKernelEntry
+; CHECK:  call void @polly_allocateMemoryForHostAndDevice
+; CHECK:  call void @polly_setKernelParameters
+; CHECK:  call void @polly_startTimerByCudaEvent
+; CHECK:  call void @polly_launchKernel
+; CHECK:  call void @polly_copyFromDeviceToHost
+; CHECK:  call void @polly_stopTimerByCudaEvent
+; CHECK:  call void @polly_cleanupGPGPUResources
--- a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
+++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.c
@ -0,0 +1,17 @@
+int A[128][128];
+
+/* For every element A[row][col] of the 128x128 array, add the term
+   row*123/(k+1) + 5 - col*k - 123 for each k in 0..255, then return 0. */
+int gpu_no_pure() {
+  int row, col, k;
+
+  for (row = 0; row < 128; ++row) {
+    for (col = 0; col < 128; ++col) {
+      for (k = 0; k < 256; ++k) {
+        A[row][col] += row * 123 / (k + 1) + 5 - col * k - 123;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/* Run gpu_no_pure() once; its result is stored but never inspected. */
+int main() {
+  int status = gpu_no_pure();
+  (void)status;
+  return 0;
+}
--- a/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
+++ b/polly/test/CodeGen/GPGPU/3d_innermost_non_parallel.ll
@ -0,0 +1,88 @@
+; RUN: opt %loadPolly -basicaa -polly-import-jscop -polly-import-jscop-dir=%S -polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu -polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen %s -S | FileCheck %s
+; ModuleID = '3d_innermost_non_parallel.s'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = common global [128 x [128 x i32]] zeroinitializer, align 16
+
+define i32 @gpu_no_pure() nounwind uwtable { ; 128x128x256 loop nest that accumulates into @A[i][j], returns 0
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc16, %entry
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc16 ], [ 0, %entry ] ; outermost induction variable, starts at 0
+  %lftr.wideiv5 = trunc i64 %indvars.iv2 to i32
+  %exitcond6 = icmp ne i32 %lftr.wideiv5, 128 ; outermost loop runs while the counter != 128
+  br i1 %exitcond6, label %for.body, label %for.end18
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc13, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc13 ], [ 0, %for.body ] ; middle induction variable, starts at 0
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond1 = icmp ne i32 %lftr.wideiv, 128 ; middle loop runs while the counter != 128
+  br i1 %exitcond1, label %for.body3, label %for.end15
+
+for.body3:                                        ; preds = %for.cond1
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc, %for.body3
+  %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ] ; innermost induction variable, starts at 0
+  %exitcond = icmp ne i32 %k.0, 256 ; innermost loop runs while k != 256
+  br i1 %exitcond, label %for.body6, label %for.end
+
+for.body6:                                        ; preds = %for.cond4
+  %tmp = mul nsw i64 %indvars.iv2, 123 ; i * 123
+  %add = add nsw i32 %k.0, 1 ; k + 1
+  %tmp7 = trunc i64 %tmp to i32
+  %div = sdiv i32 %tmp7, %add ; i * 123 / (k + 1)
+  %add7 = add nsw i32 %div, 5
+  %tmp8 = trunc i64 %indvars.iv to i32
+  %mul8 = mul nsw i32 %tmp8, %k.0 ; j * k
+  %sub = sub nsw i32 %add7, %mul8
+  %sub9 = add nsw i32 %sub, -123 ; i * 123 / (k + 1) + 5 - j * k - 123
+  %arrayidx11 = getelementptr inbounds [128 x [128 x i32]]* @A, i64 0, i64 %indvars.iv2, i64 %indvars.iv
+  %tmp9 = load i32* %arrayidx11, align 4 ; read the current value of A[i][j]
+  %add12 = add nsw i32 %tmp9, %sub9
+  store i32 %add12, i32* %arrayidx11, align 4 ; write the sum back to the same element
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body6
+  %inc = add nsw i32 %k.0, 1
+  br label %for.cond4
+
+for.end:                                          ; preds = %for.cond4
+  br label %for.inc13
+
+for.inc13:                                        ; preds = %for.end
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  br label %for.cond1
+
+for.end15:                                        ; preds = %for.cond1
+  br label %for.inc16
+
+for.inc16:                                        ; preds = %for.end15
+  %indvars.iv.next3 = add i64 %indvars.iv2, 1
+  br label %for.cond
+
+for.end18:                                        ; preds = %for.cond
+  ret i32 0
+}
+
+define i32 @main() nounwind uwtable { ; calls @gpu_no_pure once and returns 0
+entry:
+  %call = call i32 @gpu_no_pure() ; result is not used
+  ret i32 0
+}
+
+; CHECK:  call void @polly_initDevice
+; CHECK:  call void @polly_getPTXModule
+; CHECK:  call void @polly_getPTXKernelEntry
+; CHECK:  call void @polly_allocateMemoryForHostAndDevice
+; CHECK:  call void @polly_setKernelParameters
+; CHECK:  call void @polly_startTimerByCudaEvent
+; CHECK:  call void @polly_launchKernel
+; CHECK:  call void @polly_copyFromDeviceToHost
+; CHECK:  call void @polly_stopTimerByCudaEvent
+; CHECK:  call void @polly_cleanupGPGPUResources
--- a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
+++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end18",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }"
+      }
+   ]
+}
--- a/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
+++ b/polly/test/CodeGen/GPGPU/gpu_no_pure___%for.cond---%for.end18.jscop.transformed+gpu
@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end18",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 and i2 >= 0 and i2 <= 255 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, o0, o1, o2, o3, i2, 0] : o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
+      }
+   ]
+}
--- a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
+++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop
@ -0,0 +1,17 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end8",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0] }"
+      }
+   ]
+}
--- a/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
+++ b/polly/test/CodeGen/GPGPU/gpu_pure___%for.cond---%for.end8.jscop.transformed+gpu
@ -0,0 +1,17 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end8",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body3[i0, i1] -> MemRef_A[128i0 + i1] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= 127 and i1 >= 0 and i1 <= 127 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "{ Stmt_for_body3[i0, i1] -> scattering[0, o0, o1, o2, o3]: o0 >= 0 and o0 <= 7 and o1 >= 0 and o1 <= 15 and o2 >= 0 and o2 <= 7 and o3 >= 0 and o3 <= 15 and i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
+      }
+   ]
+}