From edb885cb1230b0693f68d13e5de3baf6b156ef73 Mon Sep 17 00:00:00 2001
From: Tobias Grosser <tobias@grosser.es>
Date: Thu, 21 Jul 2016 13:15:59 +0000
Subject: [PATCH] GPGPU: generate code for ScopStatements

This change introduces the actual compute code in the GPU kernels. To ensure
all values referenced from the statements in the GPU kernel are indeed available
we scan all ScopStmts in the GPU kernel for references to llvm::Values that
are not yet covered by already modeled outer loop iterators, parameters, or
array base pointers and also pass these additional llvm::Values to the
GPU kernel.

For arrays used in the GPU kernel we introduce a new ScopArrayInfo object, which
is referenced by the newly generated access functions within the GPU kernel and
which is used to help with code generation.

llvm-svn: 276270
---
 polly/lib/CodeGen/PPCGCodeGeneration.cpp      | 217 ++++++++++++++++--
 polly/test/GPGPU/double-parallel-loop.ll      |  44 +++-
 polly/test/GPGPU/host-control-flow.ll         |  67 +++++-
 .../GPGPU/kernel-params-only-some-arrays.ll   |   6 +-
 4 files changed, 306 insertions(+), 28 deletions(-)
diff --git a/polly/lib/CodeGen/PPCGCodeGeneration.cpp b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
index 1621252ba102..8b4d2220297e 100644
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -18,6 +18,7 @@
 #include "polly/LinkAllPasses.h"
 #include "polly/Options.h"
 #include "polly/ScopInfo.h"
+#include "polly/Support/SCEVValidator.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
@@ -61,17 +62,37 @@ static cl::opt<bool> DumpKernelIR("polly-acc-dump-kernel-ir",
 /// This function is a callback for to generate the ast expressions for each
 /// of the scheduled ScopStmts.
 static __isl_give isl_id_to_ast_expr *pollyBuildAstExprForStmt(
-    void *Stmt, isl_ast_build *Build,
+    void *StmtT, isl_ast_build *Build,
     isl_multi_pw_aff *(*FunctionIndex)(__isl_take isl_multi_pw_aff *MPA,
                                        isl_id *Id, void *User),
     void *UserIndex,
     isl_ast_expr *(*FunctionExpr)(isl_ast_expr *Expr, isl_id *Id, void *User),
-    void *User_expr) {
+    void *UserExpr) {
 
-  // TODO: Implement the AST expression generation. For now we just return a
-  // nullptr to ensure that we do not free uninitialized pointers.
+  ScopStmt *Stmt = (ScopStmt *)StmtT;
 
-  return nullptr;
+  isl_ctx *Ctx;
+
+  if (!Stmt || !Build)
+    return NULL;
+
+  Ctx = isl_ast_build_get_ctx(Build);
+  isl_id_to_ast_expr *RefToExpr = isl_id_to_ast_expr_alloc(Ctx, 0);
+
+  for (MemoryAccess *Acc : *Stmt) {
+    isl_map *AddrFunc = Acc->getAddressFunction();
+    AddrFunc = isl_map_intersect_domain(AddrFunc, Stmt->getDomain());
+    isl_id *RefId = Acc->getId();
+    isl_pw_multi_aff *PMA = isl_pw_multi_aff_from_map(AddrFunc);
+    isl_multi_pw_aff *MPA = isl_multi_pw_aff_from_pw_multi_aff(PMA);
+    MPA = isl_multi_pw_aff_coalesce(MPA);
+    MPA = FunctionIndex(MPA, RefId, UserIndex);
+    isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
+    Access = FunctionExpr(Access, RefId, UserExpr);
+    RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
+  }
+
+  return RefToExpr;
 }
 
 /// Generate code for a GPU specific isl AST.
@@ -86,7 +107,9 @@ public:
   GPUNodeBuilder(PollyIRBuilder &Builder, ScopAnnotator &Annotator, Pass *P,
                  const DataLayout &DL, LoopInfo &LI, ScalarEvolution &SE,
                  DominatorTree &DT, Scop &S, gpu_prog *Prog)
-      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {}
+      : IslNodeBuilder(Builder, Annotator, P, DL, LI, SE, DT, S), Prog(Prog) {
+    getExprBuilder().setIDToSAI(&IDToSAI);
+  }
 
 private:
   /// A module containing GPU code.
@@ -108,6 +131,8 @@ private:
   /// By releasing this set all isl_ids will be freed.
   std::set<std::unique_ptr<isl_id, IslIdDeleter>> KernelIDs;
 
+  IslExprBuilder::IDToScopArrayInfoTy IDToSAI;
+
   /// Create code for user-defined AST nodes.
   ///
   /// These AST nodes can be of type:
@@ -121,6 +146,13 @@ private:
   /// @param UserStmt The ast node to generate code for.
   virtual void createUser(__isl_take isl_ast_node *UserStmt);
 
+  /// Find llvm::Values referenced in GPU kernel.
+  ///
+  /// @param Kernel The kernel to scan for llvm::Values
+  ///
+  /// @returns A set of values referenced by the kernel.
+  SetVector<Value *> getReferencesInKernel(ppcg_kernel *Kernel);
+
   /// Create GPU kernel.
   ///
   /// Code generate the kernel described by @p KernelStmt.
@@ -135,7 +167,9 @@ private:
   /// start block of this newly created function.
   ///
   /// @param Kernel The kernel to generate code for.
-  void createKernelFunction(ppcg_kernel *Kernel);
+  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
+  void createKernelFunction(ppcg_kernel *Kernel,
+                            SetVector<Value *> &SubtreeValues);
 
   /// Create the declaration of a kernel function.
   ///
@@ -147,14 +181,23 @@ private:
   ///   - Other LLVM Value references (TODO)
   ///
   /// @param Kernel The kernel to generate the function declaration for.
+  /// @param SubtreeValues The set of llvm::Values referenced by this kernel.
+  ///
   /// @returns The newly declared function.
-  Function *createKernelFunctionDecl(ppcg_kernel *Kernel);
+  Function *createKernelFunctionDecl(ppcg_kernel *Kernel,
+                                     SetVector<Value *> &SubtreeValues);
 
   /// Insert intrinsic functions to obtain thread and block ids.
   ///
   /// @param The kernel to generate the intrinsic functions for.
   void insertKernelIntrinsics(ppcg_kernel *Kernel);
 
+  /// Create code for a ScopStmt called in @p Expr.
+  ///
+  /// @param Expr The expression containing the call.
+  /// @param KernelStmt The kernel statement referenced in the call.
+  void createScopStmt(isl_ast_expr *Expr, ppcg_kernel_stmt *KernelStmt);
+
   /// Create an in-kernel synchronization call.
   void createKernelSync();
 
@@ -201,8 +244,7 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
 
   switch (KernelStmt->type) {
   case ppcg_kernel_domain:
-    // TODO Create kernel user stmt
-    isl_ast_expr_free(Expr);
+    createScopStmt(Expr, KernelStmt);
     isl_ast_node_free(UserStmt);
     return;
   case ppcg_kernel_copy:
@@ -222,30 +264,143 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
   return;
 }
 
+void GPUNodeBuilder::createScopStmt(isl_ast_expr *Expr,
+                                    ppcg_kernel_stmt *KernelStmt) {
+  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
+  isl_id_to_ast_expr *Indexes = KernelStmt->u.d.ref2expr;
+
+  LoopToScevMapT LTS;
+  LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end());
+
+  createSubstitutions(Expr, Stmt, LTS);
+
+  if (Stmt->isBlockStmt())
+    BlockGen.copyStmt(*Stmt, LTS, Indexes);
+  else
+    assert(0 && "Region statement not supported\n");
+}
+
 void GPUNodeBuilder::createKernelSync() {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   auto *Sync = Intrinsic::getDeclaration(M, Intrinsic::nvvm_barrier0);
   Builder.CreateCall(Sync, {});
 }
 
+/// Collect llvm::Values referenced from @p Node
+///
+/// This function only applies to isl_ast_nodes that are user_nodes referring
+/// to a ScopStmt. All other node types are ignore.
+///
+/// @param Node The node to collect references for.
+/// @param User A user pointer used as storage for the data that is collected.
+///
+/// @returns isl_bool_true if data could be collected successfully.
+isl_bool collectReferencesInGPUStmt(__isl_keep isl_ast_node *Node, void *User) {
+  if (isl_ast_node_get_type(Node) != isl_ast_node_user)
+    return isl_bool_true;
+
+  isl_ast_expr *Expr = isl_ast_node_user_get_expr(Node);
+  isl_ast_expr *StmtExpr = isl_ast_expr_get_op_arg(Expr, 0);
+  isl_id *Id = isl_ast_expr_get_id(StmtExpr);
+  const char *Str = isl_id_get_name(Id);
+  isl_id_free(Id);
+  isl_ast_expr_free(StmtExpr);
+  isl_ast_expr_free(Expr);
+
+  if (!isPrefix(Str, "Stmt"))
+    return isl_bool_true;
+
+  Id = isl_ast_node_get_annotation(Node);
+  auto *KernelStmt = (ppcg_kernel_stmt *)isl_id_get_user(Id);
+  auto Stmt = (ScopStmt *)KernelStmt->u.d.stmt->stmt;
+  isl_id_free(Id);
+
+  addReferencesFromStmt(Stmt, User);
+
+  return isl_bool_true;
+}
+
+SetVector<Value *> GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
+  SetVector<Value *> SubtreeValues;
+  SetVector<const SCEV *> SCEVs;
+  SetVector<const Loop *> Loops;
+  SubtreeReferences References = {
+      LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};
+
+  for (const auto &I : IDToValue)
+    SubtreeValues.insert(I.second);
+
+  isl_ast_node_foreach_descendant_top_down(
+      Kernel->tree, collectReferencesInGPUStmt, &References);
+
+  for (const SCEV *Expr : SCEVs)
+    findValues(Expr, SE, SubtreeValues);
+
+  for (auto &SAI : S.arrays())
+    SubtreeValues.remove(SAI.second->getBasePtr());
+
+  isl_space *Space = S.getParamSpace();
+  for (long i = 0; i < isl_space_dim(Space, isl_dim_param); i++) {
+    isl_id *Id = isl_space_get_dim_id(Space, isl_dim_param, i);
+    assert(IDToValue.count(Id));
+    Value *Val = IDToValue[Id];
+    SubtreeValues.remove(Val);
+    isl_id_free(Id);
+  }
+  isl_space_free(Space);
+
+  for (long i = 0; i < isl_space_dim(Kernel->space, isl_dim_set); i++) {
+    isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
+    assert(IDToValue.count(Id));
+    Value *Val = IDToValue[Id];
+    SubtreeValues.remove(Val);
+    isl_id_free(Id);
+  }
+
+  return SubtreeValues;
+}
+
 void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
   isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
   ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
   isl_id_free(Id);
   isl_ast_node_free(KernelStmt);
 
+  SetVector<Value *> SubtreeValues = getReferencesInKernel(Kernel);
+
   assert(Kernel->tree && "Device AST of kernel node is empty");
 
   Instruction &HostInsertPoint = *Builder.GetInsertPoint();
   IslExprBuilder::IDToValueTy HostIDs = IDToValue;
+  ValueMapT HostValueMap = ValueMap;
 
-  createKernelFunction(Kernel);
+  SetVector<const Loop *> Loops;
+
+  // Create for all loops we depend on values that contain the current loop
+  // iteration. These values are necessary to generate code for SCEVs that
+  // depend on such loops. As a result we need to pass them to the subfunction.
+  for (const Loop *L : Loops) {
+    const SCEV *OuterLIV = SE.getAddRecExpr(SE.getUnknown(Builder.getInt64(0)),
+                                            SE.getUnknown(Builder.getInt64(1)),
+                                            L, SCEV::FlagAnyWrap);
+    Value *V = generateSCEV(OuterLIV);
+    OutsideLoopIterations[L] = SE.getUnknown(V);
+    SubtreeValues.insert(V);
+  }
+
+  createKernelFunction(Kernel, SubtreeValues);
 
   create(isl_ast_node_copy(Kernel->tree));
 
   Builder.SetInsertPoint(&HostInsertPoint);
   IDToValue = HostIDs;
 
+  ValueMap = HostValueMap;
+  ScalarMap.clear();
+  PHIOpMap.clear();
+  EscapeMap.clear();
+  IDToSAI.clear();
+
   finalizeKernelFunction();
 }
 
@@ -263,7 +418,9 @@ static std::string computeNVPTXDataLayout(bool is64Bit) {
   return Ret;
 }
 
-Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
+Function *
+GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
+                                         SetVector<Value *> &SubtreeValues) {
   std::vector<Type *> Args;
   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
 
@@ -284,6 +441,9 @@ Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
   for (long i = 0; i < NumVars; i++)
     Args.push_back(Builder.getInt64Ty());
 
+  for (auto *V : SubtreeValues)
+    Args.push_back(V->getType());
+
   auto *FT = FunctionType::get(Builder.getVoidTy(), Args, false);
   auto *FN = Function::Create(FT, Function::ExternalLinkage, Identifier,
                               GPUModule.get());
@@ -294,7 +454,27 @@ Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
     if (!ppcg_kernel_requires_array_argument(Kernel, i))
       continue;
 
-    Arg->setName(Prog->array[i].name);
+    Arg->setName(Kernel->array[i].array->name);
+
+    isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
+    const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl_id_copy(Id));
+    Type *EleTy = SAI->getElementType();
+    Value *Val = &*Arg;
+    SmallVector<const SCEV *, 4> Sizes;
+    isl_ast_build *Build =
+        isl_ast_build_from_context(isl_set_copy(Prog->context));
+    for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
+      isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
+          Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
+      auto V = ExprBuilder.create(DimSize);
+      Sizes.push_back(SE.getSCEV(V));
+    }
+    const ScopArrayInfo *SAIRep =
+        S.getOrCreateScopArrayInfo(Val, EleTy, Sizes, ScopArrayInfo::MK_Array);
+
+    isl_ast_build_free(Build);
+    isl_id_free(Id);
+    IDToSAI[Id] = SAIRep;
     Arg++;
   }
 
@@ -314,6 +494,12 @@ Function *GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel) {
     Arg++;
   }
 
+  for (auto *V : SubtreeValues) {
+    Arg->setName(V->getName());
+    ValueMap[V] = &*Arg;
+    Arg++;
+  }
+
   return FN;
 }
 
@@ -346,14 +532,15 @@ void GPUNodeBuilder::insertKernelIntrinsics(ppcg_kernel *Kernel) {
   }
 }
 
-void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel) {
+void GPUNodeBuilder::createKernelFunction(ppcg_kernel *Kernel,
+                                          SetVector<Value *> &SubtreeValues) {
 
   std::string Identifier = "kernel_" + std::to_string(Kernel->id);
   GPUModule.reset(new Module(Identifier, Builder.getContext()));
   GPUModule->setTargetTriple(Triple::normalize("nvptx64-nvidia-cuda"));
   GPUModule->setDataLayout(computeNVPTXDataLayout(true /* is64Bit */));
 
-  Function *FN = createKernelFunctionDecl(Kernel);
+  Function *FN = createKernelFunctionDecl(Kernel, SubtreeValues);
 
   BasicBlock *PrevBlock = Builder.GetInsertBlock();
   auto EntryBlock = BasicBlock::Create(Builder.getContext(), "entry", FN);
diff --git a/polly/test/GPGPU/double-parallel-loop.ll b/polly/test/GPGPU/double-parallel-loop.ll
index f9b3f1fa6c68..2eee8aaaa907 100644
--- a/polly/test/GPGPU/double-parallel-loop.ll
+++ b/polly/test/GPGPU/double-parallel-loop.ll
@@ -105,18 +105,52 @@
 ; KERNEL-IR-NEXT:   %t1 = zext i32 %3 to i64
 ; KERNEL-IR-NEXT:   br label %polly.loop_preheader
 
-; KERNEL-IR-LABEL: polly.loop_exit:
+; KERNEL-IR-LABEL: polly.loop_exit:                                  ; preds = %polly.stmt.bb5
 ; KERNEL-IR-NEXT:   ret void
 
-; KERNEL-IR-LABEL: polly.loop_header:
-; KERNEL-IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
+; KERNEL-IR-LABEL: polly.loop_header:                                ; preds = %polly.stmt.bb5, %polly.loop_preheader
+; KERNEL-IR-NEXT:   %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.stmt.bb5 ]
+; KERNEL-IR-NEXT:   %4 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %5 = add nsw i64 %4, %t0
+; KERNEL-IR-NEXT:   %6 = mul nsw i64 32, %b1
+; KERNEL-IR-NEXT:   %7 = add nsw i64 %6, %t1
+; KERNEL-IR-NEXT:   %8 = mul nsw i64 16, %polly.indvar
+; KERNEL-IR-NEXT:   %9 = add nsw i64 %7, %8
+; KERNEL-IR-NEXT:   br label %polly.stmt.bb5
+
+; KERNEL-IR-LABEL: polly.stmt.bb5:                                   ; preds = %polly.loop_header
+; KERNEL-IR-NEXT:   %10 = mul i64 %9, %5
+; KERNEL-IR-NEXT:   %p_tmp6 = sitofp i64 %10 to float
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %11 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %12 = add nsw i64 %11, %t0
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A = mul nsw i64 %12, 1024
+; KERNEL-IR-NEXT:   %13 = mul nsw i64 32, %b1
+; KERNEL-IR-NEXT:   %14 = add nsw i64 %13, %t1
+; KERNEL-IR-NEXT:   %15 = mul nsw i64 16, %polly.indvar
+; KERNEL-IR-NEXT:   %16 = add nsw i64 %14, %15
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %16
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A = getelementptr float, float* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
+; KERNEL-IR-NEXT:   %tmp8_p_scalar_ = load float, float* %polly.access.MemRef_A, align 4
+; KERNEL-IR-NEXT:   %p_tmp9 = fadd float %tmp8_p_scalar_, %p_tmp6
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A1 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %17 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %18 = add nsw i64 %17, %t0
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A2 = mul nsw i64 %18, 1024
+; KERNEL-IR-NEXT:   %19 = mul nsw i64 32, %b1
+; KERNEL-IR-NEXT:   %20 = add nsw i64 %19, %t1
+; KERNEL-IR-NEXT:   %21 = mul nsw i64 16, %polly.indvar
+; KERNEL-IR-NEXT:   %22 = add nsw i64 %20, %21
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A3 = add nsw i64 %polly.access.mul.MemRef_A2, %22
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A4 = getelementptr float, float* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A3
+; KERNEL-IR-NEXT:   store float %p_tmp9, float* %polly.access.MemRef_A4, align 4
 ; KERNEL-IR-NEXT:   %polly.indvar_next = add nsw i64 %polly.indvar, 1
 ; KERNEL-IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar, 0
 ; KERNEL-IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
 
-; KERNEL-IR-LABEL: polly.loop_preheader:
+; KERNEL-IR-LABEL: polly.loop_preheader:                             ; preds = %entry
 ; KERNEL-IR-NEXT:   br label %polly.loop_header
-; KERNEL-IR-NEXT: }
+
 
 ;    void double_parallel_loop(float A[][1024]) {
 ;      for (long i = 0; i < 1024; i++)
diff --git a/polly/test/GPGPU/host-control-flow.ll b/polly/test/GPGPU/host-control-flow.ll
index d3ec4bbce250..911d5cbc896c 100644
--- a/polly/test/GPGPU/host-control-flow.ll
+++ b/polly/test/GPGPU/host-control-flow.ll
@@ -34,27 +34,82 @@
 ; IR-NEXT:   %polly.loop_cond = icmp sle i64 %polly.indvar, 98
 ; IR-NEXT:   br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
 
-; KERNEL-IR-LABEL: define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %c0) {
-; KERNEL-IR-NEXT: entry:
+; KERNEL-IR: define ptx_kernel void @kernel_0(i8* %MemRef_A, i64 %c0) {
+; KERNEL-IR-LABEL: entry:
 ; KERNEL-IR-NEXT:   %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
 ; KERNEL-IR-NEXT:   %b0 = zext i32 %0 to i64
 ; KERNEL-IR-NEXT:   %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; KERNEL-IR-NEXT:   %t0 = zext i32 %1 to i64
 ; KERNEL-IR-NEXT:   br label %polly.cond
 
-; KERNEL-IR-LABEL: polly.cond:
+; KERNEL-IR-LABEL: polly.cond:                                       ; preds = %entry
 ; KERNEL-IR-NEXT:   %2 = mul nsw i64 32, %b0
 ; KERNEL-IR-NEXT:   %3 = add nsw i64 %2, %t0
 ; KERNEL-IR-NEXT:   %4 = icmp sle i64 %3, 97
 ; KERNEL-IR-NEXT:   br i1 %4, label %polly.then, label %polly.else
 
-; KERNEL-IR-LABEL: polly.merge:
+; KERNEL-IR-LABEL: polly.merge:                                      ; preds = %polly.else, %polly.stmt.for.body3
 ; KERNEL-IR-NEXT:   ret void
 
-; KERNEL-IR-LABEL: polly.then:
+; KERNEL-IR-LABEL: polly.then:                                       ; preds = %polly.cond
+; KERNEL-IR-NEXT:   %5 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %6 = add nsw i64 %5, %t0
+; KERNEL-IR-NEXT:   br label %polly.stmt.for.body3
+
+; KERNEL-IR-LABEL: polly.stmt.for.body3:                             ; preds = %polly.then
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %pexp.pdiv_r = urem i64 %c0, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A = mul nsw i64 %pexp.pdiv_r, 100
+; KERNEL-IR-NEXT:   %7 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %8 = add nsw i64 %7, %t0
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A = add nsw i64 %polly.access.mul.MemRef_A, %8
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A = getelementptr float, float* %polly.access.cast.MemRef_A, i64 %polly.access.add.MemRef_A
+; KERNEL-IR-NEXT:   %tmp_p_scalar_ = load float, float* %polly.access.MemRef_A, align 4
+; KERNEL-IR-NEXT:   %9 = add i64 %6, 1
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A1 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %pexp.pdiv_r2 = urem i64 %c0, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A3 = mul nsw i64 %pexp.pdiv_r2, 100
+; KERNEL-IR-NEXT:   %10 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %11 = add nsw i64 %10, %t0
+; KERNEL-IR-NEXT:   %12 = add nsw i64 %11, 1
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A4 = add nsw i64 %polly.access.mul.MemRef_A3, %12
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A5 = getelementptr float, float* %polly.access.cast.MemRef_A1, i64 %polly.access.add.MemRef_A4
+; KERNEL-IR-NEXT:   %tmp2_p_scalar_ = load float, float* %polly.access.MemRef_A5, align 4
+; KERNEL-IR-NEXT:   %p_add = fadd float %tmp_p_scalar_, %tmp2_p_scalar_
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A6 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %pexp.pdiv_r7 = urem i64 %c0, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A8 = mul nsw i64 %pexp.pdiv_r7, 100
+; KERNEL-IR-NEXT:   %13 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %14 = add nsw i64 %13, %t0
+; KERNEL-IR-NEXT:   %15 = add nsw i64 %14, 2
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A9 = add nsw i64 %polly.access.mul.MemRef_A8, %15
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A10 = getelementptr float, float* %polly.access.cast.MemRef_A6, i64 %polly.access.add.MemRef_A9
+; KERNEL-IR-NEXT:   %tmp3_p_scalar_ = load float, float* %polly.access.MemRef_A10, align 4
+; KERNEL-IR-NEXT:   %p_add12 = fadd float %p_add, %tmp3_p_scalar_
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A11 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %16 = add nsw i64 %c0, 1
+; KERNEL-IR-NEXT:   %pexp.pdiv_r12 = urem i64 %16, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A13 = mul nsw i64 %pexp.pdiv_r12, 100
+; KERNEL-IR-NEXT:   %17 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %18 = add nsw i64 %17, %t0
+; KERNEL-IR-NEXT:   %19 = add nsw i64 %18, 1
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A14 = add nsw i64 %polly.access.mul.MemRef_A13, %19
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A15 = getelementptr float, float* %polly.access.cast.MemRef_A11, i64 %polly.access.add.MemRef_A14
+; KERNEL-IR-NEXT:   %tmp4_p_scalar_ = load float, float* %polly.access.MemRef_A15, align 4
+; KERNEL-IR-NEXT:   %p_add17 = fadd float %tmp4_p_scalar_, %p_add12
+; KERNEL-IR-NEXT:   %polly.access.cast.MemRef_A16 = bitcast i8* %MemRef_A to float*
+; KERNEL-IR-NEXT:   %20 = add nsw i64 %c0, 1
+; KERNEL-IR-NEXT:   %pexp.pdiv_r17 = urem i64 %20, 2
+; KERNEL-IR-NEXT:   %polly.access.mul.MemRef_A18 = mul nsw i64 %pexp.pdiv_r17, 100
+; KERNEL-IR-NEXT:   %21 = mul nsw i64 32, %b0
+; KERNEL-IR-NEXT:   %22 = add nsw i64 %21, %t0
+; KERNEL-IR-NEXT:   %23 = add nsw i64 %22, 1
+; KERNEL-IR-NEXT:   %polly.access.add.MemRef_A19 = add nsw i64 %polly.access.mul.MemRef_A18, %23
+; KERNEL-IR-NEXT:   %polly.access.MemRef_A20 = getelementptr float, float* %polly.access.cast.MemRef_A16, i64 %polly.access.add.MemRef_A19
+; KERNEL-IR-NEXT:   store float %p_add17, float* %polly.access.MemRef_A20, align 4
 ; KERNEL-IR-NEXT:   br label %polly.merge
 
-; KERNEL-IR-LABEL: polly.else:
+; KERNEL-IR-LABEL: polly.else:                                       ; preds = %polly.cond
 ; KERNEL-IR-NEXT:   br label %polly.merge
 ; KERNEL-IR-NEXT: }
 
diff --git a/polly/test/GPGPU/kernel-params-only-some-arrays.ll b/polly/test/GPGPU/kernel-params-only-some-arrays.ll
index b7eafb711245..0d36c54dd8bd 100644
--- a/polly/test/GPGPU/kernel-params-only-some-arrays.ll
+++ b/polly/test/GPGPU/kernel-params-only-some-arrays.ll
@@ -23,7 +23,8 @@
 ; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
 ; KERNEL-NEXT:     %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; KERNEL-NEXT:     %t0 = zext i32 %1 to i64
-; KERNEL-NEXT:     ret void
+
+; KERNEL:     ret void
 ; KERNEL-NEXT: }
 
 ; KERNEL: ; ModuleID = 'kernel_1'
@@ -37,7 +38,8 @@
 ; KERNEL-NEXT:     %b0 = zext i32 %0 to i64
 ; KERNEL-NEXT:     %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 ; KERNEL-NEXT:     %t0 = zext i32 %1 to i64
-; KERNEL-NEXT:     ret void
+
+; KERNEL:     ret void
 ; KERNEL-NEXT: }
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"