GPGPU: Do not assume arrays start at 0

Our alias checks precisely check that the minimal and maximal accessed elements do not overlap in a kernel. Hence, we must ensure that our host <-> device transfers do not touch additional memory locations that are not covered in the alias check. To ensure this, we make sure that the data we copy for a given array is only the data from the smallest element accessed to the largest element accessed. We also adjust the size of the array according to the offset at which the array is actually accessed. An interesting result of this is: In case array are accessed with negative subscripts ,e.g., A[-100], we automatically allocate and transfer _more_ data to cover the full array. This is important as such code indeed exists in the wild. llvm-svn: 281611
2016-09-15 14:05:58 +00:00 · 2016-09-15 14:05:58 +00:00 · aaabbbf886
parent 9edf96ec9b
commit aaabbbf886
3 changed files with 216 additions and 6 deletions
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@ -284,6 +284,21 @@ private:
  /// @param Array The array for which to compute a size.
  Value *getArraySize(gpu_array_info *Array);

+  /// Generate code to compute the minimal offset at which an array is accessed.
+  ///
+  /// The offset of an array is the minimal array location accessed in a scop.
+  ///
+  /// Example:
+  ///
+  ///   for (long i = 0; i < 100; i++)
+  ///     A[i + 42] += ...
+  ///
+  ///   getArrayOffset(A) results in 42.
+  ///
+  /// @param Array The array for which to compute the offset.
+  /// @returns An llvm::Value that contains the offset of the array.
+  Value *getArrayOffset(gpu_array_info *Array);
+
  /// Prepare the kernel arguments for kernel code generation
  ///
  /// @param Kernel The kernel to generate code for.
@ -468,6 +483,12 @@ void GPUNodeBuilder::allocateDeviceArrays() {
    DevArrayName.append(Array->name);

    Value *ArraySize = getArraySize(Array);
+    Value *Offset = getArrayOffset(Array);
+    if (Offset)
+      ArraySize = Builder.CreateSub(
+          ArraySize,
+          Builder.CreateMul(Offset,
+                            Builder.getInt64(ScopArray->getElemSizeInBytes())));
    Value *DevArray = createCallAllocateMemoryForDevice(ArraySize);
    DevArray->setName(DevArrayName);
    DeviceAllocations[ScopArray] = DevArray;
@ -721,6 +742,48 @@ Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
  return ArraySize;
 }

+Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {
+  if (gpu_array_is_scalar(Array))
+    return nullptr;
+
+  isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
+
+  isl_set *Min = isl_set_lexmin(isl_set_copy(Array->extent));
+
+  isl_set *ZeroSet = isl_set_universe(isl_set_get_space(Min));
+
+  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++)
+    ZeroSet = isl_set_fix_si(ZeroSet, isl_dim_set, i, 0);
+
+  if (isl_set_is_subset(Min, ZeroSet)) {
+    isl_set_free(Min);
+    isl_set_free(ZeroSet);
+    isl_ast_build_free(Build);
+    return nullptr;
+  }
+  isl_set_free(ZeroSet);
+
+  isl_ast_expr *Result =
+      isl_ast_expr_from_val(isl_val_int_from_si(isl_set_get_ctx(Min), 0));
+
+  for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
+    if (i > 0) {
+      isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
+      isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
+      Result = isl_ast_expr_mul(Result, BExpr);
+    }
+    isl_pw_aff *DimMin = isl_set_dim_min(isl_set_copy(Min), i);
+    isl_ast_expr *MExpr = isl_ast_build_expr_from_pw_aff(Build, DimMin);
+    Result = isl_ast_expr_add(Result, MExpr);
+  }
+
+  Value *ResultValue = ExprBuilder.create(Result);
+  isl_set_free(Min);
+  isl_ast_build_free(Build);
+
+  return ResultValue;
+}
+
 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                        enum DataDirection Direction) {
  isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
@ -730,6 +793,7 @@ void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
  auto ScopArray = (ScopArrayInfo *)(Array->user);

  Value *Size = getArraySize(Array);
+  Value *Offset = getArrayOffset(Array);
  Value *DevPtr = DeviceAllocations[ScopArray];

  Value *HostPtr;
@ -739,8 +803,20 @@ void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
  else
    HostPtr = ScopArray->getBasePtr();

+  if (Offset) {
+    HostPtr = Builder.CreatePointerCast(
+        HostPtr, ScopArray->getElementType()->getPointerTo());
+    HostPtr = Builder.CreateGEP(HostPtr, Offset);
+  }
+
  HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());

+  if (Offset) {
+    Size = Builder.CreateSub(
+        Size, Builder.CreateMul(
+                  Offset, Builder.getInt64(ScopArray->getElemSizeInBytes())));
+  }
+
  if (Direction == HOST_TO_DEVICE)
    createCallCopyFromHostToDevice(HostPtr, DevPtr, Size);
  else
@ -1000,6 +1076,16 @@ GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,

    Value *DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
    DevArray = createCallGetDevicePtr(DevArray);
+
+    Value *Offset = getArrayOffset(&Prog->array[i]);
+
+    if (Offset) {
+      DevArray = Builder.CreatePointerCast(
+          DevArray, SAI->getElementType()->getPointerTo());
+      DevArray = Builder.CreateGEP(DevArray, Builder.CreateNeg(Offset));
+      DevArray = Builder.CreatePointerCast(DevArray, Builder.getInt8PtrTy());
+    }
+
    Instruction *Param = new AllocaInst(
        Builder.getInt8PtrTy(), Launch + "_param_" + std::to_string(Index),
        EntryBlock->getTerminator());
--- a/polly/test/GPGPU/non-zero-array-offset.ll
+++ b/polly/test/GPGPU/non-zero-array-offset.ll
@ -0,0 +1,124 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \
+; RUN: FileCheck %s -check-prefix=IR
+;
+; REQUIRES: pollyacc
+
+; CODE: Code
+; CODE-NEXT: ====
+; CODE-NEXT: # host
+; CODE-NEXT: {
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice));
+; CODE-NEXT:   {
+; CODE-NEXT:     dim3 k0_dimBlock(8);
+; CODE-NEXT:     dim3 k0_dimGrid(1);
+; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (dev_MemRef_B);
+; CODE-NEXT:     cudaCheckKernel();
+; CODE-NEXT:   }
+
+; CODE:   {
+; CODE-NEXT:     dim3 k1_dimBlock(8);
+; CODE-NEXT:     dim3 k1_dimGrid(1);
+; CODE-NEXT:     kernel1 <<<k1_dimGrid, k1_dimBlock>>> (dev_MemRef_A);
+; CODE-NEXT:     cudaCheckKernel();
+; CODE-NEXT:   }
+
+; CODE:   cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost));
+; CODE-NEXT: }
+
+; CODE: # kernel0
+; CODE-NEXT: Stmt_bb3(t0);
+
+; CODE: # kernel1
+; CODE-NEXT: Stmt_bb11(t0);
+
+; IR:       %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32)
+; IR-NEXT:  %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32)
+; IR-NEXT:  [[REG0:%.+]] = getelementptr float, float* %B, i64 8
+; IR-NEXT:  [[REG1:%.+]] = bitcast float* [[REG0]] to i8*
+; IR-NEXT:  call void @polly_copyFromHostToDevice(i8* [[REG1]], i8* %p_dev_array_MemRef_B, i64 32)
+
+; IR:      [[REGA:%.+]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
+; IR-NEXT: [[REGB:%.+]]  = bitcast i8* [[REGA]] to float*
+; IR-NEXT: [[REGC:%.+]]  = getelementptr float, float* [[REGB]], i64 -8
+; IR-NEXT: [[REGD:%.+]]  = bitcast float* [[REGC]] to i8*
+
+;    void foo(float A[], float B[]) {
+;      for (long i = 0; i < 8; i++)
+;        B[i + 8] *= 4;
+;
+;      for (long i = 0; i < 8; i++)
+;        A[i] *= 12;
+;    }
+;
+;    #ifdef OUTPUT
+;    int main() {
+;      float A[16];
+;
+;      for (long i = 0; i < 16; i++) {
+;        __sync_synchronize();
+;        A[i] = i;
+;      }
+;
+;      foo(A, A);
+;
+;      float sum = 0;
+;      for (long i = 0; i < 16; i++) {
+;        __sync_synchronize();
+;        sum += A[i];
+;      }
+;
+;      printf("%f\n", sum);
+;    }
+;    #endif
+;
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(float* %A, float* %B) {
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb7, %bb
+  %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ]
+  %exitcond1 = icmp ne i64 %i.0, 8
+  br i1 %exitcond1, label %bb3, label %bb9
+
+bb3:                                              ; preds = %bb2
+  %tmp = add nuw nsw i64 %i.0, 8
+  %tmp4 = getelementptr inbounds float, float* %B, i64 %tmp
+  %tmp5 = load float, float* %tmp4, align 4
+  %tmp6 = fmul float %tmp5, 4.000000e+00
+  store float %tmp6, float* %tmp4, align 4
+  br label %bb7
+
+bb7:                                              ; preds = %bb3
+  %tmp8 = add nuw nsw i64 %i.0, 1
+  br label %bb2
+
+bb9:                                              ; preds = %bb2
+  br label %bb10
+
+bb10:                                             ; preds = %bb15, %bb9
+  %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ]
+  %exitcond = icmp ne i64 %i1.0, 8
+  br i1 %exitcond, label %bb11, label %bb17
+
+bb11:                                             ; preds = %bb10
+  %tmp12 = getelementptr inbounds float, float* %A, i64 %i1.0
+  %tmp13 = load float, float* %tmp12, align 4
+  %tmp14 = fmul float %tmp13, 1.200000e+01
+  store float %tmp14, float* %tmp12, align 4
+  br label %bb15
+
+bb15:                                             ; preds = %bb11
+  %tmp16 = add nuw nsw i64 %i1.0, 1
+  br label %bb10
+
+bb17:                                             ; preds = %bb10
+  ret void
+}
--- a/polly/test/GPGPU/phi-nodes-in-kernel.ll
+++ b/polly/test/GPGPU/phi-nodes-in-kernel.ll
@ -38,13 +38,13 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CODE-NEXT:     Stmt_for_cond15_for_cond12_loopexit_crit_edge(0);
 ; CODE-NEXT: }

-; IR:      %1 = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromHostToDevice(i8* %1, i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)
+; IR:      [[REGA:%.+]] = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REGA]], i8* %p_dev_array_MemRef_out_l_055__phi, i64 4)

-; IR:      %14 = bitcast i32* %out_l.055.phiops to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* %14, i64 4)
-; IR-NEXT: %15 = bitcast i32* %out_l.055.s2a to i8*
-; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* %15, i64 4)
+; IR:      [[REGB:%.+]] = bitcast i32* %out_l.055.phiops to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055__phi, i8* [[REGB]], i64 4)
+; IR-NEXT: [[REGC:%.+]] = bitcast i32* %out_l.055.s2a to i8*
+; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_out_l_055, i8* [[REGC]], i64 4)

 ; KERNEL-IR: entry:
 ; KERNEL-IR-NEXT:   %out_l.055.s2a = alloca i32