GPGPU: Format statements scheduled on the host ourselves

Otherwise ppcg would try to call into pet functionality that this not available, which obviously will cause trouble. As we can easily print these statements ourselves, we just do so. llvm-svn: 275579
2016-07-15 17:12:41 +00:00 · 2016-07-15 17:12:41 +00:00 · 2025173494
parent 771c34513a
commit 2025173494
2 changed files with 212 additions and 0 deletions
--- a/polly/lib/CodeGen/PPCGCodeGeneration.cpp
+++ b/polly/lib/CodeGen/PPCGCodeGeneration.cpp
@ -492,6 +492,20 @@ public:
    auto Id = isl_ast_node_get_annotation(Node);

    if (Id) {
+      bool IsUser = !strcmp(isl_id_get_name(Id), "user");
+
+      // If this is a user statement, format it ourselves as ppcg would
+      // otherwise try to call pet functionality that is not available in
+      // Polly.
+      if (IsUser) {
+        P = isl_printer_start_line(P);
+        P = isl_printer_print_ast_node(P, Node);
+        P = isl_printer_end_line(P);
+        isl_id_free(Id);
+        isl_ast_print_options_free(Options);
+        return P;
+      }
+
      auto Kernel = (struct ppcg_kernel *)isl_id_get_user(Id);
      isl_id_free(Id);
      Data->Kernels.push_back(Kernel);
--- a/polly/test/GPGPU/host-statement.ll
+++ b/polly/test/GPGPU/host-statement.ll
@ -0,0 +1,198 @@
+; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \
+; RUN: -disable-output < %s | \
+; RUN: FileCheck -check-prefix=CODE %s
+
+; REQUIRES: pollyacc
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; This test case tests that we can correctly handle a ScopStmt that is
+; scheduled on the host, instead of within a kernel.
+
+; CODE: Code
+; CODE-NEXT: ====
+; CODE-NEXT: # host
+; CODE-NEXT: {
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
+; CODE-NEXT:   {
+; CODE-NEXT:     dim3 k0_dimBlock(32);
+; CODE-NEXT:     dim3 k0_dimGrid(16);
+; CODE-NEXT:     kernel0 <<<k0_dimGrid, k0_dimBlock>>> (p_0, p_1);
+; CODE-NEXT:     cudaCheckKernel();
+; CODE-NEXT:   }
+
+; CODE:   if (p_0 <= 510 && p_1 <= 510) {
+; CODE-NEXT:     {
+; CODE-NEXT:       dim3 k1_dimBlock(32);
+; CODE-NEXT:       dim3 k1_dimGrid(p_1 <= -1048034 ? 32768 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
+; CODE-NEXT:       kernel1 <<<k1_dimGrid, k1_dimBlock>>> (p_0, p_1);
+; CODE-NEXT:       cudaCheckKernel();
+; CODE-NEXT:     }
+
+; CODE:     {
+; CODE-NEXT:       dim3 k2_dimBlock(16, 32);
+; CODE-NEXT:       dim3 k2_dimGrid(16, p_1 <= -7650 ? 256 : -p_1 + floord(31 * p_1 + 30, 32) + 16);
+; CODE-NEXT:       kernel2 <<<k2_dimGrid, k2_dimBlock>>> (p_0, p_1);
+; CODE-NEXT:       cudaCheckKernel();
+; CODE-NEXT:     }
+
+; CODE:   }
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
+; CODE-NEXT:   if (p_0 <= 510 && p_1 <= 510)
+; CODE-NEXT:     cudaCheckReturn(cudaMemcpy(MemRef_R, dev_MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
+; CODE-NEXT:   cudaCheckReturn(cudaMemcpy(MemRef_Q, dev_MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyDeviceToHost));
+; CODE-NEXT:     Stmt_for_cond33_preheader();
+
+; CODE: }
+
+; CODE: # kernel0
+; CODE-NEXT: Stmt_for_body16(32 * b0 + t0);
+
+; CODE: # kernel1
+; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 1048576; c0 += 1)
+; CODE-NEXT:   if (p_1 + 32 * b0 + t0 + 1048576 * c0 <= 510) {
+; CODE-NEXT:     Stmt_for_body35(32 * b0 + t0 + 1048576 * c0);
+; CODE-NEXT:     for (int c1 = 0; c1 <= 15; c1 += 1)
+; CODE-NEXT:       for (int c3 = 0; c3 <= 31; c3 += 1)
+; CODE-NEXT:         Stmt_for_body42(32 * b0 + t0 + 1048576 * c0, 32 * c1 + c3);
+; CODE-NEXT:   }
+
+; CODE: # kernel2
+; CODE-NEXT: for (int c0 = 0; c0 <= (-p_1 - 32 * b0 + 510) / 8192; c0 += 1)
+; CODE-NEXT:   if (p_1 + 32 * b0 + t0 + 8192 * c0 <= 510)
+; CODE-NEXT:     for (int c3 = 0; c3 <= 1; c3 += 1)
+; CODE-NEXT:       Stmt_for_body62(32 * b0 + t0 + 8192 * c0, 32 * b1 + t1 + 16 * c3);
+
+
+; Function Attrs: nounwind uwtable
+define internal void @kernel_gramschmidt(i32 %ni, i32 %nj, [512 x double]* %A, [512 x double]* %R, [512 x double]* %Q) #1 {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry.split, %for.inc86
+  %indvars.iv24 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next25, %for.inc86 ]
+  %indvars.iv19 = phi i64 [ 1, %entry.split ], [ %indvars.iv.next20, %for.inc86 ]
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.cond1.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ]
+  %nrm.02 = phi double [ 0.000000e+00, %for.cond1.preheader ], [ %add, %for.inc ]
+  %arrayidx5 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv, i64 %indvars.iv24
+  %tmp = load double, double* %arrayidx5, align 8, !tbaa !1
+  %arrayidx9 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv, i64 %indvars.iv24
+  %tmp27 = load double, double* %arrayidx9, align 8, !tbaa !1
+  %mul = fmul double %tmp, %tmp27
+  %add = fadd double %nrm.02, %mul
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 512
+  br i1 %exitcond, label %for.inc, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  %add.lcssa = phi double [ %add, %for.inc ]
+  %call = tail call double @sqrt(double %add.lcssa) #2
+  %arrayidx13 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv24
+  store double %call, double* %arrayidx13, align 8, !tbaa !1
+  br label %for.body16
+
+for.cond33.preheader:                             ; preds = %for.body16
+  %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
+  %cmp347 = icmp slt i64 %indvars.iv.next25, 512
+  br i1 %cmp347, label %for.body35.lr.ph, label %for.inc86
+
+for.body35.lr.ph:                                 ; preds = %for.cond33.preheader
+  br label %for.body35
+
+for.body16:                                       ; preds = %for.end, %for.body16
+  %indvars.iv10 = phi i64 [ 0, %for.end ], [ %indvars.iv.next11, %for.body16 ]
+  %arrayidx20 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv10, i64 %indvars.iv24
+  %tmp28 = load double, double* %arrayidx20, align 8, !tbaa !1
+  %arrayidx24 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv24
+  %tmp29 = load double, double* %arrayidx24, align 8, !tbaa !1
+  %div = fdiv double %tmp28, %tmp29
+  %arrayidx28 = getelementptr inbounds [512 x double], [512 x double]* %Q, i64 %indvars.iv10, i64 %indvars.iv24
+  store double %div, double* %arrayidx28, align 8, !tbaa !1
+  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
+  %exitcond12 = icmp ne i64 %indvars.iv.next11, 512
+  br i1 %exitcond12, label %for.body16, label %for.cond33.preheader
+
+for.cond33.loopexit:                              ; preds = %for.body62
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next22 to i32
+  %exitcond23 = icmp ne i32 %lftr.wideiv, 512
+  br i1 %exitcond23, label %for.body35, label %for.cond33.for.inc86_crit_edge
+
+for.body35:                                       ; preds = %for.body35.lr.ph, %for.cond33.loopexit
+  %indvars.iv21 = phi i64 [ %indvars.iv19, %for.body35.lr.ph ], [ %indvars.iv.next22, %for.cond33.loopexit ]
+  %arrayidx39 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv21
+  store double 0.000000e+00, double* %arrayidx39, align 8, !tbaa !1
+  br label %for.body42
+
+for.cond60.preheader:                             ; preds = %for.body42
+  br label %for.body62
+
+for.body42:                                       ; preds = %for.body35, %for.body42
+  %indvars.iv13 = phi i64 [ 0, %for.body35 ], [ %indvars.iv.next14, %for.body42 ]
+  %arrayidx46 = getelementptr inbounds [512 x double], [512 x double]* %Q, i64 %indvars.iv13, i64 %indvars.iv24
+  %tmp30 = load double, double* %arrayidx46, align 8, !tbaa !1
+  %arrayidx50 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv13, i64 %indvars.iv21
+  %tmp31 = load double, double* %arrayidx50, align 8, !tbaa !1
+  %mul51 = fmul double %tmp30, %tmp31
+  %arrayidx55 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv21
+  %tmp32 = load double, double* %arrayidx55, align 8, !tbaa !1
+  %add56 = fadd double %tmp32, %mul51
+  store double %add56, double* %arrayidx55, align 8, !tbaa !1
+  %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
+  %exitcond15 = icmp ne i64 %indvars.iv.next14, 512
+  br i1 %exitcond15, label %for.body42, label %for.cond60.preheader
+
+for.body62:                                       ; preds = %for.cond60.preheader, %for.body62
+  %indvars.iv16 = phi i64 [ 0, %for.cond60.preheader ], [ %indvars.iv.next17, %for.body62 ]
+  %arrayidx66 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv16, i64 %indvars.iv21
+  %tmp33 = load double, double* %arrayidx66, align 8, !tbaa !1
+  %arrayidx70 = getelementptr inbounds [512 x double], [512 x double]* %Q, i64 %indvars.iv16, i64 %indvars.iv24
+  %tmp34 = load double, double* %arrayidx70, align 8, !tbaa !1
+  %arrayidx74 = getelementptr inbounds [512 x double], [512 x double]* %R, i64 %indvars.iv24, i64 %indvars.iv21
+  %tmp35 = load double, double* %arrayidx74, align 8, !tbaa !1
+  %mul75 = fmul double %tmp34, %tmp35
+  %sub = fsub double %tmp33, %mul75
+  %arrayidx79 = getelementptr inbounds [512 x double], [512 x double]* %A, i64 %indvars.iv16, i64 %indvars.iv21
+  store double %sub, double* %arrayidx79, align 8, !tbaa !1
+  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
+  %exitcond18 = icmp ne i64 %indvars.iv.next17, 512
+  br i1 %exitcond18, label %for.body62, label %for.cond33.loopexit
+
+for.cond33.for.inc86_crit_edge:                   ; preds = %for.cond33.loopexit
+  br label %for.inc86
+
+for.inc86:                                        ; preds = %for.cond33.for.inc86_crit_edge, %for.cond33.preheader
+  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+  %exitcond26 = icmp ne i64 %indvars.iv.next25, 512
+  br i1 %exitcond26, label %for.cond1.preheader, label %for.end88
+
+for.end88:                                        ; preds = %for.inc86
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare double @sqrt(double) #2
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.9.0 (trunk 275267) (llvm/trunk 275268)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"double", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}