[OpenMP] Enable HeapToStack conversion in OpenMPOpt for new RTL globalization calls

Summary: The changes to globalization introduced in D97680 add a large amount of overhead by default. The old globalization method always ignored globalization code when executing in SPMD mode. This wasn't strictly correct, as data sharing is still possible in SPMD mode. The new interface is correct but introduces globalization code even when it is unnecessary. This optimization uses the existing HeapToStack transformation in the Attributor to replace unneeded globalization with thread-private stack memory. This is done using the newly introduced library instances for the RTL functions added in D102087. Depends on D97818. Reviewed By: jdoerfert. Differential Revision: https://reviews.llvm.org/D102197
2021-06-07 13:24:30 -04:00 · 2021-06-07 13:24:30 -04:00 · 7d69da71dd
parent 2662351e3b
commit 7d69da71dd
5 changed files with 103 additions and 4 deletions
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@ -499,6 +499,11 @@ __OMP_ATTRS_SET(InaccessibleArgOnlyAttrs,
                                   EnumAttr(WillReturn), EnumAttr(NoFree))
                    : AttributeSet(EnumAttr(NoUnwind)))

+__OMP_ATTRS_SET(NoCaptureAttrs,
+                OptimisticAttributes
+                    ? AttributeSet(EnumAttr(NoCapture))
+                    : AttributeSet(EnumAttr(NoCapture)))
+
 #if 0
 __OMP_ATTRS_SET(InaccessibleOnlyAttrs,
                OptimisticAttributes
@ -840,6 +845,11 @@ __OMP_RTL_ATTRS(__kmpc_doacross_wait, BarrierAttrs, AttributeSet(),
 __OMP_RTL_ATTRS(__kmpc_doacross_fini, BarrierAttrs, AttributeSet(),
                ParamAttrs(ReadOnlyPtrAttrs))

+__OMP_RTL_ATTRS(__kmpc_alloc_shared, DefaultAttrs, ReturnPtrAttrs,
+                ParamAttrs())
+__OMP_RTL_ATTRS(__kmpc_free_shared, AllocAttrs, AttributeSet(),
+                ParamAttrs(NoCaptureAttrs))
+
 __OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs, ParamAttrs())
 __OMP_RTL_ATTRS(__kmpc_free, AllocAttrs, AttributeSet(), ParamAttrs())

--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@ -1623,9 +1623,13 @@ private:
    };
    GlobalizationRFI.foreachUse(SCC, CreateAA);

+    // Create an ExecutionDomain AA for every function and a HeapToStack AA for
+    // every function if there is a device kernel.
    for (auto *F : SCC) {
      if (!F->isDeclaration())
        A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
+      if (!OMPInfoCache.Kernels.empty())
+        A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
    }
  }
 };
--- a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
+++ b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
@ -29,9 +29,9 @@
 ; CHECK-DAG:   icmp eq i8* %5, @__omp_outlined__1_wrapper.ID
 ; CHECK-DAG:   icmp eq i8* %7, @__omp_outlined__3_wrapper.ID

-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* @__omp_outlined__1_wrapper.ID, i8** %2, i64 0)
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef %2, i64 noundef 0)
 ; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0)
-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* @__omp_outlined__3_wrapper.ID, i8** %3, i64 0)
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef %3, i64 noundef 0)


 %struct.ident_t = type { i32, i32, i32, i32, i8* }
--- a/llvm/test/Transforms/OpenMP/remove_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll
@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64"
+
+@S = external local_unnamed_addr global i8*
+
+define void @kernel() {
+; CHECK-LABEL: define {{[^@]+}}@kernel() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @foo() #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    call void @bar() #[[ATTR0]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @foo()
+  call void @bar()
+  ret void
+}
+
+define internal void @foo() {
+; CHECK-LABEL: define {{[^@]+}}@foo
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i8, i64 4, align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = call i8* @__kmpc_alloc_shared(i64 4)
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %0)
+  ret void
+}
+
+define internal void @bar() {
+; CHECK-LABEL: define {{[^@]+}}@bar
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i8* @__kmpc_alloc_shared(i64 noundef 4) #[[ATTR0]]
+; CHECK-NEXT:    call void @share(i8* nofree writeonly [[TMP0]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    call void @__kmpc_free_shared(i8* [[TMP0]]) #[[ATTR0]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = call i8* @__kmpc_alloc_shared(i64 4)
+  call void @share(i8* %0)
+  call void @__kmpc_free_shared(i8* %0)
+  ret void
+}
+
+define internal void @use(i8* %x) {
+entry:
+  ret void
+}
+
+define internal void @share(i8* %x) {
+; CHECK-LABEL: define {{[^@]+}}@share
+; CHECK-SAME: (i8* nofree writeonly [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i8* [[X]], i8** @S, align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i8* %x, i8** @S
+  ret void
+}
+
+; CHECK: declare i8* @__kmpc_alloc_shared(i64)
+declare i8* @__kmpc_alloc_shared(i64)
+
+; CHECK: declare void @__kmpc_free_shared(i8* nocapture)
+declare void @__kmpc_free_shared(i8*)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!nvvm.annotations = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "remove_globalization.c", directory: "/tmp/remove_globalization.c")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{void ()* @kernel, !"kernel", i32 1}
--- a/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll
+++ b/llvm/test/Transforms/PhaseOrdering/openmp-opt-module.ll
@ -3,6 +3,8 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"

@.str = private unnamed_addr constant [13 x i8] c"Alloc Shared\00", align 1

+@S = external local_unnamed_addr global i8*
+
 ; MODULE: remark: openmp_opt_module.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.

 define void @foo() {
@ -18,11 +20,11 @@ entry:
 define void @use(i8* %0) {
 entry:
  %.addr = alloca i8*, align 8
-  store i8* %0, i8** %.addr, align 8
+  store i8* %0, i8** @S
  ret void
 }

-define internal i8* @__kmpc_alloc_shared(i64 %DataSize) {
+define weak i8* @__kmpc_alloc_shared(i64 %DataSize) {
 entry:
  %call = call i8* @_Z10SafeMallocmPKc(i64 %DataSize, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i64 0, i64 0)) #11
  ret i8* %call