[OpenMP] Internalize functions in OpenMPOpt to improve IPO passes

Summary:
Currently the attributor needs to give up if a function has external linkage.
This means that the optimization introduced in D97818 will only apply to static
functions. This change uses the Attributor to internalize OpenMP device
routines by making a copy of each function with private linkage and replacing
the uses in the module with it. This allows for the optimization to be applied
to any regular function.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D102824
This commit is contained in:
Joseph Huber 2021-05-19 22:57:24 -04:00 committed by Huber, Joseph
parent c747b7d1d9
commit 03d7e61c87
7 changed files with 82 additions and 45 deletions

View File

@ -1,13 +1,13 @@
// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// host-no-diagnostics
void bar1(void) {
#pragma omp parallel // #0
// all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
// safe-remark@#0 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
// safe-remark@#0 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
// force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: <NONE>}}
{
}
@ -15,7 +15,7 @@ void bar1(void) {
void bar2(void) {
#pragma omp parallel // #1
// all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
// safe-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
// safe-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
// force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: <NONE>}}
{
}

View File

@ -1,13 +1,13 @@
// RUN: %clang_cc1 -verify=host -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// host-no-diagnostics
void bar(void) {
#pragma omp parallel // #1 \
// expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
// expected-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
// expected-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
{
}
}

View File

@ -1083,13 +1083,15 @@ struct Attributor {
/// the abstract attributes.
/// \param CGUpdater Helper to update an underlying call graph.
/// \param Allowed If not null, a set limiting the attribute opportunities.
/// \param DeleteFns Whether to delete functions
/// \param DeleteFns Whether to delete functions.
/// \param RewriteSignatures Whether to rewrite function signatures.
Attributor(SetVector<Function *> &Functions, InformationCache &InfoCache,
CallGraphUpdater &CGUpdater,
DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true)
DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true,
bool RewriteSignatures = true)
: Allocator(InfoCache.Allocator), Functions(Functions),
InfoCache(InfoCache), CGUpdater(CGUpdater), Allowed(Allowed),
DeleteFns(DeleteFns) {}
DeleteFns(DeleteFns), RewriteSignatures(RewriteSignatures) {}
~Attributor();
@ -1665,6 +1667,21 @@ public:
///
static void createShallowWrapper(Function &F);
/// Make another copy of the function \p F such that the copied version has
/// internal linkage afterwards and can be analysed. Then we replace all uses
/// of the original function with the copied one.
///
/// Only non-locally linked functions that have `linkonce_odr` or `weak_odr`
/// linkage can be internalized because these linkages guarantee that other
/// definitions with the same name have the same semantics as this one.
///
/// This will only be run if the `attributor-allow-deep-wrappers` option is
/// set, or if the function is called with \p Force set to true.
///
/// If the function \p F fails to be internalized, the return value will be a
/// null pointer.
static Function *internalizeFunction(Function &F, bool Force = false);
/// Return the data layout associated with the anchor scope.
const DataLayout &getDataLayout() const { return InfoCache.DL; }
@ -1777,6 +1794,9 @@ private:
/// Whether to delete functions.
const bool DeleteFns;
/// Whether to rewrite signatures.
const bool RewriteSignatures;
/// A set to remember the functions we already assume to be live and visited.
DenseSet<const Function *> VisitedFunctions;

View File

@ -1621,19 +1621,12 @@ void Attributor::createShallowWrapper(Function &F) {
NumFnShallowWrappersCreated++;
}
/// Make another copy of the function \p F such that the copied version has
/// internal linkage afterwards and can be analysed. Then we replace all uses
/// of the original function to the copied one
///
/// Only non-exactly defined functions that have `linkonce_odr` or `weak_odr`
/// linkage can be internalized because these linkages guarantee that other
/// definitions with the same name have the same semantics as this one
///
static Function *internalizeFunction(Function &F) {
assert(AllowDeepWrapper && "Cannot create a copy if not allowed.");
assert(!F.isDeclaration() && !F.hasExactDefinition() &&
!GlobalValue::isInterposableLinkage(F.getLinkage()) &&
"Trying to internalize function which cannot be internalized.");
Function *Attributor::internalizeFunction(Function &F, bool Force) {
if (!AllowDeepWrapper && !Force)
return nullptr;
if (F.isDeclaration() || F.hasLocalLinkage() ||
GlobalValue::isInterposableLinkage(F.getLinkage()))
return nullptr;
Module &M = *F.getParent();
FunctionType *FnTy = F.getFunctionType();
@ -1663,7 +1656,8 @@ static Function *internalizeFunction(Function &F) {
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
F.getAllMetadata(MDs);
for (auto MDIt : MDs)
Copied->addMetadata(MDIt.first, *MDIt.second);
if (!Copied->hasMetadata())
Copied->addMetadata(MDIt.first, *MDIt.second);
M.getFunctionList().insert(F.getIterator(), Copied);
F.replaceAllUsesWith(Copied);
@ -1675,6 +1669,9 @@ static Function *internalizeFunction(Function &F) {
bool Attributor::isValidFunctionSignatureRewrite(
Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
if (!RewriteSignatures)
return false;
auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
// Forbid the call site to cast the function return type. If we need to
// rewrite these functions we need to re-create a cast for the new call site
@ -2459,7 +2456,8 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache,
Function *F = Functions[u];
if (!F->isDeclaration() && !F->isDefinitionExact() && F->getNumUses() &&
!GlobalValue::isInterposableLinkage(F->getLinkage())) {
Function *NewF = internalizeFunction(*F);
Function *NewF = Attributor::internalizeFunction(*F);
assert(NewF && "Could not internalize function.");
Functions.insert(NewF);
// Update call graph

View File

@ -1623,9 +1623,9 @@ private:
};
GlobalizationRFI.foreachUse(SCC, CreateAA);
for (auto &F : M) {
if (!F.isDeclaration())
A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
for (auto *F : SCC) {
if (!F->isDeclaration())
A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
}
}
};
@ -2620,11 +2620,19 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
if (DisableOpenMPOptimizations)
return PreservedAnalyses::all();
// Look at every function definition in the Module.
// Create internal copies of each function if this is a kernel Module.
DenseSet<const Function *> InternalizedFuncs;
if (!OMPInModule.getKernels().empty())
for (Function &F : M)
if (!F.isDeclaration() && !OMPInModule.getKernels().contains(&F))
if (Attributor::internalizeFunction(F, /* Force */ true))
InternalizedFuncs.insert(&F);
// Look at every function definition in the Module that wasn't internalized.
SmallVector<Function *, 16> SCC;
for (Function &Fn : M)
if (!Fn.isDeclaration())
SCC.push_back(&Fn);
for (Function &F : M)
if (!F.isDeclaration() && !InternalizedFuncs.contains(&F))
SCC.push_back(&F);
if (SCC.empty())
return PreservedAnalyses::all();
@ -2645,7 +2653,7 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions,
OMPInModule.getKernels());
Attributor A(Functions, InfoCache, CGUpdater);
Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false);
OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
bool Changed = OMPOpt.run(true);

View File

@ -2,6 +2,8 @@
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64"
@S = external local_unnamed_addr global i8*
; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
@ -67,7 +69,7 @@ exit:
define void @use(i8* %x) {
entry:
%addr = alloca i8*
store i8* %x, i8** %addr
store i8* %x, i8** @S
ret void
}

View File

@ -1,8 +1,8 @@
; RUN: opt -passes=openmp-opt-cgscc -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
; RUN: opt -passes=openmp-opt -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; ModuleID = 'single_threaded_execution.c'
define void @kernel() {
define weak void @kernel() {
call void @__kmpc_kernel_init(i32 512, i16 1)
call void @nvptx()
call void @amdgcn()
@ -12,14 +12,15 @@ define void @kernel() {
; CHECK-NOT: [openmp-opt] Basic block @nvptx entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @nvptx if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @nvptx if.end is executed by a single thread.
; Function Attrs: noinline nounwind uwtable
define dso_local void @nvptx() {
; Function Attrs: noinline
define internal void @nvptx() {
entry:
%call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%cmp = icmp eq i32 %call, 0
br i1 %cmp, label %if.then, label %if.end
if.then:
call void @foo()
call void @bar()
br label %if.end
@ -30,14 +31,15 @@ if.end:
; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread.
; Function Attrs: noinline nounwind uwtable
define dso_local void @amdgcn() {
; Function Attrs: noinline
define internal void @amdgcn() {
entry:
%call = call i32 @llvm.amdgcn.workitem.id.x()
%cmp = icmp eq i32 %call, 0
br i1 %cmp, label %if.then, label %if.end
if.then:
call void @foo()
call void @bar()
br label %if.end
@ -45,9 +47,16 @@ if.end:
ret void
}
; CHECK: [openmp-opt] Basic block @bar entry is executed by a single thread.
; Function Attrs: noinline nounwind uwtable
define internal void @bar() {
; CHECK: [openmp-opt] Basic block @foo entry is executed by a single thread.
; Function Attrs: noinline
define internal void @foo() {
entry:
ret void
}
; CHECK: [openmp-opt] Basic block @bar.internalized entry is executed by a single thread.
; Function Attrs: noinline
define void @bar() {
entry:
ret void
}