[OpenMP] Internalize functions in OpenMPOpt to improve IPO passes

Summary:
Currently the attributor needs to give up if a function has external linkage.
This means that the optimization introduced in D97818 will only apply to static
functions. This change uses the Attributor to internalize OpenMP device
routines by making a copy of each function with private linkage and replacing
the uses in the module with it. This allows for the optimization to be applied
to any regular function.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D102824
This commit is contained in:
Joseph Huber 2021-05-19 22:57:24 -04:00 committed by Huber, Joseph
parent c747b7d1d9
commit 03d7e61c87
7 changed files with 82 additions and 45 deletions

View File

@ -1,13 +1,13 @@
// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// host-no-diagnostics
void bar1(void) {
#pragma omp parallel // #0
// all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
// safe-remark@#0 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
// safe-remark@#0 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
// force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: <NONE>}}
{
}
@ -15,7 +15,7 @@ void bar1(void) {
void bar2(void) {
#pragma omp parallel // #1
// all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
// safe-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
// safe-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
// force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: <NONE>}}
{
}

View File

@ -1,13 +1,13 @@
// RUN: %clang_cc1 -verify=host -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// host-no-diagnostics
void bar(void) {
#pragma omp parallel // #1 \
// expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
// expected-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
// expected-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
{
}
}

View File

@ -1083,13 +1083,15 @@ struct Attributor {
/// the abstract attributes.
/// \param CGUpdater Helper to update an underlying call graph.
/// \param Allowed If not null, a set limiting the attribute opportunities.
/// \param DeleteFns Whether to delete functions
/// \param DeleteFns Whether to delete functions.
/// \param RewriteSignatures Whether to rewrite function signatures.
Attributor(SetVector<Function *> &Functions, InformationCache &InfoCache,
CallGraphUpdater &CGUpdater,
DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true)
DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true,
bool RewriteSignatures = true)
: Allocator(InfoCache.Allocator), Functions(Functions),
InfoCache(InfoCache), CGUpdater(CGUpdater), Allowed(Allowed),
DeleteFns(DeleteFns) {}
DeleteFns(DeleteFns), RewriteSignatures(RewriteSignatures) {}
~Attributor();
@ -1665,6 +1667,21 @@ public:
///
static void createShallowWrapper(Function &F);
/// Make another copy of the function \p F such that the copied version has
/// internal linkage afterwards and can be analysed. Then we replace all uses
/// of the original function with the copied one.
///
/// Only non-locally linked functions that have `linkonce_odr` or `weak_odr`
/// linkage can be internalized because these linkages guarantee that other
/// definitions with the same name have the same semantics as this one.
///
/// This will only be run if the `attributor-allow-deep-wrappers` option is
/// set, or if the function is called with \p Force set to true.
///
/// If the function \p F fails to be internalized, the return value will be a
/// null pointer.
static Function *internalizeFunction(Function &F, bool Force = false);
/// Return the data layout associated with the anchor scope.
const DataLayout &getDataLayout() const { return InfoCache.DL; }
@ -1777,6 +1794,9 @@ private:
/// Whether to delete functions.
const bool DeleteFns;
/// Whether to rewrite signatures.
const bool RewriteSignatures;
/// A set to remember the functions we already assume to be live and visited.
DenseSet<const Function *> VisitedFunctions;

View File

@ -1621,19 +1621,12 @@ void Attributor::createShallowWrapper(Function &F) {
NumFnShallowWrappersCreated++;
}
/// Make another copy of the function \p F such that the copied version has
/// internal linkage afterwards and can be analysed. Then we replace all uses
/// of the original function to the copied one
///
/// Only non-exactly defined functions that have `linkonce_odr` or `weak_odr`
/// linkage can be internalized because these linkages guarantee that other
/// definitions with the same name have the same semantics as this one
///
static Function *internalizeFunction(Function &F) {
assert(AllowDeepWrapper && "Cannot create a copy if not allowed.");
assert(!F.isDeclaration() && !F.hasExactDefinition() &&
!GlobalValue::isInterposableLinkage(F.getLinkage()) &&
"Trying to internalize function which cannot be internalized.");
Function *Attributor::internalizeFunction(Function &F, bool Force) {
if (!AllowDeepWrapper && !Force)
return nullptr;
if (F.isDeclaration() || F.hasLocalLinkage() ||
GlobalValue::isInterposableLinkage(F.getLinkage()))
return nullptr;
Module &M = *F.getParent();
FunctionType *FnTy = F.getFunctionType();
@ -1663,7 +1656,8 @@ static Function *internalizeFunction(Function &F) {
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
F.getAllMetadata(MDs);
for (auto MDIt : MDs)
Copied->addMetadata(MDIt.first, *MDIt.second);
if (!Copied->hasMetadata())
Copied->addMetadata(MDIt.first, *MDIt.second);
M.getFunctionList().insert(F.getIterator(), Copied);
F.replaceAllUsesWith(Copied);
@ -1675,6 +1669,9 @@ static Function *internalizeFunction(Function &F) {
bool Attributor::isValidFunctionSignatureRewrite(
Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
if (!RewriteSignatures)
return false;
auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
// Forbid the call site to cast the function return type. If we need to
// rewrite these functions we need to re-create a cast for the new call site
@ -2459,7 +2456,8 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache,
Function *F = Functions[u];
if (!F->isDeclaration() && !F->isDefinitionExact() && F->getNumUses() &&
!GlobalValue::isInterposableLinkage(F->getLinkage())) {
Function *NewF = internalizeFunction(*F);
Function *NewF = Attributor::internalizeFunction(*F);
assert(NewF && "Could not internalize function.");
Functions.insert(NewF);
// Update call graph

View File

@ -1623,9 +1623,9 @@ private:
};
GlobalizationRFI.foreachUse(SCC, CreateAA);
for (auto &F : M) {
if (!F.isDeclaration())
A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
for (auto *F : SCC) {
if (!F->isDeclaration())
A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
}
}
};
@ -2620,11 +2620,19 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
if (DisableOpenMPOptimizations)
return PreservedAnalyses::all();
// Look at every function definition in the Module.
// Create internal copies of each function if this is a kernel Module.
DenseSet<const Function *> InternalizedFuncs;
if (!OMPInModule.getKernels().empty())
for (Function &F : M)
if (!F.isDeclaration() && !OMPInModule.getKernels().contains(&F))
if (Attributor::internalizeFunction(F, /* Force */ true))
InternalizedFuncs.insert(&F);
// Look at every function definition in the Module that wasn't internalized.
SmallVector<Function *, 16> SCC;
for (Function &Fn : M)
if (!Fn.isDeclaration())
SCC.push_back(&Fn);
for (Function &F : M)
if (!F.isDeclaration() && !InternalizedFuncs.contains(&F))
SCC.push_back(&F);
if (SCC.empty())
return PreservedAnalyses::all();
@ -2645,7 +2653,7 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions,
OMPInModule.getKernels());
Attributor A(Functions, InfoCache, CGUpdater);
Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false);
OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
bool Changed = OMPOpt.run(true);

View File

@ -2,6 +2,8 @@
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64"
@S = external local_unnamed_addr global i8*
; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
@ -67,7 +69,7 @@ exit:
define void @use(i8* %x) {
entry:
%addr = alloca i8*
store i8* %x, i8** %addr
store i8* %x, i8** @S
ret void
}

View File

@ -1,8 +1,8 @@
; RUN: opt -passes=openmp-opt-cgscc -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
; RUN: opt -passes=openmp-opt -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; ModuleID = 'single_threaded_execution.c'
define void @kernel() {
define weak void @kernel() {
call void @__kmpc_kernel_init(i32 512, i16 1)
call void @nvptx()
call void @amdgcn()
@ -12,14 +12,15 @@ define void @kernel() {
; CHECK-NOT: [openmp-opt] Basic block @nvptx entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @nvptx if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @nvptx if.end is executed by a single thread.
; Function Attrs: noinline nounwind uwtable
define dso_local void @nvptx() {
; Function Attrs: noinline
define internal void @nvptx() {
entry:
%call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%cmp = icmp eq i32 %call, 0
br i1 %cmp, label %if.then, label %if.end
if.then:
call void @foo()
call void @bar()
br label %if.end
@ -30,14 +31,15 @@ if.end:
; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread.
; Function Attrs: noinline nounwind uwtable
define dso_local void @amdgcn() {
; Function Attrs: noinline
define internal void @amdgcn() {
entry:
%call = call i32 @llvm.amdgcn.workitem.id.x()
%cmp = icmp eq i32 %call, 0
br i1 %cmp, label %if.then, label %if.end
if.then:
call void @foo()
call void @bar()
br label %if.end
@ -45,9 +47,16 @@ if.end:
ret void
}
; CHECK: [openmp-opt] Basic block @bar entry is executed by a single thread.
; Function Attrs: noinline nounwind uwtable
define internal void @bar() {
; CHECK: [openmp-opt] Basic block @foo entry is executed by a single thread.
; Function Attrs: noinline
define internal void @foo() {
entry:
ret void
}
; CHECK: [openmp-opt] Basic block @bar.internalized entry is executed by a single thread.
; Function Attrs: noinline
define void @bar() {
entry:
ret void
}