[OpenMP] Internalize functions in OpenMPOpt to improve IPO passes
Summary:
Currently the Attributor needs to give up if a function has external linkage, which means the optimization introduced in D97818 only applies to static functions. This change uses the Attributor to internalize OpenMP device routines by making a copy of each function with private linkage and replacing its uses in the module, so the optimization can be applied to any regular function.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D102824
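The internalization idea can be sketched roughly as follows. This is a minimal, hypothetical illustration built on LLVM's generic CloneFunction utility rather than the actual Attributor::internalizeFunction implementation from this patch; the helper name makeInternalCopy is made up for the example.

// Minimal sketch (assumed helper, not the patch's code): clone an eligible
// device routine, give the copy internal linkage, and redirect all uses in
// this module so IPO passes can analyze it without external-linkage concerns.
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

static Function *makeInternalCopy(Function &F) {
  // Declarations, already-local functions, and interposable definitions
  // cannot be copied and redirected safely.
  if (F.isDeclaration() || F.hasLocalLinkage() ||
      GlobalValue::isInterposableLinkage(F.getLinkage()))
    return nullptr;

  // Clone the body into a new function that lives in the same module.
  ValueToValueMapTy VMap;
  Function *Copied = CloneFunction(&F, VMap);
  Copied->setName(F.getName() + ".internalized");

  // The copy is private to this module, so the Attributor may freely derive
  // and exploit attributes for it.
  Copied->setLinkage(GlobalValue::InternalLinkage);

  // Route every use in this module to the copy; the original definition stays
  // behind for any other translation unit that references the symbol.
  F.replaceAllUsesWith(Copied);
  return Copied;
}

The actual patch adds this functionality as Attributor::internalizeFunction(Function &F, bool Force), and OpenMPOpt forces it for every non-kernel device function, as the diff below shows.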
commit 03d7e61c87 (parent c747b7d1d9)

@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
-// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
 
 // host-no-diagnostics
 
 void bar1(void) {
 #pragma omp parallel // #0
 // all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
-// safe-remark@#0 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
+// safe-remark@#0 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
 // force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: <NONE>}}
   {
   }
@@ -15,7 +15,7 @@ void bar1(void) {
 void bar2(void) {
 #pragma omp parallel // #1
 // all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
-// safe-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
+// safe-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
 // force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: <NONE>}}
   {
   }

@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -verify=host -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
-// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
 
 // host-no-diagnostics
 
 void bar(void) {
 #pragma omp parallel // #1 \
 // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
-// expected-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
+// expected-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
   {
   }
 }

@@ -1083,13 +1083,15 @@ struct Attributor {
   /// the abstract attributes.
   /// \param CGUpdater Helper to update an underlying call graph.
   /// \param Allowed If not null, a set limiting the attribute opportunities.
-  /// \param DeleteFns Whether to delete functions
+  /// \param DeleteFns Whether to delete functions.
+  /// \param RewriteSignatures Whether to rewrite function signatures.
   Attributor(SetVector<Function *> &Functions, InformationCache &InfoCache,
              CallGraphUpdater &CGUpdater,
-             DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true)
+             DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true,
+             bool RewriteSignatures = true)
       : Allocator(InfoCache.Allocator), Functions(Functions),
        InfoCache(InfoCache), CGUpdater(CGUpdater), Allowed(Allowed),
-        DeleteFns(DeleteFns) {}
+        DeleteFns(DeleteFns), RewriteSignatures(RewriteSignatures) {}
 
   ~Attributor();
@@ -1665,6 +1667,21 @@ public:
   ///
   static void createShallowWrapper(Function &F);
 
+  /// Make another copy of the function \p F such that the copied version has
+  /// internal linkage afterwards and can be analysed. Then we replace all uses
+  /// of the original function to the copied one
+  ///
+  /// Only non-locally linked functions that have `linkonce_odr` or `weak_odr`
+  /// linkage can be internalized because these linkages guarantee that other
+  /// definitions with the same name have the same semantics as this one.
+  ///
+  /// This will only be run if the `attributor-allow-deep-wrappers` option is
+  /// set, or if the function is called with \p Force set to true.
+  ///
+  /// If the function \p F failed to be internalized the return value will be a
+  /// null pointer.
+  static Function *internalizeFunction(Function &F, bool Force = false);
+
   /// Return the data layout associated with the anchor scope.
   const DataLayout &getDataLayout() const { return InfoCache.DL; }
@@ -1777,6 +1794,9 @@ private:
   /// Whether to delete functions.
   const bool DeleteFns;
 
+  /// Whether to rewrite signatures.
+  const bool RewriteSignatures;
+
   /// A set to remember the functions we already assume to be live and visited.
   DenseSet<const Function *> VisitedFunctions;

@@ -1621,19 +1621,12 @@ void Attributor::createShallowWrapper(Function &F) {
   NumFnShallowWrappersCreated++;
 }
 
-/// Make another copy of the function \p F such that the copied version has
-/// internal linkage afterwards and can be analysed. Then we replace all uses
-/// of the original function to the copied one
-///
-/// Only non-exactly defined functions that have `linkonce_odr` or `weak_odr`
-/// linkage can be internalized because these linkages guarantee that other
-/// definitions with the same name have the same semantics as this one
-///
-static Function *internalizeFunction(Function &F) {
-  assert(AllowDeepWrapper && "Cannot create a copy if not allowed.");
-  assert(!F.isDeclaration() && !F.hasExactDefinition() &&
-         !GlobalValue::isInterposableLinkage(F.getLinkage()) &&
-         "Trying to internalize function which cannot be internalized.");
+Function *Attributor::internalizeFunction(Function &F, bool Force) {
+  if (!AllowDeepWrapper && !Force)
+    return nullptr;
+  if (F.isDeclaration() || F.hasLocalLinkage() ||
+      GlobalValue::isInterposableLinkage(F.getLinkage()))
+    return nullptr;
 
   Module &M = *F.getParent();
   FunctionType *FnTy = F.getFunctionType();
@@ -1663,7 +1656,8 @@ static Function *internalizeFunction(Function &F) {
   SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
   F.getAllMetadata(MDs);
   for (auto MDIt : MDs)
-    Copied->addMetadata(MDIt.first, *MDIt.second);
+    if (!Copied->hasMetadata())
+      Copied->addMetadata(MDIt.first, *MDIt.second);
 
   M.getFunctionList().insert(F.getIterator(), Copied);
   F.replaceAllUsesWith(Copied);
@@ -1675,6 +1669,9 @@ static Function *internalizeFunction(Function &F) {
 bool Attributor::isValidFunctionSignatureRewrite(
     Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
 
+  if (!RewriteSignatures)
+    return false;
+
   auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
     // Forbid the call site to cast the function return type. If we need to
     // rewrite these functions we need to re-create a cast for the new call site
@@ -2459,7 +2456,8 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache,
       Function *F = Functions[u];
       if (!F->isDeclaration() && !F->isDefinitionExact() && F->getNumUses() &&
           !GlobalValue::isInterposableLinkage(F->getLinkage())) {
-        Function *NewF = internalizeFunction(*F);
+        Function *NewF = Attributor::internalizeFunction(*F);
+        assert(NewF && "Could not internalize function.");
         Functions.insert(NewF);
 
         // Update call graph

@@ -1623,9 +1623,9 @@ private:
     };
     GlobalizationRFI.foreachUse(SCC, CreateAA);
 
-    for (auto &F : M) {
-      if (!F.isDeclaration())
-        A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
+    for (auto *F : SCC) {
+      if (!F->isDeclaration())
+        A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
     }
   }
 };
@@ -2620,11 +2620,19 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
   if (DisableOpenMPOptimizations)
     return PreservedAnalyses::all();
 
-  // Look at every function definition in the Module.
+  // Create internal copies of each function if this is a kernel Module.
+  DenseSet<const Function *> InternalizedFuncs;
+  if (!OMPInModule.getKernels().empty())
+    for (Function &F : M)
+      if (!F.isDeclaration() && !OMPInModule.getKernels().contains(&F))
+        if (Attributor::internalizeFunction(F, /* Force */ true))
+          InternalizedFuncs.insert(&F);
+
+  // Look at every function definition in the Module that wasn't internalized.
   SmallVector<Function *, 16> SCC;
-  for (Function &Fn : M)
-    if (!Fn.isDeclaration())
-      SCC.push_back(&Fn);
+  for (Function &F : M)
+    if (!F.isDeclaration() && !InternalizedFuncs.contains(&F))
+      SCC.push_back(&F);
 
   if (SCC.empty())
     return PreservedAnalyses::all();
@@ -2645,7 +2653,7 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
   OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions,
                                 OMPInModule.getKernels());
 
-  Attributor A(Functions, InfoCache, CGUpdater);
+  Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false);
 
   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
   bool Changed = OMPOpt.run(true);

@@ -2,6 +2,8 @@
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64"
 
+@S = external local_unnamed_addr global i8*
+
 ; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
 ; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
 
@@ -67,7 +69,7 @@ exit:
 define void @use(i8* %x) {
 entry:
   %addr = alloca i8*
-  store i8* %x, i8** %addr
+  store i8* %x, i8** @S
   ret void
 }

@@ -1,8 +1,8 @@
-; RUN: opt -passes=openmp-opt-cgscc -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes=openmp-opt -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ; ModuleID = 'single_threaded_exeuction.c'
 
-define void @kernel() {
+define weak void @kernel() {
   call void @__kmpc_kernel_init(i32 512, i16 1)
   call void @nvptx()
   call void @amdgcn()
@@ -12,14 +12,15 @@ define void @kernel() {
 ; CHECK-NOT: [openmp-opt] Basic block @nvptx entry is executed by a single thread.
 ; CHECK: [openmp-opt] Basic block @nvptx if.then is executed by a single thread.
 ; CHECK-NOT: [openmp-opt] Basic block @nvptx if.end is executed by a single thread.
-; Function Attrs: noinline nounwind uwtable
-define dso_local void @nvptx() {
+; Function Attrs: noinline
+define internal void @nvptx() {
 entry:
   %call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %cmp = icmp eq i32 %call, 0
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:
+  call void @foo()
   call void @bar()
   br label %if.end
 
@@ -30,14 +31,15 @@ if.end:
 ; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread.
 ; CHECK: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread.
 ; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread.
-; Function Attrs: noinline nounwind uwtable
-define dso_local void @amdgcn() {
+; Function Attrs: noinline
+define internal void @amdgcn() {
 entry:
   %call = call i32 @llvm.amdgcn.workitem.id.x()
   %cmp = icmp eq i32 %call, 0
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:
+  call void @foo()
   call void @bar()
   br label %if.end
 
@@ -45,9 +47,16 @@ if.end:
   ret void
 }
 
-; CHECK: [openmp-opt] Basic block @bar entry is executed by a single thread.
-; Function Attrs: noinline nounwind uwtable
-define internal void @bar() {
+; CHECK: [openmp-opt] Basic block @foo entry is executed by a single thread.
+; Function Attrs: noinline
+define internal void @foo() {
 entry:
   ret void
 }
+
+; CHECK: [openmp-opt] Basic block @bar.internalized entry is executed by a single thread.
+; Function Attrs: noinline
+define void @bar() {
+entry:
+  ret void
+}