Support repeated machine outlining

Summary: This change allows machine outlining to be applied N times, where N is specified by the compiler option -machine-outline-runs. By default, N is 1. The motivation is that repeated machine outlining can further reduce code size. Please refer to the presentation "Improving Swift Binary Size via Link Time Optimization" from the 2019 LLVM Developers' Meeting.

Reviewers: aschwaighofer, tellenbach, paquette

Reviewed By: paquette

Subscribers: tellenbach, hiraditya, llvm-commits, jinlin

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71027
This commit is contained in:
Jin Lin 2020-03-17 15:40:26 -07:00
parent e6a74803d4
commit 0d896278c8
2 changed files with 193 additions and 3 deletions

View File

@ -97,6 +97,13 @@ static cl::opt<bool> EnableLinkOnceODROutlining(
cl::desc("Enable the machine outliner on linkonceodr functions"),
cl::init(false));
// Set the number of times to repeatedly apply outlining.
// Defaults to 1, but more repetitions can save additional size.
// MachineOutliner::runOnModule reports a fatal error for values below 1 and
// stops before reaching this count once a run makes no changes.
static cl::opt<unsigned>
NumRepeat("machine-outline-runs", cl::Hidden,
cl::desc("The number of times to apply machine outlining"),
cl::init(1));
namespace {
/// Represents an undefined index in the suffix tree.
@ -842,6 +849,9 @@ struct MachineOutliner : public ModulePass {
/// linkonceodr linkage.
bool OutlineFromLinkOnceODRs = false;
/// The current repeat number of machine outlining.
unsigned OutlineRepeatedNum = 0;
/// Set to true if the outliner should run on all functions in the module
/// considered safe for outlining.
/// Set to true by default for compatibility with llc's -run-pass option.
@ -900,9 +910,12 @@ struct MachineOutliner : public ModulePass {
InstructionMapper &Mapper,
unsigned Name);
/// Calls 'doOutline()'.
/// Calls runOnceOnModule NumRepeat times
bool runOnModule(Module &M) override;
/// Calls 'doOutline()'.
bool runOnceOnModule(Module &M, unsigned Iter);
/// Construct a suffix tree on the instructions in \p M and outline repeated
/// strings from that tree.
bool doOutline(Module &M, unsigned &OutlinedFunctionNum);
@ -1099,7 +1112,13 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
// Create the function name. This should be unique.
// FIXME: We should have a better naming scheme. This should be stable,
// regardless of changes to the outliner's cost model/traversal order.
std::string FunctionName = ("OUTLINED_FUNCTION_" + Twine(Name)).str();
std::string FunctionName;
if (OutlineRepeatedNum > 0)
FunctionName = ("OUTLINED_FUNCTION_" + Twine(OutlineRepeatedNum + 1) + "_" +
Twine(Name))
.str();
else
FunctionName = ("OUTLINED_FUNCTION_" + Twine(Name)).str();
// Create the function using an IR-level function.
LLVMContext &C = M.getContext();
@ -1438,12 +1457,14 @@ void MachineOutliner::emitInstrCountChangedRemark(
}
}
bool MachineOutliner::runOnModule(Module &M) {
bool MachineOutliner::runOnceOnModule(Module &M, unsigned Iter) {
// Check if there's anything in the module. If it's empty, then there's
// nothing to outline.
if (M.empty())
return false;
OutlineRepeatedNum = Iter;
// Number to append to the current outlined function.
unsigned OutlinedFunctionNum = 0;
@ -1507,3 +1528,23 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
return OutlinedSomething;
}
// Drive the outliner: invoke runOnceOnModule up to NumRepeat times, passing
// the zero-based round index, and stop as soon as a round reports no change.
// Returns true if at least one round changed the module.
bool MachineOutliner::runOnModule(Module &M) {
  // A request for fewer than one run is rejected outright.
  if (NumRepeat < 1)
    report_fatal_error("Expect NumRepeat for machine outlining "
                       "to be greater than or equal to 1!\n");

  // Round ends up as the number of rounds that reported a change; it is also
  // the index of the first round that reported none, if there was one.
  unsigned Round = 0;
  while (Round < NumRepeat && runOnceOnModule(M, Round))
    ++Round;

  if (Round < NumRepeat) {
    // Left the loop early: the round with index Round found nothing.
    LLVM_DEBUG(dbgs() << "Stopped outlining at iteration " << Round
                      << " because no changes were found.\n";);
  } else {
    // Every requested round changed the module.
    LLVM_DEBUG(dbgs() << "Stopped outlining because iteration is "
                         "equal to " << NumRepeat << "\n";);
  }

  // The module changed exactly when at least one round succeeded.
  return Round > 0;
}

View File

@ -0,0 +1,149 @@
# RUN: llc -mtriple=aarch64 -run-pass=machine-outliner -machine-outline-runs=2 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix TWO-RUNS
# RUN: llc -mtriple=aarch64 -run-pass=machine-outliner -machine-outline-runs=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix ONE-RUN
# RUN: llc -mtriple=aarch64 -run-pass=machine-outliner -machine-outline-runs=4 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix FOUR-RUNS
# Example of Repeated Instruction Sequence - Iterative Machine Outlining
#
#; define void @"$s12"(...) { define i64 @"$s5"(...) { define void @"$s13"(...) {
# ... ... ...
# %8 = load i1, i1* %7 %8 = load i1, i1* %7
# %9 = load i4, i4*, %6 %9 = load i4, i4*, %6 %9 = load i4, i4*, %6
# store i4 %9, i4* %5 store i4 %9, i4* %5 store i4 %9, i4* %5
# ... ... ...
# } } }
#
# After machine outliner (1st time)
#
# define void @"$s12"(...) { define i64 @"$s5"(...) { define void @"$s13"(...) {
# ... ... ...
# %8 = load i1, i1* %7 %8 = load i1, i1* %7
# call void @outlined_function_1_1 call void @outlined_function_1_1 call void @outlined_function_1_1
# ... ... ...
# } } }
#
# After machine outliner (2nd time)
#
# define void @"$s12"(...) { define i64 @"$s5"(...) { define void @"$s13"(...) {
# ... ... ...
# call void @outlined_function_2_1 call void @outlined_function_1_1 call void @outlined_function_2_1
# ... ... ...
# } } }
#
# Check whether the machine outliner can find further outlining opportunities
# after machine outlining has already been performed.
#
--- |
declare void @foo() local_unnamed_addr
declare void @widget() local_unnamed_addr
; Function Attrs: minsize noredzone optsize
define void @baz.14() #0 {
ret void
}
; Function Attrs: minsize noredzone optsize
define void @baz.15() #0 {
ret void
}
; Function Attrs: minsize noredzone optsize
define void @baz.16() #0 {
ret void
}
attributes #0 = { minsize noredzone optsize }
...
---
name: baz.14
tracksRegLiveness: true
stack:
- { id: 0, offset: -8, size: 8 }
- { id: 1, offset: -16, size: 8 }
body: |
bb.0:
liveins: $x0, $x19, $lr
early-clobber $sp = frame-setup STPXpre killed $lr, killed $x19, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0)
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $w19, -8
frame-setup CFI_INSTRUCTION offset $w30, -16
renamable $x19 = COPY $x0
renamable $x0 = nuw ADDXri $x0, 48, 0
$x1 = ADDXri $sp, 0, 0
dead $w2 = MOVi32imm 33, implicit-def $x2
$x3 = COPY $xzr
BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit killed $x1, implicit killed $x2, implicit killed $x3, implicit-def $sp
$x0 = COPY killed renamable $x19
BL @widget, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
early-clobber $sp, $lr, $x19 = frame-destroy LDPXpost $sp, 2 :: (load 8 from %stack.1), (load 8 from %stack.0)
RET_ReallyLR
...
---
name: baz.15
tracksRegLiveness: true
stack:
- { id: 0, offset: -8, size: 8 }
- { id: 1, offset: -16, size: 8 }
body: |
bb.0:
liveins: $x0, $x19, $lr
early-clobber $sp = frame-setup STPXpre killed $lr, killed $x19, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0)
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $w19, -8
frame-setup CFI_INSTRUCTION offset $w30, -16
renamable $x19 = COPY $x0
renamable $x0 = nuw ADDXri killed renamable $x0, 16, 0
$x1 = ADDXri $sp, 0, 0
dead $w2 = MOVi32imm 33, implicit-def $x2
$x3 = COPY $xzr
BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit $x1, implicit killed $x2, implicit killed $x3, implicit-def $sp
$x0 = COPY killed renamable $x19
BL @widget, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
early-clobber $sp, $lr, $x19 = frame-destroy LDPXpost $sp, 2 :: (load 8 from %stack.1), (load 8 from %stack.0)
RET_ReallyLR
...
---
name: baz.16
tracksRegLiveness: true
stack:
- { id: 0, offset: -8, size: 8 }
- { id: 1, offset: -16, size: 8 }
body: |
bb.0:
liveins: $x0, $x19, $lr
early-clobber $sp = frame-setup STPXpre killed $lr, killed $x19, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0)
frame-setup CFI_INSTRUCTION def_cfa_offset 16
frame-setup CFI_INSTRUCTION offset $w19, -8
frame-setup CFI_INSTRUCTION offset $w30, -16
renamable $x19 = COPY $x0
renamable $x0 = nuw ADDXri $x0, 48, 0
$x1 = ADDXri $sp, 0, 0
dead $w2 = MOVi32imm 33, implicit-def $x2
$x3 = COPY $xzr
BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit killed $x1, implicit killed $x2, implicit killed $x3, implicit-def $sp
$x0 = COPY killed renamable $x19
BL @widget, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
early-clobber $sp, $lr, $x19 = frame-destroy LDPXpost $sp, 2 :: (load 8 from %stack.1), (load 8 from %stack.0)
RET_ReallyLR
...
# TWO-RUNS: name: OUTLINED_FUNCTION_2_0
# TWO-RUNS-DAG: bb.0:
# TWO-RUNS-DAG: renamable $x19 = COPY $x0
# TWO-RUNS-NEXT: renamable $x0 = nuw ADDXri $x0, 48, 0
# TWO-RUNS-NEXT: TCRETURNdi @OUTLINED_FUNCTION_0, 0, implicit $sp
#
# The machine outliner is expected to stop at the 1st iteration for case ONE-RUN
# since machine-outline-runs is specified as 1.
# ONE-RUN-NOT: [[OUTLINED:OUTLINED_FUNCTION_2_[0-9]+]]
#
# The machine outliner is expected to stop at the 3rd iteration for case FOUR-RUNS
# since the MIR has no change at the 3rd iteration.
# FOUR-RUNS-NOT: [[OUTLINED:OUTLINED_FUNCTION_3_[0-9]+]]
# FOUR-RUNS-NOT: [[OUTLINED:OUTLINED_FUNCTION_4_[0-9]+]]