[ARM][MachineOutliner] Do not overestimate LR liveness in return block

The `LiveRegUnits` utility (as well as `LivePhysRegs`) considers
callee-saved registers to be alive at the point after the return
instruction in a block. In the ARM backend, the `LR` register is
classified as callee-saved, which is not really correct (from an ARM
eABI or just common sense point of view).  These two conditions cause
the `MachineOutliner` to overestimate the liveness of `LR`, which
results in unnecessary saves/restores of `LR` around calls to outlined
sequences.  It also causes the `MachineVerifer` to crash in some
cases, because the save instruction reads a dead `LR`, for example
when the following program:

int h(int, int);

int f(int a, int b, int c, int d) {
  a = h(a + 1, b - 1);
  b = b + c;
  return 1 + (2 * a + b) * (c - d) / (a - b) * (c + d);
}

int g(int a, int b, int c, int d) {
  a = h(a - 1, b + 1);
  b = b + c;
  return 2 + (2 * a + b) * (c - d) / (a - b) * (c + d);
}

is compiled with `-target arm-eabi -march=armv7-m -Oz`.

This patch computes the liveness of `LR` in return blocks only, while
taking into account the few ARM instructions, which read `LR`, but
nevertheless the register is not mentioned (explicitly or implicitly)
in the instruction operands.

Differential Revision: https://reviews.llvm.org/D89189
This commit is contained in:
Momchil Velikov 2020-11-02 16:26:10 +00:00
parent 2fc704a0a5
commit 7360d6d921
3 changed files with 156 additions and 3 deletions

View File

@ -5663,6 +5663,37 @@ ARMBaseInstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
return 0u;
}
// Compute liveness of LR at the point after the interval [I, E), which
// denotes a *backward* iteration through instructions. Used only for return
// basic blocks, which do not end with a tail call.
static bool isLRAvailable(const TargetRegisterInfo &TRI,
MachineBasicBlock::reverse_iterator I,
MachineBasicBlock::reverse_iterator E) {
// At the end of the function LR dead.
bool Live = false;
for (; I != E; ++I) {
const MachineInstr &MI = *I;
// Check defs of LR.
if (MI.modifiesRegister(ARM::LR, &TRI))
Live = false;
// Check uses of LR.
unsigned Opcode = MI.getOpcode();
if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR ||
Opcode == ARM::SUBS_PC_LR || Opcode == ARM::tBX_RET ||
Opcode == ARM::tBXNS_RET) {
// These instructions use LR, but it's not an (explicit or implicit)
// operand.
Live = true;
continue;
}
if (MI.readsRegister(ARM::LR, &TRI))
Live = true;
}
return !Live;
}
outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo(
std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
@ -5755,8 +5786,15 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo(
for (outliner::Candidate &C : RepeatedSequenceLocs) {
C.initLRU(TRI);
// Is LR available? If so, we don't need a save.
if (C.LRU.available(ARM::LR)) {
// LR liveness is overestimated in return blocks, unless they end with a
// tail call.
const auto Last = C.getMBB()->rbegin();
const bool LRIsAvailable =
C.getMBB()->isReturnBlock() && !Last->isCall()
? isLRAvailable(TRI, Last,
(MachineBasicBlock::reverse_iterator)C.front())
: C.LRU.available(ARM::LR);
if (LRIsAvailable) {
FrameID = MachineOutlinerNoLRSave;
NumBytesNoStackCalls += Costs.CallNoLRSave;
C.setCallInfo(MachineOutlinerNoLRSave, Costs.CallNoLRSave);
@ -5867,7 +5905,13 @@ bool ARMBaseInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
Flags |= MachineOutlinerMBBFlags::HasCalls;
if (!LRU.available(ARM::LR))
// LR liveness is overestimated in return blocks.
bool LRIsAvailable =
MBB.isReturnBlock() && !MBB.back().isCall()
? isLRAvailable(getRegisterInfo(), MBB.rbegin(), MBB.rend())
: LRU.available(ARM::LR);
if (!LRIsAvailable)
Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
return true;

View File

@ -0,0 +1,57 @@
; RUN: llc --verify-machineinstrs %s -o - | FileCheck %s
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv7m-unknown-unknown-eabi"
declare dso_local i32 @h0(i32, i32) local_unnamed_addr #1
define dso_local i32 @f(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 {
entry:
%add = add nsw i32 %a, 1
%sub = add nsw i32 %b, -1
%call = tail call i32 @h0(i32 %add, i32 %sub) #0
%add1 = add nsw i32 %c, %b
%mul = shl nsw i32 %call, 1
%add2 = add nsw i32 %mul, %add1
%sub3 = sub nsw i32 %c, %d
%mul4 = mul nsw i32 %add2, %sub3
%sub5 = sub nsw i32 %call, %add1
%div = sdiv i32 %mul4, %sub5
%add6 = add nsw i32 %d, %c
%mul7 = mul nsw i32 %div, %add6
%add8 = add nsw i32 %mul7, 1
ret i32 %add8
}
; CHECK-LABEL: f:
; CHECK: bl h0
; CHECK-NEXT: bl OUTLINED_FUNCTION_0
; CHECK-NEXT: adds r0, #1
; CHECK-NEXT: pop {r4, r5, r6, pc}
define dso_local i32 @g(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 {
entry:
%sub = add nsw i32 %a, -1
%add = add nsw i32 %b, 1
%call = tail call i32 @h0(i32 %sub, i32 %add) #0
%add1 = add nsw i32 %c, %b
%mul = shl nsw i32 %call, 1
%add2 = add nsw i32 %mul, %add1
%sub3 = sub nsw i32 %c, %d
%mul4 = mul nsw i32 %add2, %sub3
%sub5 = sub nsw i32 %call, %add1
%div = sdiv i32 %mul4, %sub5
%add6 = add nsw i32 %d, %c
%mul7 = mul nsw i32 %div, %add6
%add8 = add nsw i32 %mul7, 2
ret i32 %add8
}
; CHECK-LABEL: g:
; CHECK: bl h0
; CHECK-NEXT: bl OUTLINED_FUNCTION_0
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: pop {r4, r5, r6, pc}
attributes #0 = { minsize nounwind optsize }
attributes #1 = { minsize optsize }

View File

@ -0,0 +1,52 @@
; RUN: llc -verify-machineinstrs %s -o - | FileCheck %s
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv7m-unknown-unknown-eabi"
declare dso_local i32 @t(i32) local_unnamed_addr #0
define dso_local i32 @f(i32 %a, i32 %b, i32 %c) local_unnamed_addr #0 {
entry:
%mul = mul nsw i32 %a, 3
%add = add nsw i32 %mul, 1
%sub = add nsw i32 %b, -1
%div = sdiv i32 %add, %sub
%sub1 = sub nsw i32 %a, %c
%div2 = sdiv i32 %div, %sub1
%mul3 = mul nsw i32 %div2, %b
%add4 = add nsw i32 %mul3, 1
%call = tail call i32 @t(i32 %add4) #0
ret i32 %call
}
; CHECK-LABEL: f:
; CHECK: str lr, [sp, #-8]!
; CHECK-NEXT: bl OUTLINED_FUNCTION_0
; CHECK-NEXT: ldr lr, [sp], #8
; CHECK-NEXT: adds r0, #1
; CHECK-NEXT: b t
define dso_local i32 @g(i32 %a, i32 %b, i32 %c) local_unnamed_addr #0 {
entry:
%mul = mul nsw i32 %a, 3
%add = add nsw i32 %mul, 1
%sub = add nsw i32 %b, -1
%div = sdiv i32 %add, %sub
%sub1 = sub nsw i32 %a, %c
%div2 = sdiv i32 %div, %sub1
%mul3 = mul nsw i32 %div2, %b
%add4 = add nsw i32 %mul3, 3
%call = tail call i32 @t(i32 %add4) #0
ret i32 %call
}
; CHECK-LABEL: g:
; CHECK: str lr, [sp, #-8]!
; CHECK-NEXT: bl OUTLINED_FUNCTION_0
; CHECK-NEXT: ldr lr, [sp], #8
; CHECK-NEXT: adds r0, #3
; CHECK-NEXT: b t
; CHECK-LABEL: OUTLINED_FUNCTION_0:
; CHECK-NOT: lr
; CHECK: bx lr
attributes #0 = { minsize nounwind optsize }