From 7360d6d921c67e92cc82635112e72fb2a59aacf4 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 2 Nov 2020 16:26:10 +0000 Subject: [PATCH] [ARM][MachineOutliner] Do not overestimate LR liveness in return block The `LiveRegUnits` utility (as well as `LivePhysRegs`) considers callee-saved registers to be alive at the point after the return instruction in a block. In the ARM backend, the `LR` register is classified as callee-saved, which is not really correct (from an ARM eABI or just common sense point of view). These two conditions cause the `MachineOutliner` to overestimate the liveness of `LR`, which results in unnecessary saves/restores of `LR` around calls to outlined sequences. It also causes the `MachineVerifer` to crash in some cases, because the save instruction reads a dead `LR`, for example when the following program: int h(int, int); int f(int a, int b, int c, int d) { a = h(a + 1, b - 1); b = b + c; return 1 + (2 * a + b) * (c - d) / (a - b) * (c + d); } int g(int a, int b, int c, int d) { a = h(a - 1, b + 1); b = b + c; return 2 + (2 * a + b) * (c - d) / (a - b) * (c + d); } is compiled with `-target arm-eabi -march=armv7-m -Oz`. This patch computes the liveness of `LR` in return blocks only, while taking into account the few ARM instructions, which read `LR`, but nevertheless the register is not mentioned (explicitly or implicitly) in the instruction operands. Differential Revision: https://reviews.llvm.org/D89189 --- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 50 +++++++++++++++- .../CodeGen/ARM/machine-outliner-return-1.ll | 57 +++++++++++++++++++ .../CodeGen/ARM/machine-outliner-return-2.ll | 52 +++++++++++++++++ 3 files changed, 156 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/ARM/machine-outliner-return-1.ll create mode 100644 llvm/test/CodeGen/ARM/machine-outliner-return-2.ll diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index c0d5ae4dec27..6844b02b8528 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -5663,6 +5663,37 @@ ARMBaseInstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { return 0u; } +// Compute liveness of LR at the point after the interval [I, E), which +// denotes a *backward* iteration through instructions. Used only for return +// basic blocks, which do not end with a tail call. +static bool isLRAvailable(const TargetRegisterInfo &TRI, + MachineBasicBlock::reverse_iterator I, + MachineBasicBlock::reverse_iterator E) { + // At the end of the function LR dead. + bool Live = false; + for (; I != E; ++I) { + const MachineInstr &MI = *I; + + // Check defs of LR. + if (MI.modifiesRegister(ARM::LR, &TRI)) + Live = false; + + // Check uses of LR. + unsigned Opcode = MI.getOpcode(); + if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR || + Opcode == ARM::SUBS_PC_LR || Opcode == ARM::tBX_RET || + Opcode == ARM::tBXNS_RET) { + // These instructions use LR, but it's not an (explicit or implicit) + // operand. + Live = true; + continue; + } + if (MI.readsRegister(ARM::LR, &TRI)) + Live = true; + } + return !Live; +} + outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( std::vector &RepeatedSequenceLocs) const { outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; @@ -5755,8 +5786,15 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( for (outliner::Candidate &C : RepeatedSequenceLocs) { C.initLRU(TRI); - // Is LR available? If so, we don't need a save. - if (C.LRU.available(ARM::LR)) { + // LR liveness is overestimated in return blocks, unless they end with a + // tail call. + const auto Last = C.getMBB()->rbegin(); + const bool LRIsAvailable = + C.getMBB()->isReturnBlock() && !Last->isCall() + ? isLRAvailable(TRI, Last, + (MachineBasicBlock::reverse_iterator)C.front()) + : C.LRU.available(ARM::LR); + if (LRIsAvailable) { FrameID = MachineOutlinerNoLRSave; NumBytesNoStackCalls += Costs.CallNoLRSave; C.setCallInfo(MachineOutlinerNoLRSave, Costs.CallNoLRSave); @@ -5867,7 +5905,13 @@ bool ARMBaseInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); })) Flags |= MachineOutlinerMBBFlags::HasCalls; - if (!LRU.available(ARM::LR)) + // LR liveness is overestimated in return blocks. + + bool LRIsAvailable = + MBB.isReturnBlock() && !MBB.back().isCall() + ? isLRAvailable(getRegisterInfo(), MBB.rbegin(), MBB.rend()) + : LRU.available(ARM::LR); + if (!LRIsAvailable) Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere; return true; diff --git a/llvm/test/CodeGen/ARM/machine-outliner-return-1.ll b/llvm/test/CodeGen/ARM/machine-outliner-return-1.ll new file mode 100644 index 000000000000..30d2c151b517 --- /dev/null +++ b/llvm/test/CodeGen/ARM/machine-outliner-return-1.ll @@ -0,0 +1,57 @@ +; RUN: llc --verify-machineinstrs %s -o - | FileCheck %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-unknown-unknown-eabi" + +declare dso_local i32 @h0(i32, i32) local_unnamed_addr #1 + +define dso_local i32 @f(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 { +entry: + %add = add nsw i32 %a, 1 + %sub = add nsw i32 %b, -1 + %call = tail call i32 @h0(i32 %add, i32 %sub) #0 + %add1 = add nsw i32 %c, %b + %mul = shl nsw i32 %call, 1 + %add2 = add nsw i32 %mul, %add1 + %sub3 = sub nsw i32 %c, %d + %mul4 = mul nsw i32 %add2, %sub3 + %sub5 = sub nsw i32 %call, %add1 + %div = sdiv i32 %mul4, %sub5 + %add6 = add nsw i32 %d, %c + %mul7 = mul nsw i32 %div, %add6 + %add8 = add nsw i32 %mul7, 1 + ret i32 %add8 +} +; CHECK-LABEL: f: +; CHECK: bl h0 +; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: pop {r4, r5, r6, pc} + + +define dso_local i32 @g(i32 %a, i32 %b, i32 %c, i32 %d) local_unnamed_addr #0 { +entry: + %sub = add nsw i32 %a, -1 + %add = add nsw i32 %b, 1 + %call = tail call i32 @h0(i32 %sub, i32 %add) #0 + %add1 = add nsw i32 %c, %b + %mul = shl nsw i32 %call, 1 + %add2 = add nsw i32 %mul, %add1 + %sub3 = sub nsw i32 %c, %d + %mul4 = mul nsw i32 %add2, %sub3 + %sub5 = sub nsw i32 %call, %add1 + %div = sdiv i32 %mul4, %sub5 + %add6 = add nsw i32 %d, %c + %mul7 = mul nsw i32 %div, %add6 + %add8 = add nsw i32 %mul7, 2 + ret i32 %add8 +} +; CHECK-LABEL: g: +; CHECK: bl h0 +; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: pop {r4, r5, r6, pc} + + +attributes #0 = { minsize nounwind optsize } +attributes #1 = { minsize optsize } diff --git a/llvm/test/CodeGen/ARM/machine-outliner-return-2.ll b/llvm/test/CodeGen/ARM/machine-outliner-return-2.ll new file mode 100644 index 000000000000..dc695f5e928f --- /dev/null +++ b/llvm/test/CodeGen/ARM/machine-outliner-return-2.ll @@ -0,0 +1,52 @@ +; RUN: llc -verify-machineinstrs %s -o - | FileCheck %s +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-unknown-unknown-eabi" + +declare dso_local i32 @t(i32) local_unnamed_addr #0 + +define dso_local i32 @f(i32 %a, i32 %b, i32 %c) local_unnamed_addr #0 { +entry: + %mul = mul nsw i32 %a, 3 + %add = add nsw i32 %mul, 1 + %sub = add nsw i32 %b, -1 + %div = sdiv i32 %add, %sub + %sub1 = sub nsw i32 %a, %c + %div2 = sdiv i32 %div, %sub1 + %mul3 = mul nsw i32 %div2, %b + %add4 = add nsw i32 %mul3, 1 + %call = tail call i32 @t(i32 %add4) #0 + ret i32 %call +} +; CHECK-LABEL: f: +; CHECK: str lr, [sp, #-8]! +; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: ldr lr, [sp], #8 +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: b t + +define dso_local i32 @g(i32 %a, i32 %b, i32 %c) local_unnamed_addr #0 { +entry: + %mul = mul nsw i32 %a, 3 + %add = add nsw i32 %mul, 1 + %sub = add nsw i32 %b, -1 + %div = sdiv i32 %add, %sub + %sub1 = sub nsw i32 %a, %c + %div2 = sdiv i32 %div, %sub1 + %mul3 = mul nsw i32 %div2, %b + %add4 = add nsw i32 %mul3, 3 + %call = tail call i32 @t(i32 %add4) #0 + ret i32 %call +} + +; CHECK-LABEL: g: +; CHECK: str lr, [sp, #-8]! +; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: ldr lr, [sp], #8 +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: b t + +; CHECK-LABEL: OUTLINED_FUNCTION_0: +; CHECK-NOT: lr +; CHECK: bx lr + +attributes #0 = { minsize nounwind optsize }