From b9ed8ebe0e2ffa803b0bda60f9bbc9bb26f95000 Mon Sep 17 00:00:00 2001
From: Tomas Matheson
Date: Wed, 20 Jan 2021 15:55:26 +0000
Subject: [PATCH] [ARM][RegisterScavenging] Don't consider LR liveout if it is
 not reloaded

https://bugs.llvm.org/show_bug.cgi?id=48232

When PrologEpilogInserter saves callee-saved registers to the stack, LR
is saved but never reloaded: its saved value is instead loaded directly
into PC in the epilogue. This was not taken into account when
determining whether each callee-saved register was liveout for the
block. When frame index elimination inserts virtual registers and the
register scavenger tries to scavenge LR for one of them, it considers
LR liveout and tries to spill it again. However, there is no emergency
spill slot to use, and it fails with an error:

    fatal error: error in backend: Error while trying to spill LR from class GPR:
    Cannot scavenge register without an emergency spill slot!

This patch prevents any callee-saved register that is not reloaded
(including LR) from being marked liveout. Such registers are therefore
available to scavenge without requiring an extra spill.
---
 llvm/lib/CodeGen/LiveRegUnits.cpp             |  13 +-
 llvm/test/CodeGen/ARM/scavenge-lr.mir         | 221 ++++++++++++++++++
 .../test/CodeGen/Thumb2/mve-multivec-spill.ll |   8 +-
 3 files changed, 236 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/scavenge-lr.mir

diff --git a/llvm/lib/CodeGen/LiveRegUnits.cpp b/llvm/lib/CodeGen/LiveRegUnits.cpp
index ea2075bc139d..d8d8bd5d61a2 100644
--- a/llvm/lib/CodeGen/LiveRegUnits.cpp
+++ b/llvm/lib/CodeGen/LiveRegUnits.cpp
@@ -81,8 +81,17 @@ static void addBlockLiveIns(LiveRegUnits &LiveUnits,
 static void addCalleeSavedRegs(LiveRegUnits &LiveUnits,
                                const MachineFunction &MF) {
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR)
-    LiveUnits.addReg(*CSR);
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR) {
+    const unsigned N = *CSR;
+
+    const auto &CSI = MFI.getCalleeSavedInfo();
+    auto Info =
+        llvm::find_if(CSI, [N](auto Info) { return Info.getReg() == N; });
+    // If we have no info for this callee-saved register, assume it is liveout.
+    if (Info == CSI.end() || Info->isRestored())
+      LiveUnits.addReg(N);
+  }
 }
 
 void LiveRegUnits::addPristines(const MachineFunction &MF) {
diff --git a/llvm/test/CodeGen/ARM/scavenge-lr.mir b/llvm/test/CodeGen/ARM/scavenge-lr.mir
new file mode 100644
index 000000000000..a2296c12eb60
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/scavenge-lr.mir
@@ -0,0 +1,221 @@
+# RUN: llc -mtriple=thumbv7-unknown-linux-android30 -run-pass=prologepilog -verify-machineinstrs %s -o - | FileCheck %s
+
+# When saving and restoring callee-saved registers, LR is saved but not
+# restored, because its saved value is reloaded directly into PC. Therefore it
+# should be available to scavenge without requiring an emergency spill slot.
+
+# Used to result in
+#   LLVM ERROR: Error while trying to spill LR from class GPR: Cannot scavenge register without an emergency spill slot!
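+#
+# With this change, addCalleeSavedRegs() consults MachineFrameInfo's
+# CalleeSavedInfo and skips any callee-saved register whose isRestored() flag
+# is false. LR's entry is not restored because its saved value is popped
+# straight into PC, so LR is no longer treated as liveout and is free for the
+# scavenger.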
+
+# Check that LR is considered live in
+# CHECK: liveins: {{.*}}$lr
+
+# Check that LR is saved to the stack
+# CHECK: frame-setup t2STMDB_UPD {{.*}} killed $lr
+# CHECK: frame-setup CFI_INSTRUCTION offset $lr,
+
+# Check that LR was successfully scavenged somewhere in the function
+# CHECK: $lr = t2ADDri
+# CHECK: VSTMQIA $q11, killed $lr
+
+# Check that LR is not restored at the end of the function
+# CHECK-NOT: $lr = frame-destroy
+# CHECK-NOT: frame-destroy VLDMDIA_UPD {{.*}} def $lr
+# CHECK-NOT: frame-destroy t2LDMIA_RET {{.*}} def $lr
+# CHECK: frame-destroy t2LDMIA_RET {{.*}} def $pc
+
+--- |
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+  %S = type { [32 x i8] }
+
+  define void @f(%S* %arg) {
+  entry:
+    %ppp..sroa_idx = getelementptr inbounds %S, %S* %arg, i32 -8
+    %ppp..sroa_cast248 = bitcast %S* %ppp..sroa_idx to <8 x float>*
+    %ppp.copyload = load <8 x float>, <8 x float>* %ppp..sroa_cast248, align 32
+
+    %xxx..sroa_idx = getelementptr inbounds %S, %S* %arg, i32 -5
+    %xxx..sroa_cast248 = bitcast %S* %xxx..sroa_idx to <8 x float>*
+    %xxx.copyload = load <8 x float>, <8 x float>* %xxx..sroa_cast248, align 32
+
+    %yyy..sroa_idx = getelementptr inbounds %S, %S* %arg, i32 -2
+    %yyy..sroa_cast244 = bitcast %S* %yyy..sroa_idx to <8 x float>*
+    %yyy.copyload = load <8 x float>, <8 x float>* %yyy..sroa_cast244, align 32
+
+    %zzz..sroa_idx = getelementptr inbounds %S, %S* %arg, i32 -7
+    %zzz..sroa_cast241 = bitcast %S* %zzz..sroa_idx to <8 x float>*
+    %zzz.copyload = load <8 x float>, <8 x float>* %zzz..sroa_cast241, align 32
+
+    %www..sroa_idx = getelementptr inbounds %S, %S* %arg, i32 -4
+    %www..sroa_cast238 = bitcast %S* %www..sroa_idx to <8 x float>*
+    %www.copyload = load <8 x float>, <8 x float>* %www..sroa_cast238, align 32
+
+    %uuu..sroa_idx = getelementptr inbounds %S, %S* %arg, i32 1
+    %uuu..sroa_cast235 = bitcast %S* %uuu..sroa_idx to <8 x float>*
+    %uuu.copyload = load <8 x float>, <8 x float>* %uuu..sroa_cast235, align 32
+
+    %vvv..sroa_idx = getelementptr inbounds %S, %S* %arg, i32 -6
+    %vvv..sroa_cast230 = bitcast %S* %vvv..sroa_idx to <8 x float>*
+    %vvv.copyload = load <8 x float>, <8 x float>* %vvv..sroa_cast230, align 32
+
+    %ttt..sroa_idx = getelementptr inbounds %S, %S* %arg, i32 -3
+    %ttt..sroa_cast226 = bitcast %S* %ttt..sroa_idx to <8 x float>*
+    %ttt.copyload = load <8 x float>, <8 x float>* %ttt..sroa_cast226, align 32
+
+    %sss..sroa_cast223 = bitcast %S* %arg to <8 x float>*
+    %sss.copyload = load <8 x float>, <8 x float>* %sss..sroa_cast223, align 32
+
+    %mul.i = fmul <8 x float> %ppp.copyload, %www.copyload
+    %mul.i185 = fmul <8 x float> %xxx.copyload, %uuu.copyload
+    %mul.i179 = fmul <8 x float> %mul.i185, %vvv.copyload
+    %mul.i173 = fmul <8 x float> %mul.i179, %ttt.copyload
+    %mul.i167 = fmul <8 x float> %zzz.copyload, %mul.i173
+    %add.i = fadd <8 x float> %mul.i, %mul.i167
+    %div.i = fdiv <8 x float> zeroinitializer, %add.i
+    %mul.i153 = fmul <8 x float> %uuu.copyload, %div.i
+
+    store <8 x float> %mul.i153, <8 x float>* %ppp..sroa_cast248, align 32
+
+    %mul.i147 = fmul <8 x float> %uuu.copyload, %vvv.copyload
+    %mul.i141 = fmul <8 x float> %zzz.copyload, %sss.copyload
+    %mul.i135 = fmul <8 x float> %mul.i141, %div.i
+    %sub.i129 = fsub <8 x float> %mul.i147, %mul.i135
+
+    store <8 x float> %sub.i129, <8 x float>* %zzz..sroa_cast241, align 32
+    store <8 x float> %div.i, <8 x float>* %vvv..sroa_cast230, align 32
+    store <8 x float> %div.i, <8 x float>* %xxx..sroa_cast248, align 32
+
+    %mul.i123 = fmul <8 x float> %yyy.copyload, %vvv.copyload
+    %mul.i117 = fmul <8 x float> %mul.i123, %div.i
+    %sub.i111 = fsub <8 x float> %sss.copyload, %mul.i117
+    store <8 x float> %sub.i111, <8 x float>* %www..sroa_cast238, align 32
+
+    %mul.i105 = fmul <8 x float> %ppp.copyload, %ttt.copyload
+    %mul.i99 = fmul <8 x float> %mul.i105, %div.i
+    %sub.i93 = fsub <8 x float> %xxx.copyload, %mul.i99
+    store <8 x float> %sub.i93, <8 x float>* %ttt..sroa_cast226, align 32
+
+    %mul.i81 = fmul <8 x float> %yyy.copyload, %www.copyload
+    %mul.i75 = fmul <8 x float> %mul.i81, %div.i
+    %sub.i = fsub <8 x float> %mul.i185, %mul.i75
+    store <8 x float> %sub.i, <8 x float>* %yyy..sroa_cast244, align 32
+
+    ret void
+  }
+...
+---
+name: f
+alignment: 2
+tracksRegLiveness: true
+liveins:
+  - { reg: '$r0' }
+frameInfo:
+  maxAlignment: 16
+  maxCallFrameSize: 0
+stack:
+  - { id: 0, type: spill-slot, size: 16, alignment: 16 }
+  - { id: 1, type: spill-slot, size: 16, alignment: 16 }
+  - { id: 2, type: spill-slot, size: 16, alignment: 16 }
+  - { id: 3, type: spill-slot, size: 16, alignment: 16 }
+constants:
+  - id: 0
+    value: 'float 0.000000e+00'
+    alignment: 4
+machineFunctionInfo: {}
+body: |
+  bb.0.entry:
+    liveins: $r0
+    $r2 = t2SUBri $r0, 128, 14 /* CC::al */, $noreg, $noreg
+    $q8 = VLD1q64 $r2, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.www..sroa_cast238, align 32)
+    VSTMQIA $q8, %stack.0, 14 /* CC::al */, $noreg :: (store 16 into %stack.0)
+    $r12 = t2SUBri $r0, 256, 14 /* CC::al */, $noreg, $noreg
+    $q12 = VLD1q64 $r12, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.ppp..sroa_cast248, align 32)
+    $q1 = VMULfq $q12, killed $q8, 14 /* CC::al */, $noreg
+    $r3 = nuw t2ADDri $r0, 32, 14 /* CC::al */, $noreg, $noreg
+    $q10 = VLD1q64 killed $r3, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.uuu..sroa_cast235, align 32)
+    $r5 = t2SUBri $r0, 160, 14 /* CC::al */, $noreg, $noreg
+    $q15 = VLD1q64 $r5, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.xxx..sroa_cast248, align 32)
+    $q14 = VMULfq $q15, $q10, 14 /* CC::al */, $noreg
+    $r6 = t2SUBri $r0, 192, 14 /* CC::al */, $noreg, $noreg
+    $q13 = VLD1q64 $r6, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.vvv..sroa_cast230, align 32)
+    $q8 = VMULfq $q14, $q13, 14 /* CC::al */, $noreg
+    $r4 = t2SUBri $r0, 96, 14 /* CC::al */, $noreg, $noreg
+    $q6 = VLD1q64 $r4, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.ttt..sroa_cast226, align 32)
+    $q8 = VMULfq killed $q8, $q6, 14 /* CC::al */, $noreg
+    $r3 = t2SUBri $r0, 224, 14 /* CC::al */, $noreg, $noreg
+    $q5 = VLD1q64 $r3, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.zzz..sroa_cast241, align 32)
+    $q1 = VMLAfq killed $q1, $q5, killed $q8, 14 /* CC::al */, $noreg
+    $s8 = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool)
+    $s3 = VDIVS $s8, $s7, 14 /* CC::al */, $noreg, implicit-def $q0
+    $s2 = VDIVS $s8, $s6, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0
+    $s1 = VDIVS $s8, $s5, 14 /* CC::al */, $noreg, implicit killed $q0, implicit-def $q0
+    $s0 = VDIVS $s8, $s4, 14 /* CC::al */, $noreg, implicit killed $q1, implicit killed $q0, implicit-def $q0
+    $r7 = t2SUBri $r0, 64, 14 /* CC::al */, $noreg, $noreg
+    $q8 = VLD1q64 $r7, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.yyy..sroa_cast244, align 32)
+    VSTMQIA $q8, %stack.1, 14 /* CC::al */, $noreg :: (store 16 into %stack.1)
+    $q8 = VMULfq killed $q8, $q13, 14 /* CC::al */, $noreg
+    $r1 = t2ADDri $r0, 48, 14 /* CC::al */, $noreg, $noreg
+    $q9, $r0 = VLD1q32wb_fixed killed $r0, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.sss..sroa_cast223, align 32)
+    $q11 = COPY $q9
+    $q11 = VMLSfq killed $q11, killed $q8, $q0, 14 /* CC::al */, $noreg
+    $r2 = VST1q32wb_fixed killed $r2, 16, killed $q11, 14 /* CC::al */, $noreg :: (store 16 into %ir.www..sroa_cast238, align 32)
+    $q8 = VLD1q64 $r2, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.www..sroa_cast238 + 16, basealign 32)
+    VSTMQIA $q8, %stack.3, 14 /* CC::al */, $noreg :: (store 16 into %stack.3)
+    $q11 = VMULfq $q10, $q0, 14 /* CC::al */, $noreg
+    $r12 = VST1q32wb_fixed killed $r12, 16, killed $q11, 14 /* CC::al */, $noreg :: (store 16 into %ir.ppp..sroa_cast248, align 32)
+    $q11 = VLD1q64 $r12, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.ppp..sroa_cast248 + 16, basealign 32)
+    VSTMQIA $q11, %stack.2, 14 /* CC::al */, $noreg :: (store 16 into %stack.2)
+    $q1 = VMULfq killed $q11, killed $q8, 14 /* CC::al */, $noreg
+    $r5 = VST1q32wb_fixed killed $r5, 16, $q0, 14 /* CC::al */, $noreg :: (store 16 into %ir.xxx..sroa_cast248, align 32)
+    $q4 = VLD1q64 $r5, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.xxx..sroa_cast248 + 16, basealign 32)
+    $q11 = VLD1q64 killed $r1, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.uuu..sroa_cast235 + 16, basealign 32)
+    $q7 = VMULfq $q4, $q11, 14 /* CC::al */, $noreg
+    $r6 = VST1q32wb_fixed killed $r6, 16, $q0, 14 /* CC::al */, $noreg :: (store 16 into %ir.vvv..sroa_cast230, align 32)
+    $q3 = VLD1q64 $r6, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.vvv..sroa_cast230 + 16, basealign 32)
+    $q8 = VMULfq $q7, $q3, 14 /* CC::al */, $noreg
+    $q12 = VMULfq killed $q12, killed $q6, 14 /* CC::al */, $noreg
+    $q15 = VMLSfq killed $q15, killed $q12, $q0, 14 /* CC::al */, $noreg
+    $r4 = VST1q32wb_fixed killed $r4, 16, killed $q15, 14 /* CC::al */, $noreg :: (store 16 into %ir.ttt..sroa_cast226, align 32)
+    $q12 = VLD1q64 $r4, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.ttt..sroa_cast226 + 16, basealign 32)
+    $q8 = VMULfq killed $q8, $q12, 14 /* CC::al */, $noreg
+    $q9 = VMULfq killed $q5, killed $q9, 14 /* CC::al */, $noreg
+    $q10 = VMULfq killed $q10, killed $q13, 14 /* CC::al */, $noreg
+    $q10 = VMLSfq killed $q10, killed $q9, $q0, 14 /* CC::al */, $noreg
+    $r3 = VST1q32wb_fixed killed $r3, 16, killed $q10, 14 /* CC::al */, $noreg :: (store 16 into %ir.zzz..sroa_cast241, align 32)
+    $q10 = VLD1q64 $r3, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.zzz..sroa_cast241 + 16, basealign 32)
+    $q1 = VMLAfq killed $q1, $q10, killed $q8, 14 /* CC::al */, $noreg
+    $s23 = VDIVS $s8, $s7, 14 /* CC::al */, $noreg, implicit-def $q5
+    $s22 = VDIVS $s8, $s6, 14 /* CC::al */, $noreg, implicit killed $q5, implicit-def $q5
+    $s21 = VDIVS $s8, $s5, 14 /* CC::al */, $noreg, implicit killed $q5, implicit-def $q5
+    $s20 = VDIVS killed $s8, $s4, 14 /* CC::al */, $noreg, implicit killed $q1, implicit killed $q5, implicit-def $q5
+    VST1q64 killed $r5, 16, $q5, 14 /* CC::al */, $noreg :: (store 16 into %ir.xxx..sroa_cast248 + 16, basealign 32)
+    VST1q64 killed $r6, 16, $q5, 14 /* CC::al */, $noreg :: (store 16 into %ir.vvv..sroa_cast230 + 16, basealign 32)
+    $q8 = VLDMQIA %stack.0, 14 /* CC::al */, $noreg :: (load 16 from %stack.0)
+    $q9 = VLDMQIA %stack.1, 14 /* CC::al */, $noreg :: (load 16 from %stack.1)
+    $q8 = VMULfq killed $q9, killed $q8, 14 /* CC::al */, $noreg
+    $q14 = VMLSfq killed $q14, killed $q8, killed $q0, 14 /* CC::al */, $noreg
+    $r7 = VST1q32wb_fixed killed $r7, 16, killed $q14, 14 /* CC::al */, $noreg :: (store 16 into %ir.yyy..sroa_cast244, align 32)
+    $q8 = VLD1q64 $r7, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.yyy..sroa_cast244 + 16, basealign 32)
+    $q9 = VLDMQIA %stack.3, 14 /* CC::al */, $noreg :: (load 16 from %stack.3)
+    $q9 = VMULfq $q8, killed $q9, 14 /* CC::al */, $noreg
+    $q7 = VMLSfq killed $q7, killed $q9, $q5, 14 /* CC::al */, $noreg
+    VST1q64 killed $r7, 16, killed $q7, 14 /* CC::al */, $noreg :: (store 16 into %ir.yyy..sroa_cast244 + 16, basealign 32)
+    $q9 = VLDMQIA %stack.2, 14 /* CC::al */, $noreg :: (load 16 from %stack.2)
+    $q9 = VMULfq killed $q9, killed $q12, 14 /* CC::al */, $noreg
+    $q4 = VMLSfq killed $q4, killed $q9, $q5, 14 /* CC::al */, $noreg
+    VST1q64 killed $r4, 16, killed $q4, 14 /* CC::al */, $noreg :: (store 16 into %ir.ttt..sroa_cast226 + 16, basealign 32)
+    $q8 = VMULfq killed $q8, $q3, 14 /* CC::al */, $noreg
+    $q9 = VLD1q64 killed $r0, 16, 14 /* CC::al */, $noreg :: (load 16 from %ir.sss..sroa_cast223 + 16, basealign 32)
+    $q12 = COPY $q9
+    $q12 = VMLSfq killed $q12, killed $q8, $q5, 14 /* CC::al */, $noreg
+    VST1q64 killed $r2, 16, killed $q12, 14 /* CC::al */, $noreg :: (store 16 into %ir.www..sroa_cast238 + 16, basealign 32)
+    $q8 = VMULfq $q11, killed $q3, 14 /* CC::al */, $noreg
+    $q9 = VMULfq killed $q10, killed $q9, 14 /* CC::al */, $noreg
+    $q8 = VMLSfq killed $q8, killed $q9, $q5, 14 /* CC::al */, $noreg
+    VST1q64 killed $r3, 16, killed $q8, 14 /* CC::al */, $noreg :: (store 16 into %ir.zzz..sroa_cast241 + 16, basealign 32)
+    $q8 = VMULfq killed $q11, killed $q5, 14 /* CC::al */, $noreg
+    VST1q64 killed $r12, 16, killed $q8, 14 /* CC::al */, $noreg :: (store 16 into %ir.ppp..sroa_cast248 + 16, basealign 32)
+    tBX_RET 14 /* CC::al */, $noreg
+
+...
diff --git a/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll b/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll
index a24637870b31..8449b4a9989b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-multivec-spill.ll
@@ -35,18 +35,18 @@ define arm_aapcs_vfpcc void @spill_multivector(<4 x i32>* %p) {
 ; CHECK-NEXT:    vld21.32 {q4, q5}, [r0]
 ; CHECK-NEXT:    bl external_function
 ; CHECK-NEXT:    vldmia sp, {d2, d3, d4, d5} @ 32-byte Reload
-; CHECK-NEXT:    add r0, sp, #32
+; CHECK-NEXT:    add.w lr, sp, #32
 ; CHECK-NEXT:    vstrw.32 q2, [r4, #80]
 ; CHECK-NEXT:    vstrw.32 q5, [r4, #144]
 ; CHECK-NEXT:    vstrw.32 q4, [r4, #128]
 ; CHECK-NEXT:    vstrw.32 q7, [r4, #112]
 ; CHECK-NEXT:    vstrw.32 q1, [r4, #64]
-; CHECK-NEXT:    vldmia r0, {d2, d3, d4, d5} @ 32-byte Reload
-; CHECK-NEXT:    add r0, sp, #64
+; CHECK-NEXT:    vldmia lr, {d2, d3, d4, d5} @ 32-byte Reload
+; CHECK-NEXT:    add.w lr, sp, #64
 ; CHECK-NEXT:    vstrw.32 q2, [r4, #48]
 ; CHECK-NEXT:    vstrw.32 q6, [r4, #96]
 ; CHECK-NEXT:    vstrw.32 q1, [r5]
-; CHECK-NEXT:    vldmia r0, {d2, d3, d4, d5} @ 32-byte Reload
+; CHECK-NEXT:    vldmia lr, {d2, d3, d4, d5} @ 32-byte Reload
 ; CHECK-NEXT:    vstrw.32 q2, [r4, #16]
 ; CHECK-NEXT:    vstrw.32 q1, [r4]
 ; CHECK-NEXT:    add sp, #112
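
Note: the isRestored() flag consulted in LiveRegUnits.cpp above is expected to
be cleared by the target when it folds the LR reload into the return. A
minimal sketch of that target-side step, assuming a hypothetical
canFoldReturnIntoPop() helper (the real check lives in the target's
restoreCalleeSavedRegisters() implementation):

  // Sketch only: mark LR as not restored when the epilogue pops the saved
  // value directly into PC instead of reloading LR.
  for (CalleeSavedInfo &Info : CSI) {
    if (Info.getReg() == ARM::LR && canFoldReturnIntoPop(MBB)) {
      // LR's stack slot is loaded straight into PC by the return LDM/POP,
      // so LR itself is never written back; addCalleeSavedRegs() will then
      // leave it out of the liveout set.
      Info.setRestored(false);
    }
  }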