[mips] Use register scavenging with MSA.

MSA stores and loads to the stack are more likely to require an emergency GPR spill slot due to the smaller offsets available with those instructions. Handle this by overestimating the size of the stack by determining the largest offset presuming that all callee save registers are spilled and accounting of incoming arguments when determining whether an emergency spill slot is required. Reviewers: atanasyan Differential Revision: https://reviews.llvm.org/D39056 llvm-svn: 317204
2017-11-02 12:47:22 +00:00 · 2017-11-02 12:47:22 +00:00 · 725acb2d91
parent 0e142499a9
commit 725acb2d91
4 changed files with 272 additions and 41 deletions
--- a/llvm/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsFrameLowering.cpp
@ -107,38 +107,31 @@ bool MipsFrameLowering::hasBP(const MachineFunction &MF) const {
  return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
 }

+// Estimate the size of the stack, including the incoming arguments. We need to
+// account for register spills, local objects, reserved call frame and incoming
+// arguments. This is required to determine the largest possible positive offset
+// from $sp so that it can be determined if an emergency spill slot for stack
+// addresses is required.
 uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();

-  int64_t Offset = 0;
+  int64_t Size = 0;

-  // Iterate over fixed sized objects.
+  // Iterate over fixed sized objects which are incoming arguments.
  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
-    Offset = std::max(Offset, -MFI.getObjectOffset(I));
+    if (MFI.getObjectOffset(I) > 0)
+      Size += MFI.getObjectSize(I);

  // Conservatively assume all callee-saved registers will be saved.
  for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
-    unsigned Size = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
-    Offset = alignTo(Offset + Size, Size);
+    unsigned RegSize = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
+    Size = alignTo(Size + RegSize, RegSize);
  }

-  unsigned MaxAlign = MFI.getMaxAlignment();
-
-  // Check that MaxAlign is not zero if there is a stack object that is not a
-  // callee-saved spill.
-  assert(!MFI.getObjectIndexEnd() || MaxAlign);
-
-  // Iterate over other objects.
-  for (unsigned I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I)
-    Offset = alignTo(Offset + MFI.getObjectSize(I), MaxAlign);
-
-  // Call frame.
-  if (MFI.adjustsStack() && hasReservedCallFrame(MF))
-    Offset = alignTo(Offset + MFI.getMaxCallFrameSize(),
-                     std::max(MaxAlign, getStackAlignment()));
-
-  return alignTo(Offset, getStackAlignment());
+  // Get the size of the rest of the frame objects and any possible reserved
+  // call frame, accounting for alignment.
+  return Size + MFI.estimateStackSize(MF);
 }

 // Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
--- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@ -893,10 +893,12 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
  }

  // Set scavenging frame index if necessary.
-  uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() +
-    estimateStackSize(MF);
+  uint64_t MaxSPOffset = estimateStackSize(MF);

-  if (isInt<16>(MaxSPOffset))
+  // MSA has a minimum offset of 10 bits signed. If there is a variable
+  // sized object on the stack, the estimation cannot account for it.
+  if (isIntN(STI.hasMSA() ? 10 : 16, MaxSPOffset) &&
+      !MF.getFrameInfo().hasVarSizedObjects())
    return;

  const TargetRegisterClass &RC =
--- a/llvm/test/CodeGen/Mips/msa/emergency-spill.mir
+++ b/llvm/test/CodeGen/Mips/msa/emergency-spill.mir
@ -0,0 +1,221 @@
+# RUN: llc %s -start-after=shrink-wrap -march=mips64 -mcpu=mips64r6 -mattr=+fp64,+msa -o /dev/null
+
+# Test that estimated size of the stack leads to the creation of an emergency
+# spill when MSA is in use. Previously, this test case would fail during
+# register scavenging due to the lack of a spill slot.
+--- |
+  define inreg { i64, i64 } @test(i64 inreg %a.coerce0, i64 inreg %a.coerce1, i64 inreg %b.coerce0, i64 inreg %b.coerce1, i32 signext %c) #0 {
+  entry:
+    %retval = alloca <16 x i8>, align 16
+    %a = alloca <16 x i8>, align 16
+    %b = alloca <16 x i8>, align 16
+    %a.addr = alloca <16 x i8>, align 16
+    %b.addr = alloca <16 x i8>, align 16
+    %c.addr = alloca i32, align 4
+    %g = alloca <16 x i8>*, align 8
+    %d = alloca i8*, align 8
+    %0 = bitcast <16 x i8>* %a to { i64, i64 }*
+    %1 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 0
+    store i64 %a.coerce0, i64* %1, align 16
+    %2 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %0, i32 0, i32 1
+    store i64 %a.coerce1, i64* %2, align 8
+    %a1 = load <16 x i8>, <16 x i8>* %a, align 16
+    %3 = bitcast <16 x i8>* %b to { i64, i64 }*
+    %4 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 0
+    store i64 %b.coerce0, i64* %4, align 16
+    %5 = getelementptr inbounds { i64, i64 }, { i64, i64 }* %3, i32 0, i32 1
+    store i64 %b.coerce1, i64* %5, align 8
+    %b2 = load <16 x i8>, <16 x i8>* %b, align 16
+    store <16 x i8> %a1, <16 x i8>* %a.addr, align 16
+    store <16 x i8> %b2, <16 x i8>* %b.addr, align 16
+    store i32 %c, i32* %c.addr, align 4
+    %6 = alloca i8, i64 6400, align 16
+    %7 = bitcast i8* %6 to <16 x i8>*
+    store <16 x i8>* %7, <16 x i8>** %g, align 8
+    %8 = load <16 x i8>*, <16 x i8>** %g, align 8
+    call void @h(<16 x i8>* %b.addr, <16 x i8>* %8)
+    %9 = load <16 x i8>*, <16 x i8>** %g, align 8
+    %10 = bitcast <16 x i8>* %9 to i8*
+    store i8* %10, i8** %d, align 8
+    %11 = load <16 x i8>, <16 x i8>* %a.addr, align 16
+    %12 = load i8*, i8** %d, align 8
+    %arrayidx = getelementptr inbounds i8, i8* %12, i64 0
+    %13 = load i8, i8* %arrayidx, align 1
+    %conv = sext i8 %13 to i32
+    %14 = call <16 x i8> @llvm.mips.fill.b(i32 %conv)
+    %add = add <16 x i8> %11, %14
+    %15 = load i8*, i8** %d, align 8
+    %arrayidx3 = getelementptr inbounds i8, i8* %15, i64 1
+    %16 = load i8, i8* %arrayidx3, align 1
+    %conv4 = sext i8 %16 to i32
+    %17 = call <16 x i8> @llvm.mips.fill.b(i32 %conv4)
+    %add5 = add <16 x i8> %add, %17
+    %18 = load <16 x i8>, <16 x i8>* %b.addr, align 16
+    %add6 = add <16 x i8> %18, %add5
+    store <16 x i8> %add6, <16 x i8>* %b.addr, align 16
+    %19 = load <16 x i8>, <16 x i8>* %b.addr, align 16
+    store <16 x i8> %19, <16 x i8>* %retval, align 16
+    %20 = bitcast <16 x i8>* %retval to { i64, i64 }*
+    %21 = load { i64, i64 }, { i64, i64 }* %20, align 16
+    ret { i64, i64 } %21
+  }
+
+  declare void @h(<16 x i8>*, <16 x i8>*)
+
+  declare <16 x i8> @llvm.mips.fill.b(i32)
+
+  declare void @llvm.stackprotector(i8*, i8**)
+
+...
+---
+name:            test
+alignment:       3
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '%a0_64', virtual-reg: '' }
+  - { reg: '%a1_64', virtual-reg: '' }
+  - { reg: '%a2_64', virtual-reg: '' }
+  - { reg: '%a3_64', virtual-reg: '' }
+  - { reg: '%t0_64', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    16
+  adjustsStack:    false
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 4294967295
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: retval, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 1, name: a, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 2, name: b, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 3, name: a.addr, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 4, name: b.addr, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 5, name: c.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 6, name: g, type: default, offset: 0, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 7, name: d, type: default, offset: 0, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 8, name: '', type: default, offset: 0, size: 6400,
+      alignment: 16, stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: %a0_64, %a1_64, %a2_64, %a3_64, %t0_64
+
+    SD killed %a0_64, %stack.1.a, 0 :: (store 8 into %ir.1, align 16)
+    SD killed %a1_64, %stack.1.a, 8 :: (store 8 into %ir.2)
+    %w0 = LD_B %stack.1.a, 0 :: (dereferenceable load 16 from %ir.a)
+    SD killed %a2_64, %stack.2.b, 0 :: (store 8 into %ir.4, align 16)
+    SD killed %a3_64, %stack.2.b, 8 :: (store 8 into %ir.5)
+    %w1 = LD_B %stack.2.b, 0 :: (dereferenceable load 16 from %ir.b)
+    ST_B killed %w0, %stack.3.a.addr, 0 :: (store 16 into %ir.a.addr)
+    ST_B killed %w1, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr)
+    SW %t0, %stack.5.c.addr, 0, implicit killed %t0_64 :: (store 4 into %ir.c.addr)
+    %at_64 = LEA_ADDiu64 %stack.8, 0
+    SD killed %at_64, %stack.6.g, 0 :: (store 8 into %ir.g)
+    %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp
+    %a0_64 = LEA_ADDiu64 %stack.4.b.addr, 0
+    JAL @h, csr_n64, implicit-def dead %ra, implicit %a0_64, implicit %a1_64, implicit-def %sp
+    ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp
+    %at_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %v0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %v1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %a0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %a1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %a2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %a3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %s0_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %s1_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %s2_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %s3_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %s4_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %s5_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %s6_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %s7_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t8_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %t9_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %ra_64 = LD %stack.6.g, 0 :: (dereferenceable load 8 from %ir.g)
+    %w0 = LD_B %stack.3.a.addr, 0 :: (dereferenceable load 16 from %ir.a.addr)
+    SD %at_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %v0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %v1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %a0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %a1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %a2_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %a3_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t2_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t3_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t4_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t5_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t6_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t7_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s0_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s1_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s2_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s3_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s4_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s5_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s6_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %s7_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t8_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %t9_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    SD %ra_64, %stack.7.d, 0 :: (store 8 into %ir.d)
+    %at_64 = LD %stack.7.d, 0 :: (dereferenceable load 8 from %ir.d)
+    %v0 = LB %at_64, 0 :: (load 1 from %ir.arrayidx)
+    %w1 = FILL_B killed %v0
+    %w0 = ADDV_B killed %w0, killed %w1
+    %at = LB killed %at_64, 1 :: (load 1 from %ir.arrayidx3)
+    %w1 = FILL_B killed %at
+    %w0 = ADDV_B killed %w0, killed %w1
+    %w1 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+    %w0 = ADDV_B killed %w1, killed %w0
+    ST_B killed %w0, %stack.4.b.addr, 0 :: (store 16 into %ir.b.addr)
+    %w0 = LD_B %stack.4.b.addr, 0 :: (dereferenceable load 16 from %ir.b.addr)
+    ST_B killed %w0, %stack.0.retval, 0 :: (store 16 into %ir.retval)
+    %v0_64 = LD %stack.0.retval, 0 :: (dereferenceable load 8 from %ir.20, align 16)
+    %v1_64 = LD %stack.0.retval, 8 :: (dereferenceable load 8 from %ir.20 + 8, align 16)
+    RetRA implicit %v0_64, implicit %v1_64
+
+...
--- a/llvm/test/CodeGen/Mips/msa/frameindex.ll
+++ b/llvm/test/CodeGen/Mips/msa/frameindex.ll
@ -18,7 +18,8 @@ define void @loadstore_v16i8_just_under_simm10() nounwind {
  ; MIPS32-AE: loadstore_v16i8_just_under_simm10:

  %1 = alloca <16 x i8>
-  %2 = alloca [496 x i8] ; Push the frame right up to 512 bytes
+  %2 = alloca [492 x i8] ; Push the frame--acounting for the emergency spill
+                         ; slot--right up to 512 bytes

  %3 = load volatile <16 x i8>, <16 x i8>* %1
  ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 496($sp)
@ -33,7 +34,8 @@ define void @loadstore_v16i8_just_over_simm10() nounwind {
  ; MIPS32-AE: loadstore_v16i8_just_over_simm10:

  %1 = alloca <16 x i8>
-  %2 = alloca [497 x i8] ; Push the frame just over 512 bytes
+  %2 = alloca [497 x i8] ; Push the frame--acounting for the emergency spill
+                         ; slot--right up to 512 bytes

  %3 = load volatile <16 x i8>, <16 x i8>* %1
  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 512
@ -50,7 +52,8 @@ define void @loadstore_v16i8_just_under_simm16() nounwind {
  ; MIPS32-AE: loadstore_v16i8_just_under_simm16:

  %1 = alloca <16 x i8>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill
+                           ; slot--right up to 32768 bytes

  %3 = load volatile <16 x i8>, <16 x i8>* %1
  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@ -69,7 +72,8 @@ define void @loadstore_v16i8_just_over_simm16() nounwind {
  ; MIPS32-AE: loadstore_v16i8_just_over_simm16:

  %1 = alloca <16 x i8>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill
+                           ; slot--just over 32768 bytes

  %3 = load volatile <16 x i8>, <16 x i8>* %1
  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@ -121,7 +125,8 @@ define void @loadstore_v8i16_just_under_simm10() nounwind {
  ; MIPS32-AE: loadstore_v8i16_just_under_simm10:

  %1 = alloca <8 x i16>
-  %2 = alloca [1008 x i8] ; Push the frame right up to 1024 bytes
+  %2 = alloca [1004 x i8] ; Push the frame--acounting for the emergency spill
+                          ; slot--right up to 1024 bytes

  %3 = load volatile <8 x i16>, <8 x i16>* %1
  ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], 1008($sp)
@ -136,7 +141,8 @@ define void @loadstore_v8i16_just_over_simm10() nounwind {
  ; MIPS32-AE: loadstore_v8i16_just_over_simm10:

  %1 = alloca <8 x i16>
-  %2 = alloca [1009 x i8] ; Push the frame just over 1024 bytes
+  %2 = alloca [1009 x i8] ; Push the frame--acounting for the emergency spill
+                          ; slot--just over 1024 bytes

  %3 = load volatile <8 x i16>, <8 x i16>* %1
  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 1024
@ -153,7 +159,8 @@ define void @loadstore_v8i16_just_under_simm16() nounwind {
  ; MIPS32-AE: loadstore_v8i16_just_under_simm16:

  %1 = alloca <8 x i16>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill
+                           ; slot--right up to 32768 bytes

  %3 = load volatile <8 x i16>, <8 x i16>* %1
  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@ -172,7 +179,8 @@ define void @loadstore_v8i16_just_over_simm16() nounwind {
  ; MIPS32-AE: loadstore_v8i16_just_over_simm16:

  %1 = alloca <8 x i16>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill
+                           ; slot--just over 32768 bytes

  %3 = load volatile <8 x i16>, <8 x i16>* %1
  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@ -224,7 +232,8 @@ define void @loadstore_v4i32_just_under_simm10() nounwind {
  ; MIPS32-AE: loadstore_v4i32_just_under_simm10:

  %1 = alloca <4 x i32>
-  %2 = alloca [2032 x i8] ; Push the frame right up to 2048 bytes
+  %2 = alloca [2028 x i8] ; Push the frame--acounting for the emergency spill
+                          ; slot--right up to 2048 bytes

  %3 = load volatile <4 x i32>, <4 x i32>* %1
  ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], 2032($sp)
@ -239,7 +248,8 @@ define void @loadstore_v4i32_just_over_simm10() nounwind {
  ; MIPS32-AE: loadstore_v4i32_just_over_simm10:

  %1 = alloca <4 x i32>
-  %2 = alloca [2033 x i8] ; Push the frame just over 2048 bytes
+  %2 = alloca [2033 x i8] ; Push the frame--acounting for the emergency spill
+                          ; slot--just over 2048 bytes

  %3 = load volatile <4 x i32>, <4 x i32>* %1
  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 2048
@ -256,7 +266,8 @@ define void @loadstore_v4i32_just_under_simm16() nounwind {
  ; MIPS32-AE: loadstore_v4i32_just_under_simm16:

  %1 = alloca <4 x i32>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill
+                           ; slot-- right up to 32768 bytes

  %3 = load volatile <4 x i32>, <4 x i32>* %1
  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@ -275,7 +286,8 @@ define void @loadstore_v4i32_just_over_simm16() nounwind {
  ; MIPS32-AE: loadstore_v4i32_just_over_simm16:

  %1 = alloca <4 x i32>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill
+                           ; slot--just over 32768 bytes

  %3 = load volatile <4 x i32>, <4 x i32>* %1
  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@ -327,8 +339,8 @@ define void @loadstore_v2i64_just_under_simm10() nounwind {
  ; MIPS32-AE: loadstore_v2i64_just_under_simm10:

  %1 = alloca <2 x i64>
-  %2 = alloca [4080 x i8] ; Push the frame right up to 4096 bytes
-
+  %2 = alloca [4076 x i8] ; Push the frame--acounting for the emergency spill
+                          ; slot--right up to 4096 bytes
  %3 = load volatile <2 x i64>, <2 x i64>* %1
  ; MIPS32-AE: ld.d [[R1:\$w[0-9]+]], 4080($sp)
  store volatile <2 x i64> %3, <2 x i64>* %1
@ -342,7 +354,8 @@ define void @loadstore_v2i64_just_over_simm10() nounwind {
  ; MIPS32-AE: loadstore_v2i64_just_over_simm10:

  %1 = alloca <2 x i64>
-  %2 = alloca [4081 x i8] ; Push the frame just over 4096 bytes
+  %2 = alloca [4081 x i8] ; Push the frame--acounting for the emergency spill
+                          ; slot--just over 4096 bytes

  %3 = load volatile <2 x i64>, <2 x i64>* %1
  ; MIPS32-AE: addiu [[BASE:\$([0-9]+|gp)]], $sp, 4096
@ -359,7 +372,8 @@ define void @loadstore_v2i64_just_under_simm16() nounwind {
  ; MIPS32-AE: loadstore_v2i64_just_under_simm16:

  %1 = alloca <2 x i64>
-  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+  %2 = alloca [32752 x i8] ; Push the frame--acounting for the emergency spill
+                           ; slot--right up to 32768 bytes

  %3 = load volatile <2 x i64>, <2 x i64>* %1
  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768
@ -378,7 +392,8 @@ define void @loadstore_v2i64_just_over_simm16() nounwind {
  ; MIPS32-AE: loadstore_v2i64_just_over_simm16:

  %1 = alloca <2 x i64>
-  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+  %2 = alloca [32753 x i8] ; Push the frame--acounting for the emergency spill
+                           ; slot--just over 32768 bytes

  %3 = load volatile <2 x i64>, <2 x i64>* %1
  ; MIPS32-AE: ori [[R2:\$([0-9]+|gp)]], $zero, 32768