[RISCV] Fix missing emergency slots for scalable stack offsets

This patch adds an additional emergency spill slot to RVV code. This is
required because computing an RVV stack offset may itself require an
additional scratch register.
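
For a concrete illustration (all numbers invented for the example, not taken
from the patch): a frame object sitting below the scalable section of the
frame might live at sp + 544 + 50*vlenb. Since vlenb is only known at run
time, that offset cannot be folded into a load/store immediate and must first
be computed into scratch registers. A minimal C++ model of the address
arithmetic the backend has to materialize:

  #include <cstdint>

  // SP and VLENB stand in for the stack pointer and the runtime value of
  // the vlenb CSR; 544 and 50 are hypothetical layout constants.
  uint64_t scalableFrameAddress(uint64_t SP, uint64_t VLENB) {
    uint64_t Scaled = VLENB * 50; // csrr t0, vlenb; addi t1, zero, 50; mul t0, t0, t1
    return SP + 544 + Scaled;     // add t0, sp, t0; then access through t0
  }

If no GPR happens to be free at that point, the register scavenger has to
spill one into an emergency slot to free it up for this computation.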

This patch includes an optimization by @HsiangKai <kai.wang@sifive.com>
that reduces the number of registers required to compute a stack offset
from 3 to 2; without it, we would need two additional emergency spill
slots.
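
A rough scalar model of the optimized computation (a simplified C++20 sketch,
not the LLVM implementation, which builds MachineInstrs; see the second hunk
below): a power-of-two register count reuses the vlenb register for a single
shift, and any other count is materialized with an ADDI and multiplied in
place, so at most two scratch registers are live at once:

  #include <bit>
  #include <cassert>
  #include <cstdint>

  // Mirrors the strategy of RISCVInstrInfo::getVLENFactoredAmount.
  uint64_t vlenFactoredAmount(uint64_t VLENB, uint64_t Amount, bool HasStdExtM) {
    uint64_t NumOfVReg = Amount / 8;
    uint64_t VL = VLENB;                        // csrr VL, vlenb
    if (std::has_single_bit(NumOfVReg))
      return VL << std::countr_zero(NumOfVReg); // slli VL, VL, log2 (in place)
    assert(HasStdExtM && "non-power-of-two scaling needs MUL from the M extension");
    return VL * NumOfVReg;                      // addi N, x0, imm; mul VL, VL, N
  }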

Reviewed By: HsiangKai

Differential Revision: https://reviews.llvm.org/D100574
Author: Fraser Cormack
Date:   2021-04-15 17:02:20 +01:00
Parent: 86729538bd
Commit: b4a358a7ba

6 changed files with 275 additions and 37 deletions

@@ -859,6 +859,13 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
     int RegScavFI = MFI.CreateStackObject(RegInfo->getSpillSize(*RC),
                                           RegInfo->getSpillAlign(*RC), false);
     RS->addScavengingFrameIndex(RegScavFI);
+    // For RVV, scalable stack offsets require up to two scratch registers to
+    // compute the final offset. Reserve an additional emergency spill slot.
+    if (RVVStackSize != 0) {
+      int RVVRegScavFI = MFI.CreateStackObject(
+          RegInfo->getSpillSize(*RC), RegInfo->getSpillAlign(*RC), false);
+      RS->addScavengingFrameIndex(RVVRegScavFI);
+    }
   }
   if (MFI.getCalleeSavedInfo().empty() || RVFI->useSaveRestoreLibCalls(MF)) {
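
The frame-size deltas in the tests below follow from this change (my reading
of the numbers, not text from the patch): the new slot is GPR-sized, 8 bytes
on riscv64, and the fixed portion of the frame is then rounded up to its
alignment, 16 bytes by default. A hypothetical one-liner modelling that
growth:

  #include <cstdint>

  // Add one 8-byte emergency slot, then round up to the frame alignment.
  // With the default Align = 16: 528 -> 544 and 16 -> 32; the over-aligned
  // test below (andi sp, sp, -32) uses Align = 32, giving 32 -> 64.
  uint64_t grownFrameSize(uint64_t OldSize, uint64_t Align = 16) {
    return (OldSize + 8 + Align - 1) & ~(Align - 1);
  }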

@@ -1366,33 +1366,32 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
   DebugLoc DL = II->getDebugLoc();
   int64_t NumOfVReg = Amount / 8;
-  Register SizeOfVector = MRI.createVirtualRegister(&RISCV::GPRRegClass);
-  BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), SizeOfVector);
-  Register FactorRegister = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+  Register VL = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+  BuildMI(MBB, II, DL, TII->get(RISCV::PseudoReadVLENB), VL);
   assert(isInt<12>(NumOfVReg) &&
          "Expect the number of vector registers within 12-bits.");
   if (isPowerOf2_32(NumOfVReg)) {
     uint32_t ShiftAmount = Log2_32(NumOfVReg);
     if (ShiftAmount == 0)
-      return SizeOfVector;
-    BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), FactorRegister)
-        .addReg(SizeOfVector, RegState::Kill)
+      return VL;
+    BuildMI(MBB, II, DL, TII->get(RISCV::SLLI), VL)
+        .addReg(VL, RegState::Kill)
         .addImm(ShiftAmount);
   } else {
-    Register VN = MRI.createVirtualRegister(&RISCV::GPRRegClass);
-    BuildMI(MBB, II, DL, TII->get(RISCV::ADDI), VN)
+    Register N = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+    BuildMI(MBB, II, DL, TII->get(RISCV::ADDI), N)
         .addReg(RISCV::X0)
         .addImm(NumOfVReg);
     if (!MF.getSubtarget<RISCVSubtarget>().hasStdExtM())
       MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{
           MF.getFunction(),
           "M-extension must be enabled to calculate the vscaled size/offset."});
-    BuildMI(MBB, II, DL, TII->get(RISCV::MUL), FactorRegister)
-        .addReg(SizeOfVector, RegState::Kill)
-        .addReg(VN, RegState::Kill);
+    BuildMI(MBB, II, DL, TII->get(RISCV::MUL), VL)
+        .addReg(VL, RegState::Kill)
+        .addReg(N, RegState::Kill);
   }
-  return FactorRegister;
+  return VL;
 }
 
 Optional<std::pair<unsigned, unsigned>>

@@ -30,20 +30,20 @@ declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(
 define <vscale x 1 x i64> @access_fixed_and_vector_objects(i64 *%val) {
 ; RV64IV-LABEL: access_fixed_and_vector_objects:
 ; RV64IV:       # %bb.0:
-; RV64IV-NEXT:    addi sp, sp, -528
-; RV64IV-NEXT:    .cfi_def_cfa_offset 528
+; RV64IV-NEXT:    addi sp, sp, -544
+; RV64IV-NEXT:    .cfi_def_cfa_offset 544
 ; RV64IV-NEXT:    csrr a0, vlenb
 ; RV64IV-NEXT:    sub sp, sp, a0
-; RV64IV-NEXT:    addi a0, sp, 8
+; RV64IV-NEXT:    addi a0, sp, 24
 ; RV64IV-NEXT:    vl1re64.v v25, (a0)
-; RV64IV-NEXT:    ld a0, 520(sp)
-; RV64IV-NEXT:    addi a1, sp, 528
+; RV64IV-NEXT:    ld a0, 536(sp)
+; RV64IV-NEXT:    addi a1, sp, 544
 ; RV64IV-NEXT:    vl1re64.v v26, (a1)
 ; RV64IV-NEXT:    vsetvli a0, a0, e64,m1,ta,mu
 ; RV64IV-NEXT:    vadd.vv v8, v25, v26
 ; RV64IV-NEXT:    csrr a0, vlenb
 ; RV64IV-NEXT:    add sp, sp, a0
-; RV64IV-NEXT:    addi sp, sp, 528
+; RV64IV-NEXT:    addi sp, sp, 544
 ; RV64IV-NEXT:    ret
   %local = alloca i64
   %vector = alloca <vscale x 1 x i64>

@@ -250,18 +250,18 @@ define void @lmul4_and_2_x2_1() nounwind {
 define void @gpr_and_lmul1_and_2() nounwind {
 ; CHECK-LABEL: gpr_and_lmul1_and_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    addi sp, sp, -32
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    addi a1, zero, 3
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, zero, 3
-; CHECK-NEXT:    sd a0, 8(sp)
+; CHECK-NEXT:    sd a0, 24(sp)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    addi a1, zero, 3
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    addi sp, sp, 32
 ; CHECK-NEXT:    ret
   %x1 = alloca i64
   %v1 = alloca <vscale x 1 x i64>
@@ -273,21 +273,21 @@ define void @gpr_and_lmul1_and_2() {
 define void @gpr_and_lmul1_and_4() nounwind {
 ; CHECK-LABEL: gpr_and_lmul1_and_4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -32
-; CHECK-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
-; CHECK-NEXT:    addi s0, sp, 32
+; CHECK-NEXT:    addi sp, sp, -64
+; CHECK-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    addi s0, sp, 64
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    addi a1, zero, 5
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    andi sp, sp, -32
 ; CHECK-NEXT:    addi a0, zero, 3
-; CHECK-NEXT:    sd a0, 8(sp)
-; CHECK-NEXT:    addi sp, s0, -32
-; CHECK-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
-; CHECK-NEXT:    addi sp, sp, 32
+; CHECK-NEXT:    sd a0, 40(sp)
+; CHECK-NEXT:    addi sp, s0, -64
+; CHECK-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    addi sp, sp, 64
 ; CHECK-NEXT:    ret
   %x1 = alloca i64
   %v1 = alloca <vscale x 1 x i64>

@@ -0,0 +1,232 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple riscv64 -mattr=+m,+experimental-v -run-pass=prologepilog -o - \
# RUN: -verify-machineinstrs %s | FileCheck %s
--- |
  target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
  target triple = "riscv64"

  define void @spillslot() {
    ret void
  }
...
---
name: spillslot
alignment: 4
tracksRegLiveness: true
liveins:
  - { reg: '$x12', virtual-reg: '' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
  hasPatchPoint: false
  stackSize: 0
  offsetAdjustment: 0
  maxAlignment: 128
  adjustsStack: false
  hasCalls: false
  stackProtector: ''
  maxCallFrameSize: 4294967295
  cvBytesOfCalleeSavedRegisters: 0
  hasOpaqueSPAdjustment: false
  hasVAStart: false
  hasMustTailInVarArgFunc: false
  hasTailCall: false
  localFrameSize: 0
  savePoint: ''
  restorePoint: ''
fixedStack: []
stack:
  - { id: 0, name: '', type: default, offset: 0, size: 2048, alignment: 128,
      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
  - { id: 1, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
      stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true,
      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
  - { id: 2, name: '', type: spill-slot, offset: 0, size: 400, alignment: 8,
      stack-id: scalable-vector, callee-saved-register: '', callee-saved-restored: true,
      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
body: |
  ; CHECK-LABEL: name: spillslot
  ; CHECK: bb.0:
  ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; CHECK: liveins: $x12, $x1, $x9, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27
  ; CHECK: $x2 = frame-setup ADDI $x2, -2032
  ; CHECK: CFI_INSTRUCTION def_cfa_offset 2032
  ; CHECK: SD killed $x1, $x2, 2024 :: (store 8 into %stack.3)
  ; CHECK: SD killed $x8, $x2, 2016 :: (store 8 into %stack.4)
  ; CHECK: SD killed $x9, $x2, 2008 :: (store 8 into %stack.5)
  ; CHECK: SD killed $x18, $x2, 2000 :: (store 8 into %stack.6)
  ; CHECK: SD killed $x19, $x2, 1992 :: (store 8 into %stack.7)
  ; CHECK: SD killed $x20, $x2, 1984 :: (store 8 into %stack.8)
  ; CHECK: SD killed $x21, $x2, 1976 :: (store 8 into %stack.9)
  ; CHECK: SD killed $x22, $x2, 1968 :: (store 8 into %stack.10)
  ; CHECK: SD killed $x23, $x2, 1960 :: (store 8 into %stack.11)
  ; CHECK: SD killed $x24, $x2, 1952 :: (store 8 into %stack.12)
  ; CHECK: SD killed $x25, $x2, 1944 :: (store 8 into %stack.13)
  ; CHECK: SD killed $x26, $x2, 1936 :: (store 8 into %stack.14)
  ; CHECK: SD killed $x27, $x2, 1928 :: (store 8 into %stack.15)
  ; CHECK: CFI_INSTRUCTION offset $x1, -8
  ; CHECK: CFI_INSTRUCTION offset $x8, -16
  ; CHECK: CFI_INSTRUCTION offset $x9, -24
  ; CHECK: CFI_INSTRUCTION offset $x18, -32
  ; CHECK: CFI_INSTRUCTION offset $x19, -40
  ; CHECK: CFI_INSTRUCTION offset $x20, -48
  ; CHECK: CFI_INSTRUCTION offset $x21, -56
  ; CHECK: CFI_INSTRUCTION offset $x22, -64
  ; CHECK: CFI_INSTRUCTION offset $x23, -72
  ; CHECK: CFI_INSTRUCTION offset $x24, -80
  ; CHECK: CFI_INSTRUCTION offset $x25, -88
  ; CHECK: CFI_INSTRUCTION offset $x26, -96
  ; CHECK: CFI_INSTRUCTION offset $x27, -104
  ; CHECK: $x8 = frame-setup ADDI $x2, 2032
  ; CHECK: CFI_INSTRUCTION def_cfa $x8, 0
  ; CHECK: $x2 = frame-setup ADDI $x2, -272
  ; CHECK: $x10 = PseudoReadVLENB
  ; CHECK: $x11 = ADDI $x0, 51
  ; CHECK: $x10 = MUL killed $x10, killed $x11
  ; CHECK: $x2 = SUB $x2, killed $x10
  ; CHECK: $x2 = ANDI $x2, -128
  ; CHECK: dead renamable $x15 = PseudoVSETIVLI 1, 72, implicit-def $vl, implicit-def $vtype
  ; CHECK: renamable $v25 = PseudoVMV_V_X_M1 killed renamable $x12, $noreg, 16, implicit $vl, implicit $vtype
  ; CHECK: $x10 = LUI 1
  ; CHECK: $x10 = ADDIW killed $x10, -1896
  ; CHECK: $x10 = ADD $x2, killed $x10
  ; CHECK: $x11 = PseudoReadVLENB
  ; CHECK: $x12 = ADDI $x0, 50
  ; CHECK: $x11 = MUL killed $x11, killed $x12
  ; CHECK: $x10 = ADD killed $x10, killed $x11
  ; CHECK: PseudoVSPILL_M1 killed renamable $v25, killed $x10 :: (store unknown-size into %stack.1, align 8)
  ; CHECK: renamable $x1 = ADDI $x0, 255
  ; CHECK: renamable $x5 = nuw ADDI $x2, 384
  ; CHECK: renamable $x6 = ADDI $x2, 512
  ; CHECK: renamable $x7 = nuw ADDI $x2, 640
  ; CHECK: renamable $x10 = ADDI $x0, 128
  ; CHECK: renamable $x12 = nuw ADDI $x2, 256
  ; CHECK: renamable $x14 = COPY $x0
  ; CHECK: renamable $x17 = nuw ADDI $x2, 256
  ; CHECK: renamable $x18 = ADDI $x2, 1280
  ; CHECK: renamable $x19 = ADDI $x2, 1408
  ; CHECK: renamable $x20 = ADDI $x2, 1536
  ; CHECK: renamable $x21 = ADDI $x2, 1664
  ; CHECK: renamable $x22 = ADDI $x2, 1792
  ; CHECK: renamable $x23 = ADDI $x2, 1920
  ; CHECK: SD killed $x1, $x2, 8 :: (store 8 into %stack.16)
  ; CHECK: SD killed $x5, $x2, 0 :: (store 8 into %stack.17)
  ; CHECK: $x11 = LUI 1
  ; CHECK: $x11 = ADDIW killed $x11, -2048
  ; CHECK: $x24 = ADD $x2, killed $x11
  ; CHECK: renamable $x25 = ADDI $x2, 128
  ; CHECK: renamable $x26 = ADDI $x2, 128
  ; CHECK: renamable $x27 = ADDI $x0, 2
  ; CHECK: renamable $x28 = ADDI $x2, 768
  ; CHECK: renamable $x29 = ADDI $x2, 896
  ; CHECK: renamable $x30 = ADDI $x2, 1024
  ; CHECK: renamable $x31 = nuw ADDI $x2, 1152
  ; CHECK: renamable $x15 = ADDIW renamable $x14, 0
  ; CHECK: renamable $x11 = ANDI renamable $x15, 255
  ; CHECK: renamable $x13 = SLLI renamable $x11, 3
  ; CHECK: renamable $x13 = ADD renamable $x26, killed renamable $x13
  ; CHECK: renamable $x13 = LD killed renamable $x13, 0 :: (load 8)
  ; CHECK: renamable $x9 = SRAI renamable $x13, 63
  ; CHECK: renamable $x9 = SRLI killed renamable $x9, 62
  ; CHECK: renamable $x9 = ADD renamable $x13, killed renamable $x9
  ; CHECK: renamable $x9 = ANDI killed renamable $x9, -4
  ; CHECK: renamable $x16 = SUB killed renamable $x13, killed renamable $x9
  ; CHECK: dead renamable $x13 = PseudoVSETIVLI 1, 64, implicit-def $vl, implicit-def $vtype
  ; CHECK: renamable $x13 = nsw ADDI renamable $x16, -2
  ; CHECK: $x5 = LUI 1
  ; CHECK: $x9 = ADDIW killed $x5, -1896
  ; CHECK: $x9 = ADD $x2, killed $x9
  ; CHECK: $x1 = PseudoReadVLENB
  ; CHECK: $x5 = ADDI $x0, 50
  ; CHECK: $x1 = MUL killed $x1, killed $x5
  ; CHECK: $x5 = LD $x2, 0 :: (load 8 from %stack.17)
  ; CHECK: $x9 = ADD killed $x9, killed $x1
  ; CHECK: $x1 = LD $x2, 8 :: (load 8 from %stack.16)
  ; CHECK: renamable $v0 = PseudoVRELOAD_M1 killed $x9 :: (load unknown-size from %stack.1, align 8)
  ; CHECK: renamable $v0 = PseudoVSLIDEDOWN_VX_M1 undef renamable $v0, killed renamable $v0, killed renamable $x13, $noreg, 8, implicit $vl, implicit $vtype
  ; CHECK: renamable $x13 = PseudoVMV_X_S_M1 killed renamable $v0, 8, implicit $vl, implicit $vtype
  ; CHECK: BLT killed renamable $x16, renamable $x27, %bb.2
  ; CHECK: bb.1:
  ; CHECK: successors: %bb.2(0x80000000)
  ; CHECK: liveins: $x1, $x5, $x6, $x7, $x10, $x11, $x12, $x13, $x14, $x15, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $x29, $x30, $x31
  ; CHECK: renamable $x9 = COPY killed renamable $x13
  ; CHECK: PseudoBR %bb.2
  ; CHECK: bb.2:
  ; CHECK: $x10 = frame-destroy LUI 1
  ; CHECK: $x10 = frame-destroy ADDIW killed $x10, -1792
  ; CHECK: $x2 = frame-destroy SUB $x8, killed $x10
  ; CHECK: $x2 = frame-destroy ADDI $x2, 272
  ; CHECK: $x27 = LD $x2, 1928 :: (load 8 from %stack.15)
  ; CHECK: $x26 = LD $x2, 1936 :: (load 8 from %stack.14)
  ; CHECK: $x25 = LD $x2, 1944 :: (load 8 from %stack.13)
  ; CHECK: $x24 = LD $x2, 1952 :: (load 8 from %stack.12)
  ; CHECK: $x23 = LD $x2, 1960 :: (load 8 from %stack.11)
  ; CHECK: $x22 = LD $x2, 1968 :: (load 8 from %stack.10)
  ; CHECK: $x21 = LD $x2, 1976 :: (load 8 from %stack.9)
  ; CHECK: $x20 = LD $x2, 1984 :: (load 8 from %stack.8)
  ; CHECK: $x19 = LD $x2, 1992 :: (load 8 from %stack.7)
  ; CHECK: $x18 = LD $x2, 2000 :: (load 8 from %stack.6)
  ; CHECK: $x9 = LD $x2, 2008 :: (load 8 from %stack.5)
  ; CHECK: $x8 = LD $x2, 2016 :: (load 8 from %stack.4)
  ; CHECK: $x1 = LD $x2, 2024 :: (load 8 from %stack.3)
  ; CHECK: $x2 = frame-destroy ADDI $x2, 2032
  ; CHECK: PseudoRET
  bb.0:
    successors: %bb.1, %bb.2
    liveins: $x12

    dead renamable $x15 = PseudoVSETIVLI 1, 72, implicit-def $vl, implicit-def $vtype
    renamable $v25 = PseudoVMV_V_X_M1 killed renamable $x12, $noreg, 16, implicit $vl, implicit $vtype
    PseudoVSPILL_M1 killed renamable $v25, %stack.1 :: (store unknown-size into %stack.1, align 8)
    renamable $x1 = ADDI $x0, 255
    renamable $x5 = nuw ADDI %stack.0, 256
    renamable $x6 = ADDI %stack.0, 384
    renamable $x7 = nuw ADDI %stack.0, 512
    renamable $x10 = ADDI $x0, 128
    renamable $x12 = nuw ADDI %stack.0, 128
    renamable $x14 = COPY $x0
    renamable $x17 = nuw ADDI %stack.0, 128
    renamable $x18 = ADDI %stack.0, 1152
    renamable $x19 = ADDI %stack.0, 1280
    renamable $x20 = ADDI %stack.0, 1408
    renamable $x21 = ADDI %stack.0, 1536
    renamable $x22 = ADDI %stack.0, 1664
    renamable $x23 = ADDI %stack.0, 1792
    renamable $x24 = ADDI %stack.0, 1920
    renamable $x25 = ADDI %stack.0, 0
    renamable $x26 = ADDI %stack.0, 0
    renamable $x27 = ADDI $x0, 2
    renamable $x28 = ADDI %stack.0, 640
    renamable $x29 = ADDI %stack.0, 768
    renamable $x30 = ADDI %stack.0, 896
    renamable $x31 = nuw ADDI %stack.0, 1024
    renamable $x15 = ADDIW renamable $x14, 0
    renamable $x11 = ANDI renamable $x15, 255
    renamable $x13 = SLLI renamable $x11, 3
    renamable $x13 = ADD renamable $x26, killed renamable $x13
    renamable $x13 = LD killed renamable $x13, 0 :: (load 8)
    renamable $x9 = SRAI renamable $x13, 63
    renamable $x9 = SRLI killed renamable $x9, 62
    renamable $x9 = ADD renamable $x13, killed renamable $x9
    renamable $x9 = ANDI killed renamable $x9, -4
    renamable $x16 = SUB killed renamable $x13, killed renamable $x9
    dead renamable $x13 = PseudoVSETIVLI 1, 64, implicit-def $vl, implicit-def $vtype
    renamable $x13 = nsw ADDI renamable $x16, -2
    renamable $v0 = PseudoVRELOAD_M1 %stack.1 :: (load unknown-size from %stack.1, align 8)
    renamable $v0 = PseudoVSLIDEDOWN_VX_M1 undef renamable $v0, killed renamable $v0, killed renamable $x13, $noreg, 8, implicit $vl, implicit $vtype
    renamable $x13 = PseudoVMV_X_S_M1 killed renamable $v0, 8, implicit $vl, implicit $vtype
    BLT killed renamable $x16, renamable $x27, %bb.2

  bb.1:
    successors: %bb.2
    liveins: $x1, $x5, $x6, $x7, $x10, $x11, $x12, $x13, $x14, $x15, $x17, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28, $x29, $x30, $x31

    renamable $x9 = COPY killed renamable $x13
    PseudoBR %bb.2

  bb.2:
    PseudoRET
...

@@ -153,24 +153,24 @@ define void @local_var_m8() {
 define void @local_var_m2_mix_local_scalar() {
 ; RV64IV-LABEL: local_var_m2_mix_local_scalar:
 ; RV64IV:       # %bb.0:
-; RV64IV-NEXT:    addi sp, sp, -16
-; RV64IV-NEXT:    .cfi_def_cfa_offset 16
+; RV64IV-NEXT:    addi sp, sp, -32
+; RV64IV-NEXT:    .cfi_def_cfa_offset 32
 ; RV64IV-NEXT:    csrr a0, vlenb
 ; RV64IV-NEXT:    slli a0, a0, 2
 ; RV64IV-NEXT:    sub sp, sp, a0
-; RV64IV-NEXT:    lw a0, 12(sp)
+; RV64IV-NEXT:    lw a0, 28(sp)
 ; RV64IV-NEXT:    csrr a0, vlenb
 ; RV64IV-NEXT:    slli a0, a0, 1
 ; RV64IV-NEXT:    add a0, sp, a0
-; RV64IV-NEXT:    addi a0, a0, 16
+; RV64IV-NEXT:    addi a0, a0, 32
 ; RV64IV-NEXT:    vl2r.v v26, (a0)
-; RV64IV-NEXT:    addi a0, sp, 16
+; RV64IV-NEXT:    addi a0, sp, 32
 ; RV64IV-NEXT:    vl2r.v v26, (a0)
-; RV64IV-NEXT:    lw a0, 8(sp)
+; RV64IV-NEXT:    lw a0, 24(sp)
 ; RV64IV-NEXT:    csrr a0, vlenb
 ; RV64IV-NEXT:    slli a0, a0, 2
 ; RV64IV-NEXT:    add sp, sp, a0
-; RV64IV-NEXT:    addi sp, sp, 16
+; RV64IV-NEXT:    addi sp, sp, 32
 ; RV64IV-NEXT:    ret
   %local_scalar0 = alloca i32
   %local0 = alloca <vscale x 16 x i8>