[GlobalISel] Replace hard coded dynamic alloca handling with G_DYN_STACKALLOC.

This change moves the actual stack pointer manipulation into the legalizer,
available to targets via lower(). The codegen is slightly different because
we're using explicit masks instead of G_PTRMASK, and using G_SUB rather than
adding a negative amount via G_GEP.

Differential Revision: https://reviews.llvm.org/D66678

llvm-svn: 370104
This commit is contained in:
Amara Emerson 2019-08-27 19:54:27 +00:00
parent 86a4a530f4
commit e20b91c265
8 changed files with 282 additions and 57 deletions

View File

@ -227,6 +227,7 @@ public:
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
LegalizeResult lowerUnmergeValues(MachineInstr &MI);
LegalizeResult lowerShuffleVector(MachineInstr &MI);
LegalizeResult lowerDynStackAlloc(MachineInstr &MI);
private:
MachineRegisterInfo &MRI;

View File

@ -370,6 +370,17 @@ public:
/// given. Convert "llvm.dbg.label Label" to "DBG_LABEL Label".
MachineInstrBuilder buildDbgLabel(const MDNode *Label);
/// Build and insert \p Res = G_DYN_STACKALLOC \p Size, \p Align
///
/// G_DYN_STACKALLOC does a dynamic stack allocation and writes the address of
/// the allocated memory into \p Res.
///
/// \p Size is the source operand holding the amount to allocate. \p Align is
/// attached to the instruction as an immediate operand; the generic lowering
/// (LegalizerHelper::lowerDynStackAlloc) only masks the resulting address
/// when this value is non-zero.
/// \pre setBasicBlock or setMI must have been called.
/// \pre \p Res must be a generic virtual register with pointer type.
///
/// \return a MachineInstrBuilder for the newly created instruction.
MachineInstrBuilder buildDynStackAlloc(const DstOp &Res, const SrcOp &Size,
unsigned Align);
/// Build and insert \p Res = G_FRAME_INDEX \p Idx
///
/// G_FRAME_INDEX materializes the address of an alloca value or other

View File

@ -1781,36 +1781,25 @@ bool IRTranslator::translateAlloca(const User &U,
Register AllocSize = MRI->createGenericVirtualRegister(IntPtrTy);
Register TySize =
getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty)));
getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, DL->getTypeAllocSize(Ty)));
MIRBuilder.buildMul(AllocSize, NumElts, TySize);
LLT PtrTy = getLLTForType(*AI.getType(), *DL);
auto &TLI = *MF->getSubtarget().getTargetLowering();
Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
Register SPTmp = MRI->createGenericVirtualRegister(PtrTy);
MIRBuilder.buildCopy(SPTmp, SPReg);
Register AllocTmp = MRI->createGenericVirtualRegister(PtrTy);
MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize);
// Handle alignment. We have to realign if the allocation granule was smaller
// than stack alignment, or the specific alloca requires more than stack
// alignment.
unsigned StackAlign =
MF->getSubtarget().getFrameLowering()->getStackAlignment();
Align = std::max(Align, StackAlign);
if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) {
// Round the size of the allocation up to the stack alignment size
// by add SA-1 to the size. This doesn't overflow because we're computing
// an address inside an alloca.
Register AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy);
MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align));
AllocTmp = AlignedAlloc;
}
if (Align <= StackAlign)
Align = 0;
MIRBuilder.buildCopy(SPReg, AllocTmp);
MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp);
// Round the size of the allocation up to the stack alignment size
// by add SA-1 to the size. This doesn't overflow because we're computing
// an address inside an alloca.
auto SAMinusOne = MIRBuilder.buildConstant(IntPtrTy, StackAlign - 1);
auto AllocAdd = MIRBuilder.buildAdd(IntPtrTy, AllocSize, SAMinusOne,
MachineInstr::NoUWrap);
auto AlignCst =
MIRBuilder.buildConstant(IntPtrTy, ~(uint64_t)(StackAlign - 1));
auto AlignedAlloc = MIRBuilder.buildAnd(IntPtrTy, AllocAdd, AlignCst);
MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AlignedAlloc, Align);
MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI);
assert(MF->getFrameInfo().hasVarSizedObjects());

View File

@ -17,6 +17,7 @@
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@ -2153,6 +2154,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
}
case G_SHUFFLE_VECTOR:
return lowerShuffleVector(MI);
case G_DYN_STACKALLOC:
return lowerDynStackAlloc(MI);
}
}
@ -3913,3 +3916,38 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
MI.eraseFromParent();
return Legalized;
}
/// Lower a G_DYN_STACKALLOC into explicit stack pointer arithmetic.
///
/// Reads the destination register (operand 0), the allocation size register
/// (operand 1) and the alignment immediate (operand 2) from \p MI and emits,
/// in this order: a copy of the stack pointer, a cast of that copy to an
/// integer, a G_SUB of the allocation size, an optional G_AND with -Align,
/// a cast back to the pointer type, and finally copies of the result into the
/// stack pointer register and into the destination. \p MI is erased after the
/// replacement sequence has been built.
///
/// \return Legalized, unconditionally.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
Register AllocSize = MI.getOperand(1).getReg();
// An alignment of 0 means no realignment mask is emitted below.
unsigned Align = MI.getOperand(2).getImm();
const auto &MF = *MI.getMF();
const auto &TLI = *MF.getSubtarget().getTargetLowering();
LLT PtrTy = MRI.getType(Dst);
// Scalar type with the same bit width as the destination pointer; all of the
// address arithmetic below is done in this type.
LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
// Stack pointer register as reported by the target lowering.
Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
// Reinterpret the copied stack pointer as an integer so it can feed G_SUB.
SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
// Subtract the final alloc from the SP. Working on the integer form of the
// pointer lets us emit a single G_SUB, rather than an extra instruction to
// negate the alloc followed by a G_GEP that adds the negative offset.
auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
if (Align) {
// Materialize -Align at pointer width and AND it into the new address.
// NOTE(review): this clears the low bits (rounding the address down to a
// multiple of Align) only if Align is a power of two; that is not checked
// here -- confirm callers guarantee it.
APInt AlignMask(IntPtrTy.getSizeInBits(), Align, true);
AlignMask.negate();
auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
}
// Convert the adjusted address back to the pointer type.
SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
// The adjusted address becomes the new stack pointer and is also the value
// produced by the original instruction.
MIRBuilder.buildCopy(SPReg, SPTmp);
MIRBuilder.buildCopy(Dst, SPTmp);
MI.eraseFromParent();
return Legalized;
}

View File

@ -160,6 +160,17 @@ MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) {
return MIB.addMetadata(Label);
}
/// Build a G_DYN_STACKALLOC instruction with \p Res as its def, \p Size as its
/// source operand and \p Align as a trailing immediate operand.
///
/// Asserts that the type of \p Res is a pointer.
///
/// \return the MachineInstrBuilder for the newly built instruction.
MachineInstrBuilder MachineIRBuilder::buildDynStackAlloc(const DstOp &Res,
const SrcOp &Size,
unsigned Align) {
assert(Res.getLLTTy(*getMRI()).isPointer() && "expected ptr dst type");
auto MIB = buildInstr(TargetOpcode::G_DYN_STACKALLOC);
// Operands are appended in the order def, size, alignment; that order is the
// instruction's operand layout.
Res.addDefToMIB(*getMRI(), MIB);
Size.addSrcToMIB(MIB);
MIB.addImm(Align);
return MIB;
}
MachineInstrBuilder MachineIRBuilder::buildFrameIndex(const DstOp &Res,
int Idx) {
assert(Res.getLLTTy(*getMRI()).isPointer() && "invalid operand type");

View File

@ -605,6 +605,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
return Query.Types[0] == p0 && Query.Types[1] == s64;
});
getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
computeTables();
verify(*ST.getInstrInfo());
}

View File

@ -1,48 +1,59 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=aarch64 -global-isel %s -o - -stop-after=irtranslator | FileCheck %s
; CHECK-LABEL: name: test_simple_alloca
; CHECK: [[NUMELTS:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[TYPE_SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
; CHECK: [[NUMELTS_64:%[0-9]+]]:_(s64) = G_ZEXT [[NUMELTS]](s32)
; CHECK: [[NUMBYTES:%[0-9]+]]:_(s64) = G_MUL [[NUMELTS_64]], [[TYPE_SIZE]]
; CHECK: [[SP_TMP:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[ALLOC:%[0-9]+]]:_(p0) = G_GEP [[SP_TMP]], [[NUMBYTES]]
; CHECK: [[ALIGNED_ALLOC:%[0-9]+]]:_(p0) = G_PTR_MASK [[ALLOC]], 4
; CHECK: $sp = COPY [[ALIGNED_ALLOC]]
; CHECK: [[ALLOC:%[0-9]+]]:_(p0) = COPY [[ALIGNED_ALLOC]]
; CHECK: $x0 = COPY [[ALLOC]]
define i8* @test_simple_alloca(i32 %numelts) {
; CHECK-LABEL: name: test_simple_alloca
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $w0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
; CHECK: [[DYN_STACKALLOC:%[0-9]+]]:_(p0) = G_DYN_STACKALLOC [[AND]](s64), 0
; CHECK: $x0 = COPY [[DYN_STACKALLOC]](p0)
; CHECK: RET_ReallyLR implicit $x0
%addr = alloca i8, i32 %numelts
ret i8* %addr
}
; CHECK-LABEL: name: test_aligned_alloca
; CHECK: [[NUMELTS:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[TYPE_SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
; CHECK: [[NUMELTS_64:%[0-9]+]]:_(s64) = G_ZEXT [[NUMELTS]](s32)
; CHECK: [[NUMBYTES:%[0-9]+]]:_(s64) = G_MUL [[NUMELTS_64]], [[TYPE_SIZE]]
; CHECK: [[SP_TMP:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[ALLOC:%[0-9]+]]:_(p0) = G_GEP [[SP_TMP]], [[NUMBYTES]]
; CHECK: [[ALIGNED_ALLOC:%[0-9]+]]:_(p0) = G_PTR_MASK [[ALLOC]], 5
; CHECK: $sp = COPY [[ALIGNED_ALLOC]]
; CHECK: [[ALLOC:%[0-9]+]]:_(p0) = COPY [[ALIGNED_ALLOC]]
; CHECK: $x0 = COPY [[ALLOC]]
define i8* @test_aligned_alloca(i32 %numelts) {
; CHECK-LABEL: name: test_aligned_alloca
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $w0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
; CHECK: [[DYN_STACKALLOC:%[0-9]+]]:_(p0) = G_DYN_STACKALLOC [[AND]](s64), 32
; CHECK: $x0 = COPY [[DYN_STACKALLOC]](p0)
; CHECK: RET_ReallyLR implicit $x0
%addr = alloca i8, i32 %numelts, align 32
ret i8* %addr
}
; CHECK-LABEL: name: test_natural_alloca
; CHECK: [[NUMELTS:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[TYPE_SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
; CHECK: [[NUMELTS_64:%[0-9]+]]:_(s64) = G_ZEXT [[NUMELTS]](s32)
; CHECK: [[NUMBYTES:%[0-9]+]]:_(s64) = G_MUL [[NUMELTS_64]], [[TYPE_SIZE]]
; CHECK: [[SP_TMP:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[ALLOC:%[0-9]+]]:_(p0) = G_GEP [[SP_TMP]], [[NUMBYTES]]
; CHECK: $sp = COPY [[ALLOC]]
; CHECK: [[ALLOC_TMP:%[0-9]+]]:_(p0) = COPY [[ALLOC]]
; CHECK: $x0 = COPY [[ALLOC_TMP]]
define i128* @test_natural_alloca(i32 %numelts) {
; CHECK-LABEL: name: test_natural_alloca
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $w0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
; CHECK: [[DYN_STACKALLOC:%[0-9]+]]:_(p0) = G_DYN_STACKALLOC [[AND]](s64), 0
; CHECK: $x0 = COPY [[DYN_STACKALLOC]](p0)
; CHECK: RET_ReallyLR implicit $x0
%addr = alloca i128, i32 %numelts
ret i128* %addr
}

View File

@ -0,0 +1,162 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=arm64-unknown-unknown -global-isel -global-isel-abort=1 -O0 -run-pass=legalizer %s -o - | FileCheck %s
--- |
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
define i8* @test_simple_alloca(i32 %numelts) {
%addr = alloca i8, i32 %numelts
ret i8* %addr
}
define i8* @test_aligned_alloca(i32 %numelts) {
%addr = alloca i8, i32 %numelts, align 32
ret i8* %addr
}
define i128* @test_natural_alloca(i32 %numelts) {
%addr = alloca i128, i32 %numelts
ret i128* %addr
}
...
---
name: test_simple_alloca
alignment: 2
tracksRegLiveness: true
liveins:
- { reg: '$w0' }
frameInfo:
maxAlignment: 1
stack:
- { id: 0, name: addr, type: variable-sized, alignment: 1 }
machineFunctionInfo: {}
body: |
bb.1 (%ir-block.0):
liveins: $w0
; CHECK-LABEL: name: test_simple_alloca
; CHECK: liveins: $w0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
; CHECK: %5:_(s64) = nuw G_ADD [[MUL]], [[C1]]
; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND %5, [[C2]]
; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
; CHECK: $sp = COPY [[INTTOPTR]](p0)
; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
; CHECK: $x0 = COPY [[COPY2]](p0)
; CHECK: RET_ReallyLR implicit $x0
%0:_(s32) = COPY $w0
%3:_(s64) = G_CONSTANT i64 1
%1:_(s64) = G_ZEXT %0(s32)
%2:_(s64) = G_MUL %1, %3
%4:_(s64) = G_CONSTANT i64 15
%5:_(s64) = nuw G_ADD %2, %4
%6:_(s64) = G_CONSTANT i64 -16
%7:_(s64) = G_AND %5, %6
%8:_(p0) = G_DYN_STACKALLOC %7(s64), 0
$x0 = COPY %8(p0)
RET_ReallyLR implicit $x0
...
---
name: test_aligned_alloca
alignment: 2
tracksRegLiveness: true
liveins:
- { reg: '$w0' }
frameInfo:
maxAlignment: 32
stack:
- { id: 0, name: addr, type: variable-sized, alignment: 32 }
machineFunctionInfo: {}
body: |
bb.1 (%ir-block.0):
liveins: $w0
; CHECK-LABEL: name: test_aligned_alloca
; CHECK: liveins: $w0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
; CHECK: %5:_(s64) = nuw G_ADD [[MUL]], [[C1]]
; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND %5, [[C2]]
; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64)
; CHECK: $sp = COPY [[INTTOPTR]](p0)
; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
; CHECK: $x0 = COPY [[COPY2]](p0)
; CHECK: RET_ReallyLR implicit $x0
%0:_(s32) = COPY $w0
%3:_(s64) = G_CONSTANT i64 1
%1:_(s64) = G_ZEXT %0(s32)
%2:_(s64) = G_MUL %1, %3
%4:_(s64) = G_CONSTANT i64 15
%5:_(s64) = nuw G_ADD %2, %4
%6:_(s64) = G_CONSTANT i64 -16
%7:_(s64) = G_AND %5, %6
%8:_(p0) = G_DYN_STACKALLOC %7(s64), 32
$x0 = COPY %8(p0)
RET_ReallyLR implicit $x0
...
---
name: test_natural_alloca
alignment: 2
tracksRegLiveness: true
liveins:
- { reg: '$w0' }
frameInfo:
maxAlignment: 1
stack:
- { id: 0, name: addr, type: variable-sized, alignment: 1 }
machineFunctionInfo: {}
body: |
bb.1 (%ir-block.0):
liveins: $w0
; CHECK-LABEL: name: test_natural_alloca
; CHECK: liveins: $w0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
; CHECK: %5:_(s64) = nuw G_ADD [[MUL]], [[C1]]
; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND %5, [[C2]]
; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
; CHECK: $sp = COPY [[INTTOPTR]](p0)
; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
; CHECK: $x0 = COPY [[COPY2]](p0)
; CHECK: RET_ReallyLR implicit $x0
%0:_(s32) = COPY $w0
%3:_(s64) = G_CONSTANT i64 16
%1:_(s64) = G_ZEXT %0(s32)
%2:_(s64) = G_MUL %1, %3
%4:_(s64) = G_CONSTANT i64 15
%5:_(s64) = nuw G_ADD %2, %4
%6:_(s64) = G_CONSTANT i64 -16
%7:_(s64) = G_AND %5, %6
%8:_(p0) = G_DYN_STACKALLOC %7(s64), 0
$x0 = COPY %8(p0)
RET_ReallyLR implicit $x0
...