[AArch64][GlobalISel] Implement partial support for G_SHUFFLE_VECTOR

This change makes some basic type combinations legal for G_SHUFFLE_VECTOR, and
selects them with a very pessimistic TBL2-based sequence in the instruction selector.

TBL2 also requires support for creating constant pool entries and loading from
them, in order to materialize the mask register.

Currently supports <2 x s64> and <4 x s32> result types.
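
As a worked illustration of the approach (not itself part of the patch): TBL
operates on bytes, so the selector expands each mask element into a run of
consecutive byte indices. A minimal standalone C++ sketch of that expansion,
with a hypothetical helper name:

// Mirrors the byte-index expansion loop in selectShuffleVector below;
// the helper name and use of std::vector are illustrative only.
#include <cstdint>
#include <vector>

static std::vector<uint8_t> expandMaskToTBLBytes(const std::vector<int> &Mask,
                                                 unsigned BytesPerElt) {
  std::vector<uint8_t> Bytes;
  for (int Val : Mask)
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte)
      Bytes.push_back(static_cast<uint8_t>(Byte + Val * BytesPerElt));
  return Bytes;
}

For a <4 x s32> shuffle with mask <5, 7, 1, 0>, BytesPerElt is 4 and the
expansion yields <20..23, 28..31, 4..7, 0..3>, exactly the constant pool entry
checked in the shuffle_tbl_v4i32 test below.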

Differential Revision: https://reviews.llvm.org/D58466

llvm-svn: 354521
Commit: a946d057b4 (parent 55cc7eb5cb)
Author: Amara Emerson
Date: 2019-02-20 22:11:39 +00:00
5 changed files, 359 additions, 1 deletion


@@ -23,6 +23,7 @@
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -75,6 +76,14 @@ private:
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
  void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI,
                                 SmallVectorImpl<int> &Idxs) const;
  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
  unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
@@ -1696,6 +1705,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
    return selectMergeValues(I, MRI);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectUnmergeValues(I, MRI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return selectShuffleVector(I, MRI);
  }

  return false;
@@ -1913,6 +1924,125 @@ bool AArch64InstructionSelector::selectUnmergeValues(
  return true;
}

void AArch64InstructionSelector::collectShuffleMaskIndices(
    MachineInstr &I, MachineRegisterInfo &MRI,
    SmallVectorImpl<int> &Idxs) const {
  MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg());
  assert(
      MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
      "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR");
  // Find the constant indices.
  for (unsigned i = 1, e = MaskDef->getNumOperands(); i < e; ++i) {
    MachineInstr *ScalarDef = MRI.getVRegDef(MaskDef->getOperand(i).getReg());
    assert(ScalarDef && "Could not find vreg def of shufflevec index op");
    // Look through copies.
    while (ScalarDef->getOpcode() == TargetOpcode::COPY) {
      ScalarDef = MRI.getVRegDef(ScalarDef->getOperand(1).getReg());
      assert(ScalarDef && "Could not find def of copy operand");
    }
    assert(ScalarDef->getOpcode() == TargetOpcode::G_CONSTANT);
    Idxs.push_back(ScalarDef->getOperand(1).getCImm()->getSExtValue());
  }
}

unsigned
AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal,
                                                  MachineFunction &MF) const {
  Type *CPTy = CPVal->getType()->getPointerTo();
  unsigned Align = MF.getDataLayout().getPrefTypeAlignment(CPTy);
  if (Align == 0)
    Align = MF.getDataLayout().getTypeAllocSize(CPTy);

  MachineConstantPool *MCP = MF.getConstantPool();
  return MCP->getConstantPoolIndex(CPVal, Align);
}

MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
    Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
  unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());
  auto Adrp =
      MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
          .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
  auto Load =
      MIRBuilder
          .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
          .addConstantPoolIndex(CPIdx, 0,
                                AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
  return &*Load;
}

bool AArch64InstructionSelector::selectShuffleVector(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  unsigned Src1Reg = I.getOperand(1).getReg();
  const LLT Src1Ty = MRI.getType(Src1Reg);
  unsigned Src2Reg = I.getOperand(2).getReg();
  const LLT Src2Ty = MRI.getType(Src2Reg);

  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  LLVMContext &Ctx = MF.getFunction().getContext();

  // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask
  // operand, it comes in as a normal vector value which we have to analyze to
  // find the mask indices.
  SmallVector<int, 8> Mask;
  collectShuffleMaskIndices(I, MRI, Mask);
  assert(!Mask.empty() && "Expected to find mask indices");

  // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
  // it originated from a <1 x T> type. Those should have been lowered into
  // G_BUILD_VECTOR earlier.
  if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
    LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
    return false;
  }

  unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;

  SmallVector<Constant *, 64> CstIdxs;
  for (int Val : Mask) {
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
    }
  }

  if (DstTy.getSizeInBits() != 128) {
    assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
    // This case can be done with TBL1.
    return false;
  }

  // Use a constant pool to load the index vector for TBL.
  Constant *CPVal = ConstantVector::get(CstIdxs);
  MachineIRBuilder MIRBuilder(I);
  MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
  if (!IndexLoad) {
    LLVM_DEBUG(dbgs() << "Could not load from a constant pool\n");
    return false;
  }

  // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
  // Q registers for regalloc.
  auto RegSeq = MIRBuilder
                    .buildInstr(TargetOpcode::REG_SEQUENCE,
                                {&AArch64::QQRegClass}, {Src1Reg})
                    .addImm(AArch64::qsub0)
                    .addUse(Src2Reg)
                    .addImm(AArch64::qsub1);

  auto TBL2 =
      MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()},
                            {RegSeq, IndexLoad->getOperand(0).getReg()});
  constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}

bool AArch64InstructionSelector::selectBuildVector(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
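
A hedged usage sketch for the new constant pool helper; the call site and the
SomeI8Constants array are hypothetical, not from the patch:

// Hypothetical caller of emitLoadFromConstantPool (illustration only).
// SomeI8Constants is assumed to be an ArrayRef<Constant *> of i8 values.
MachineIRBuilder MIB(I);
Constant *CPVal = ConstantVector::get(SomeI8Constants); // e.g. a <16 x i8> mask
if (MachineInstr *Load = emitLoadFromConstantPool(CPVal, MIB)) {
  unsigned MaskReg = Load->getOperand(0).getReg();
  // MaskReg is an FPR128 vreg holding the constant, materialized through the
  // ADRP + LDRQui pair built by the helper.
}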


@@ -461,6 +461,29 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
      {s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32})
      .scalarize(1);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .legalIf([=](const LegalityQuery &Query) {
        const LLT &DstTy = Query.Types[0];
        const LLT &SrcTy = Query.Types[1];
        // For now just support the TBL2 variant which needs the source vectors
        // to be the same size as the dest.
        if (DstTy != SrcTy)
          return false;
        ArrayRef<LLT> SupportedDstTys = {v2s32, v4s32, v2s64};
        for (auto &Ty : SupportedDstTys) {
          if (DstTy == Ty)
            return true;
        }
        return false;
      })
      // G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors); we
      // just want those lowered into G_BUILD_VECTOR.
      .lowerIf([=](const LegalityQuery &Query) {
        return !Query.Types[1].isVector();
      })
      .clampNumElements(0, v4s32, v4s32)
      .clampNumElements(0, v2s64, v2s64);

  computeTables();
  verify(*ST.getInstrInfo());
}
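
To make the rule concrete, a short hedged sketch (not in the patch) of how a
few type combinations fare under it; the LLT constructors follow the GlobalISel
conventions of the period:

// Illustration only: which queries the legalIf above accepts.
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
const LLT v16s8 = LLT::vector(16, 8);
// Legal: dest and source types match and are in {v2s32, v4s32, v2s64}.
//   dst = v4s32, srcs = v4s32  -> legal, selected via TBL2
//   dst = v2s64, srcs = v2s64  -> legal, selected via TBL2
// Not matched by legalIf: mismatched dest/source types.
//   dst = v4s32, srcs = v16s8  -> falls through to the other actions
// Scalar sources (from <1 x T> vectors) hit the lowerIf and are rewritten
// in terms of G_BUILD_VECTOR instead.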


@@ -0,0 +1,54 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s
---
name: shuffle_v4i32
alignment: 2
tracksRegLiveness: true
body: |
  bb.1:
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_v4i32
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
    ; CHECK: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], [[BUILD_VECTOR]](<4 x s32>)
    ; CHECK: $q0 = COPY [[SHUF]](<4 x s32>)
    ; CHECK: RET_ReallyLR implicit $q0
    %0:_(<4 x s32>) = COPY $q0
    %1:_(<4 x s32>) = COPY $q1
    %4:_(s32) = G_CONSTANT i32 0
    %3:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32), %4(s32), %4(s32)
    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, %3(<4 x s32>)
    $q0 = COPY %2(<4 x s32>)
    RET_ReallyLR implicit $q0
...
---
name: shuffle_v2i64
alignment: 2
tracksRegLiveness: true
body: |
  bb.1:
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_v2i64
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
    ; CHECK: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s64>), [[COPY1]], [[BUILD_VECTOR]](<2 x s32>)
    ; CHECK: $q0 = COPY [[SHUF]](<2 x s64>)
    ; CHECK: RET_ReallyLR implicit $q0
    %0:_(<2 x s64>) = COPY $q0
    %1:_(<2 x s64>) = COPY $q1
    %4:_(s32) = G_CONSTANT i32 0
    %3:_(<2 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32)
    %2:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, %3(<2 x s32>)
    $q0 = COPY %2(<2 x s64>)
    RET_ReallyLR implicit $q0
...


@@ -313,7 +313,7 @@
# DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected
#
 # DEBUG-NEXT: G_SHUFFLE_VECTOR (opcode {{[0-9]+}}): 3 type indices
-# DEBUG: .. type index coverage check SKIPPED: no rules defined
+# DEBUG: .. type index coverage check SKIPPED: user-defined predicate detected
#
# DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices
# DEBUG: .. type index coverage check SKIPPED: no rules defined


@@ -0,0 +1,151 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs %s -global-isel-abort=1 -o - | FileCheck %s
--- |
  ; ModuleID = 'shufflevec-only-legal.ll'
  source_filename = "shufflevec-only-legal.ll"
  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
  target triple = "aarch64"

  define <4 x i32> @shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
    %shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
    ret <4 x i32> %shuf
  }

  define <4 x i32> @shuffle_tbl_v4i32(<4 x i32> %a, <4 x i32> %b) {
    %shuf = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 7, i32 1, i32 0>
    ret <4 x i32> %shuf
  }

  define <2 x i64> @shuffle_v2i64(<2 x i64> %a, <2 x i64> %b) {
    %shuf = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> zeroinitializer
    ret <2 x i64> %shuf
  }

...
---
name: shuffle_v4i32
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
  - { id: 0, class: fpr }
  - { id: 1, class: fpr }
  - { id: 2, class: fpr }
  - { id: 3, class: fpr }
  - { id: 4, class: gpr }
  - { id: 5, class: gpr }
  - { id: 6, class: gpr }
body: |
  bb.1 (%ir-block.0):
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_v4i32
    ; CHECK: constants:
    ; CHECK: value: '<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3>'
    ; CHECK: alignment: 8
    ; CHECK: isTargetSpecific: false
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[COPY]], %subreg.qsub0, [[COPY1]], %subreg.qsub1
    ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
    ; CHECK: $q0 = COPY [[TBLv16i8Two]]
    ; CHECK: RET_ReallyLR implicit $q0
    %0:fpr(<4 x s32>) = COPY $q0
    %1:fpr(<4 x s32>) = COPY $q1
    %4:gpr(s32) = G_CONSTANT i32 0
    %5:gpr(s32) = G_CONSTANT i32 1
    %6:gpr(s32) = G_CONSTANT i32 3
    %3:fpr(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %4(s32)
    %2:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, %3(<4 x s32>)
    $q0 = COPY %2(<4 x s32>)
    RET_ReallyLR implicit $q0
...
---
name: shuffle_tbl_v4i32
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
  - { id: 0, class: fpr }
  - { id: 1, class: fpr }
  - { id: 2, class: fpr }
  - { id: 3, class: fpr }
  - { id: 4, class: gpr }
  - { id: 5, class: gpr }
  - { id: 6, class: gpr }
  - { id: 7, class: gpr }
body: |
  bb.1 (%ir-block.0):
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_tbl_v4i32
    ; CHECK: constants:
    ; CHECK: value: '<16 x i8> <i8 20, i8 21, i8 22, i8 23, i8 28, i8 29, i8 30, i8 31, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3>'
    ; CHECK: alignment: 8
    ; CHECK: isTargetSpecific: false
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[COPY]], %subreg.qsub0, [[COPY1]], %subreg.qsub1
    ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
    ; CHECK: $q0 = COPY [[TBLv16i8Two]]
    ; CHECK: RET_ReallyLR implicit $q0
    %0:fpr(<4 x s32>) = COPY $q0
    %1:fpr(<4 x s32>) = COPY $q1
    %4:gpr(s32) = G_CONSTANT i32 5
    %5:gpr(s32) = G_CONSTANT i32 7
    %6:gpr(s32) = G_CONSTANT i32 1
    %7:gpr(s32) = G_CONSTANT i32 0
    %3:fpr(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32)
    %2:fpr(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, %3(<4 x s32>)
    $q0 = COPY %2(<4 x s32>)
    RET_ReallyLR implicit $q0
...
---
name: shuffle_v2i64
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
  - { id: 0, class: fpr }
  - { id: 1, class: fpr }
  - { id: 2, class: fpr }
  - { id: 3, class: fpr }
  - { id: 4, class: gpr }
body: |
  bb.1 (%ir-block.0):
    liveins: $q0, $q1

    ; CHECK-LABEL: name: shuffle_v2i64
    ; CHECK: constants:
    ; CHECK: value: '<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>'
    ; CHECK: alignment: 8
    ; CHECK: isTargetSpecific: false
    ; CHECK: liveins: $q0, $q1
    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
    ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
    ; CHECK: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
    ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:qq = REG_SEQUENCE [[COPY]], %subreg.qsub0, [[COPY1]], %subreg.qsub1
    ; CHECK: [[TBLv16i8Two:%[0-9]+]]:fpr128 = TBLv16i8Two [[REG_SEQUENCE]], [[LDRQui]]
    ; CHECK: $q0 = COPY [[TBLv16i8Two]]
    ; CHECK: RET_ReallyLR implicit $q0
    %0:fpr(<2 x s64>) = COPY $q0
    %1:fpr(<2 x s64>) = COPY $q1
    %4:gpr(s32) = G_CONSTANT i32 0
    %3:fpr(<2 x s32>) = G_BUILD_VECTOR %4(s32), %4(s32)
    %2:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %1, %3(<2 x s32>)
    $q0 = COPY %2(<2 x s64>)
    RET_ReallyLR implicit $q0
...