[WebAssembly] Fix RegStackify and ExplicitLocals to handle multivalue

Summary: There is still room for improvement in the handling of multivalue nodes in both passes, but the current algorithm is at least correct and optimizes some simpler cases. In order to make future optimizations of these passes easier and build confidence that the current algorithms are correct, this CL also adds a script that automatically and exhaustively generates interesting multivalue test cases. Reviewers: aheejin, dschuff Subscribers: sbc100, jgravelle-google, hiraditya, sunfish, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D72902
2020-01-14 14:22:49 -08:00 · 2020-01-14 14:22:49 -08:00 · 5286180999
parent b91d9ec0bb
commit 5286180999
5 changed files with 3760 additions and 63 deletions
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@ -178,11 +178,18 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) {
 /// start of the expression tree.
 static MachineInstr *findStartOfTree(MachineOperand &MO,
                                     MachineRegisterInfo &MRI,
-                                     WebAssemblyFunctionInfo &MFI) {
+                                     const WebAssemblyFunctionInfo &MFI) {
  Register Reg = MO.getReg();
  assert(MFI.isVRegStackified(Reg));
  MachineInstr *Def = MRI.getVRegDef(Reg);

+  // If this instruction has any non-stackified defs, it is the start of the
+  // expression tree.
+  for (auto DefReg : Def->defs()) {
+    if (!MFI.isVRegStackified(DefReg.getReg())) {
+      return Def;
+    }
+  }
+
  // Find the first stackified use and proceed from there.
  for (MachineOperand &DefMO : Def->explicit_uses()) {
    if (!DefMO.isReg())
@ -243,6 +250,12 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
      if (MI.isDebugInstr() || MI.isLabel())
        continue;

+      if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) {
+        MI.eraseFromParent();
+        Changed = true;
+        continue;
+      }
+
      // Replace tee instructions with local.tee. The difference is that tee
      // instructions have two defs, while local.tee instructions have one def
      // and an index of a local to write to.
@ -279,20 +292,13 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
        continue;
      }

-      // Insert local.sets for any defs that aren't stackified yet. Currently
-      // we handle at most one def.
-      assert(MI.getDesc().getNumDefs() <= 1);
-      if (MI.getDesc().getNumDefs() == 1) {
-        Register OldReg = MI.getOperand(0).getReg();
+      // Insert local.sets for any defs that aren't stackified yet.
+      for (auto &Def : MI.defs()) {
+        Register OldReg = Def.getReg();
        if (!MFI.isVRegStackified(OldReg)) {
          const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
          Register NewReg = MRI.createVirtualRegister(RC);
          auto InsertPt = std::next(MI.getIterator());
-          if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) {
-            MI.eraseFromParent();
-            Changed = true;
-            continue;
-          }
          if (UseEmpty[Register::virtReg2Index(OldReg)]) {
            unsigned Opc = getDropOpcode(RC);
            MachineInstr *Drop =
@ -310,11 +316,11 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
                .addImm(LocalId)
                .addReg(NewReg);
          }
-          MI.getOperand(0).setReg(NewReg);
          // This register operand of the original instruction is now being used
          // by the inserted drop or local.set instruction, so make it not dead
          // yet.
-          MI.getOperand(0).setIsDead(false);
+          Def.setReg(NewReg);
+          Def.setIsDead(false);
          MFI.stackifyVReg(NewReg);
          Changed = true;
        }
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@ -36,6 +36,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <iterator>
 using namespace llvm;

 #define DEBUG_TYPE "wasm-reg-stackify"
@ -120,6 +121,7 @@ static void convertImplicitDefToConstZero(MachineInstr *MI,
        Type::getDoubleTy(MF.getFunction().getContext())));
    MI->addOperand(MachineOperand::CreateFPImm(Val));
  } else if (RegClass == &WebAssembly::V128RegClass) {
+    // TODO: Replace this with v128.const 0 once that is supported in V8
    Register TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
    MI->setDesc(TII->get(WebAssembly::SPLAT_v4i32));
    MI->addOperand(MachineOperand::CreateReg(TempReg, false));
@ -312,25 +314,59 @@ static bool hasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI,
 // walking the block.
 // TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
 // more precise.
-static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
-                         AliasAnalysis &AA, const MachineRegisterInfo &MRI) {
-  assert(Def->getParent() == Insert->getParent());
+static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
+                         const MachineInstr *Insert, AliasAnalysis &AA,
+                         const WebAssemblyFunctionInfo &MFI,
+                         const MachineRegisterInfo &MRI) {
+  const MachineInstr *DefI = Def->getParent();
+  const MachineInstr *UseI = Use->getParent();
+  assert(DefI->getParent() == Insert->getParent());
+  assert(UseI->getParent() == Insert->getParent());
+
+  // The first def of a multivalue instruction can be stackified by moving,
+  // since the later defs can always be placed into locals if necessary. Later
+  // defs can only be stackified if all previous defs are already stackified
+  // since ExplicitLocals will not know how to place a def in a local if a
+  // subsequent def is stackified. But only one def can be stackified by moving
+  // the instruction, so it must be the first one.
+  //
+  // TODO: This could be loosened to be the first *live* def, but care would
+  // have to be taken to ensure the drops of the initial dead defs can be
+  // placed. This would require checking that no previous defs are used in the
+  // same instruction as subsequent defs.
+  if (Def != DefI->defs().begin())
+    return false;
+
+  // If any subsequent def is used prior to the current value by the same
+  // instruction in which the current value is used, we cannot stackify.
+  // Stackifying in this case would require moving that def below the current
+  // def in the stack, which cannot be achieved, even with locals.
+  for (const auto &SubsequentDef : drop_begin(DefI->defs(), 1)) {
+    for (const auto &PriorUse : UseI->uses()) {
+      if (&PriorUse == Use)
+        break;
+      if (PriorUse.isReg() && SubsequentDef.getReg() == PriorUse.getReg())
+        return false;
+    }
+  }
+
+  // If moving is a semantic nop (Insert is the next non-debug instruction
+  // after DefI), it is always allowed.
+  const MachineBasicBlock *MBB = DefI->getParent();
+  auto NextI = std::next(MachineBasicBlock::const_iterator(DefI));
+  for (auto E = MBB->end(); NextI != E && NextI->isDebugInstr(); ++NextI)
+    ;
+  if (NextI == Insert)
+    return true;

  // 'catch' and 'extract_exception' should be the first instruction of a BB and
  // cannot move.
-  if (Def->getOpcode() == WebAssembly::CATCH ||
-      Def->getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32) {
-    const MachineBasicBlock *MBB = Def->getParent();
-    auto NextI = std::next(MachineBasicBlock::const_iterator(Def));
-    for (auto E = MBB->end(); NextI != E && NextI->isDebugInstr(); ++NextI)
-      ;
-    if (NextI != Insert)
-      return false;
-  }
+  if (DefI->getOpcode() == WebAssembly::CATCH ||
+      DefI->getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32)
+    return false;

  // Check for register dependencies.
  SmallVector<unsigned, 4> MutableRegisters;
-  for (const MachineOperand &MO : Def->operands()) {
+  for (const MachineOperand &MO : DefI->operands()) {
    if (!MO.isReg() || MO.isUndef())
      continue;
    Register Reg = MO.getReg();
@ -360,7 +396,7 @@ static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
  }

  bool Read = false, Write = false, Effects = false, StackPointer = false;
-  query(*Def, AA, Read, Write, Effects, StackPointer);
+  query(*DefI, AA, Read, Write, Effects, StackPointer);

  // If the instruction does not access memory and has no side effects, it has
  // no additional dependencies.
@ -368,8 +404,8 @@ static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
  if (!Read && !Write && !Effects && !StackPointer && !HasMutableRegisters)
    return true;

-  // Scan through the intervening instructions between Def and Insert.
-  MachineBasicBlock::const_iterator D(Def), I(Insert);
+  // Scan through the intervening instructions between DefI and Insert.
+  MachineBasicBlock::const_iterator D(DefI), I(Insert);
  for (--I; I != D; --I) {
    bool InterveningRead = false;
    bool InterveningWrite = false;
@ -800,32 +836,32 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
      CommutingState Commuting;
      TreeWalkerState TreeWalker(Insert);
      while (!TreeWalker.done()) {
-        MachineOperand &Op = TreeWalker.pop();
+        MachineOperand &Use = TreeWalker.pop();

        // We're only interested in explicit virtual register operands.
-        if (!Op.isReg())
+        if (!Use.isReg())
          continue;

-        Register Reg = Op.getReg();
-        assert(Op.isUse() && "explicit_uses() should only iterate over uses");
-        assert(!Op.isImplicit() &&
+        Register Reg = Use.getReg();
+        assert(Use.isUse() && "explicit_uses() should only iterate over uses");
+        assert(!Use.isImplicit() &&
               "explicit_uses() should only iterate over explicit operands");
        if (Register::isPhysicalRegister(Reg))
          continue;

        // Identify the definition for this register at this point.
-        MachineInstr *Def = getVRegDef(Reg, Insert, MRI, LIS);
-        if (!Def)
+        MachineInstr *DefI = getVRegDef(Reg, Insert, MRI, LIS);
+        if (!DefI)
          continue;

        // Don't nest an INLINE_ASM def into anything, because we don't have
        // constraints for $pop outputs.
-        if (Def->isInlineAsm())
+        if (DefI->isInlineAsm())
          continue;

        // Argument instructions represent live-in registers and not real
        // instructions.
-        if (WebAssembly::isArgument(Def->getOpcode()))
+        if (WebAssembly::isArgument(DefI->getOpcode()))
          continue;

        // Currently catch's return value register cannot be stackified, because
@ -842,34 +878,38 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
        // register should be assigned to a local to be propagated across
        // 'block' boundary now.
        //
-        // TODO Fix this once we support the multi-value proposal.
-        if (Def->getOpcode() == WebAssembly::CATCH)
+        // TODO: Fix this once we support multivalue blocks.
+        if (DefI->getOpcode() == WebAssembly::CATCH)
          continue;

+        MachineOperand *Def = DefI->findRegisterDefOperand(Reg);
+        assert(Def != nullptr);
+
        // Decide which strategy to take. Prefer to move a single-use value
        // over cloning it, and prefer cloning over introducing a tee.
        // For moving, we require the def to be in the same block as the use;
        // this makes things simpler (LiveIntervals' handleMove function only
        // supports intra-block moves) and it's MachineSink's job to catch all
        // the sinking opportunities anyway.
-        bool SameBlock = Def->getParent() == &MBB;
-        bool CanMove = SameBlock && isSafeToMove(Def, Insert, AA, MRI) &&
+        bool SameBlock = DefI->getParent() == &MBB;
+        bool CanMove = SameBlock &&
+                       isSafeToMove(Def, &Use, Insert, AA, MFI, MRI) &&
                       !TreeWalker.isOnStack(Reg);
-        if (CanMove && hasOneUse(Reg, Def, MRI, MDT, LIS)) {
-          Insert = moveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI);
+        if (CanMove && hasOneUse(Reg, DefI, MRI, MDT, LIS)) {
+          Insert = moveForSingleUse(Reg, Use, DefI, MBB, Insert, LIS, MFI, MRI);

          // If we are removing the frame base reg completely, remove the debug
          // info as well.
          // TODO: Encode this properly as a stackified value.
          if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg)
            MFI.clearFrameBaseVreg();
-        } else if (shouldRematerialize(*Def, AA, TII)) {
+        } else if (shouldRematerialize(*DefI, AA, TII)) {
          Insert =
-              rematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
+              rematerializeCheapDef(Reg, Use, *DefI, MBB, Insert->getIterator(),
                                    LIS, MFI, MRI, TII, TRI);
-        } else if (CanMove &&
-                   oneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
-          Insert = moveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
+        } else if (CanMove && oneUseDominatesOtherUses(Reg, Use, MBB, MRI, MDT,
+                                                       LIS, MFI)) {
+          Insert = moveAndTeeForMultiUse(Reg, Use, DefI, MBB, Insert, LIS, MFI,
                                         MRI, TII);
        } else {
          // We failed to stackify the operand. If the problem was ordering
@ -880,6 +920,25 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
          continue;
        }

+        // Stackifying a multivalue def may unlock in-place stackification of
+        // subsequent defs. TODO: Handle the case where the consecutive uses are
+        // not all in the same instruction.
+        auto *SubsequentDef = DefI->defs().begin();
+        auto *SubsequentUse = &Use;
+        while (SubsequentDef != DefI->defs().end() &&
+               SubsequentUse != Use.getParent()->uses().end()) {
+          if (!SubsequentDef->isReg() || !SubsequentUse->isReg())
+            break;
+          unsigned DefReg = SubsequentDef->getReg();
+          unsigned UseReg = SubsequentUse->getReg();
+          // TODO: This single-use restriction could be relaxed by using tees
+          if (DefReg != UseReg || !MRI.hasOneUse(DefReg))
+            break;
+          MFI.stackifyVReg(DefReg);
+          ++SubsequentDef;
+          ++SubsequentUse;
+        }
+
        // If the instruction we just stackified is an IMPLICIT_DEF, convert it
        // to a constant 0 so that the def is explicit, and the push/pop
        // correspondence is maintained.
@ -917,18 +976,20 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.isDebugInstr())
        continue;
-      for (MachineOperand &MO : reverse(MI.explicit_operands())) {
+      for (MachineOperand &MO : reverse(MI.explicit_uses())) {
        if (!MO.isReg())
          continue;
        Register Reg = MO.getReg();
-
-        if (MFI.isVRegStackified(Reg)) {
-          if (MO.isDef())
-            Stack.push_back(Reg);
-          else
-            assert(Stack.pop_back_val() == Reg &&
-                   "Register stack pop should be paired with a push");
-        }
+        if (MFI.isVRegStackified(Reg))
+          assert(Stack.pop_back_val() == Reg &&
+                 "Register stack pop should be paired with a push");
+      }
+      for (MachineOperand &MO : MI.defs()) {
+        if (!MO.isReg())
+          continue;
+        Register Reg = MO.getReg();
+        if (MFI.isVRegStackified(Reg))
+          Stack.push_back(MO.getReg());
      }
    }
    // TODO: Generalize this code to support keeping values on the stack across
--- a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
--- a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.py
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.py
@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+
+"""A test case generator for register stackification.
+
+This script exhaustively generates small linear SSA programs, then filters them
+based on heuristics designed to keep interesting multivalue test cases and
+prints them as LLVM IR functions in a FileCheck test file.
+
+The output of this script is meant to be used in conjunction with
+update_llc_test_checks.py.
+
+  ```
+  ./multivalue-stackify.py > multivalue-stackify.ll
+  ../../../utils/update_llc_test_checks.py multivalue-stackify.ll
+  ```
+
+Programs are represented internally as lists of operations, where each operation
+is a pair of tuples, the first of which specifies the operation's uses and the
+second of which specifies its defs.
+
+TODO: Before embarking on a rewrite of the register stackifier, an abstract
+interpreter should be written to automatically check that the test assertions
+generated by update_llc_test_checks.py have the same semantics as the functions
+generated by this script. Once that is done, exhaustive testing can be done by
+making `is_interesting` return True.
+"""
+
+
+from itertools import product
+from collections import deque
+
+
+# Generation stops once programs reach this many operations.
+MAX_PROGRAM_OPS = 4
+# Upper bound on the total number of values defined across a whole program.
+MAX_PROGRAM_DEFS = 3
+# Upper bound on the number of values a single operation may use.
+MAX_OP_USES = 2
+
+
+def get_num_defs(program):
+  """Return the total number of values defined by all operations in `program`.
+
+  Each operation is a `(uses, defs)` pair; only the length of `defs` counts.
+  """
+  num_defs = 0
+  for _, defs in program:
+    num_defs += len(defs)
+  return num_defs
+
+
+def possible_ops(program):
+  """Yield every `(uses, defs)` operation that can be appended to `program`.
+
+  A candidate operation defines between 0 and the remaining def budget
+  (MAX_PROGRAM_DEFS minus the defs already in `program`) new values and uses
+  between 0 and MAX_OP_USES existing values. Operations with neither uses nor
+  defs are skipped.
+  """
+  program_defs = get_num_defs(program)
+  for num_defs in range(MAX_PROGRAM_DEFS - program_defs + 1):
+    for num_uses in range(MAX_OP_USES + 1):
+      if num_defs == 0 and num_uses == 0:
+        continue
+      # Uses are every ordered tuple of already-defined value indices (repeats
+      # allowed); new defs are numbered consecutively after the existing ones.
+      for uses in product(range(program_defs), repeat=num_uses):
+        yield uses, tuple(program_defs + i for i in range(num_defs))
+
+
+def generate_programs():
+  """Yield `(program_id, program)` pairs in breadth-first order.
+
+  Starting from the empty program, each dequeued program is extended by every
+  operation from `possible_ops`. Each extension receives the next sequential
+  id (starting at 1), is queued for further extension, and is yielded.
+  """
+  queue = deque()
+  queue.append([])
+  program_id = 0
+  while True:
+    program = queue.popleft()
+    # The queue is FIFO, so program lengths are non-decreasing; the first
+    # full-length program dequeued means every shorter one has been extended.
+    if len(program) == MAX_PROGRAM_OPS:
+      break
+    for op in possible_ops(program):
+      program_id += 1
+      new_program = program + [op]
+      queue.append(new_program)
+      yield program_id, new_program
+
+
+def get_num_terminal_ops(program):
+  """Return how many operations in `program` define no values."""
+  num_terminal_ops = 0
+  for _, defs in program:
+    if len(defs) == 0:
+      num_terminal_ops += 1
+  return num_terminal_ops
+
+
+def get_max_uses(program):
+  """Return the largest number of times any single value is used in `program`."""
+  # One counter per possible value index.
+  num_uses = [0] * MAX_PROGRAM_DEFS
+  for uses, _ in program:
+    for u in uses:
+      num_uses[u] += 1
+  return max(num_uses)
+
+
+def has_unused_op(program):
+  """Return True if some value-defining operation has none of its defs used.
+
+  Operations that define no values are never reported as unused.
+  """
+  used = [False] * MAX_PROGRAM_DEFS
+  # Walk backwards so that `used` already reflects every later operation's
+  # uses when an operation's defs are checked.
+  for uses, defs in program[::-1]:
+    if defs and all(not used[d] for d in defs):
+      return True
+    for u in uses:
+      used[u] = True
+  return False
+
+
+def has_multivalue_use(program):
+  """Return True if any operation uses a value defined by a multi-def operation.
+
+  A multi-def operation is one with two or more defs.
+  """
+  is_multi = [False] * MAX_PROGRAM_DEFS
+  for uses, defs in program:
+    if any(is_multi[u] for u in uses):
+      return True
+    if len(defs) >= 2:
+      for d in defs:
+        is_multi[d] = True
+  return False
+
+
+def has_mvp_use(program):
+  """Return True if `program` contains a use pattern built on single-def values.
+
+  Values defined by operations with at most one def are tracked in `is_mvp`.
+  The result is True when an operation with at least one use uses only such
+  values, or when an operation with at most one def uses any such value.
+
+  NOTE(review): "MVP" presumably refers to the single-value WebAssembly MVP
+  feature set -- confirm.
+  """
+  is_mvp = [False] * MAX_PROGRAM_DEFS
+  for uses, defs in program:
+    if uses and all(is_mvp[u] for u in uses):
+      return True
+    if len(defs) <= 1:
+      if any(is_mvp[u] for u in uses):
+        return True
+      for d in defs:
+        is_mvp[d] = True
+  return False
+
+
+def is_interesting(program):
+  """Return True if `program` should be printed as a test case.
+
+  Applies a sequence of rejection filters and finally requires that the
+  program uses at least one value produced by a multi-def operation.
+  """
+  # Allow only multivalue single-op programs
+  if len(program) == 1:
+    return len(program[0][1]) > 1
+
+  # Reject programs whose last two instructions have identical uses
+  if len(program) >= 2 and program[-1][0] == program[-2][0]:
+    return False
+
+  # Reject programs with too many ops that don't produce values
+  if get_num_terminal_ops(program) > 2:
+    return False
+
+  # The third use of a value is no more interesting than the second
+  if get_max_uses(program) >= 3:
+    return False
+
+  # Reject nontrivial programs that have unused instructions
+  if has_unused_op(program):
+    return False
+
+  # Reject programs that have boring MVP uses of MVP defs
+  if has_mvp_use(program):
+    return False
+
+  # Otherwise if it has multivalue usage it is interesting
+  return has_multivalue_use(program)
+
+
+def make_llvm_type(num_defs):
+  """Return the LLVM return type string for an operation with `num_defs` defs.
+
+  Zero defs gives 'void'; otherwise a struct of `num_defs` i32 fields, e.g.
+  '{i32, i32}' for two defs.
+  """
+  if num_defs == 0:
+    return 'void'
+  else:
+    return '{' + ', '.join(['i32'] * num_defs) + '}'
+
+
+def make_llvm_op_name(num_uses, num_defs):
+  """Return the function name for an op shape, e.g. 'op_2_to_1'."""
+  return f'op_{num_uses}_to_{num_defs}'
+
+
+def make_llvm_args(first_use, num_uses):
+  """Return a comma-separated i32 argument list of `num_uses` temporaries.
+
+  The temporaries are numbered consecutively starting at `%t{first_use}`.
+  """
+  return ', '.join([f'i32 %t{first_use + i}' for i in range(num_uses)])
+
+
+def print_llvm_program(program, name):
+  """Print `program` to stdout as an LLVM IR function `@name` returning void.
+
+  Each operation becomes a call to the matching `@op_<uses>_to_<defs>`
+  function, preceded by one `extractvalue` per use to pull the operand out of
+  the aggregate returned by its defining call.
+  """
+  # Counter for the next `%t<N>` temporary name.
+  tmp = 0
+  # Indexed by def number: (aggregate type, temporary holding the aggregate,
+  # field index within it).
+  def_data = []
+  print(f'define void @{name}() {{')
+  for uses, defs in program:
+    # The extracted operands occupy consecutive temporaries starting here.
+    first_arg = tmp
+    # Extract operands
+    for use in uses:
+      ret_type, var, idx = def_data[use]
+      print(f'  %t{tmp} = extractvalue {ret_type} %t{var}, {idx}')
+      tmp += 1
+    # Print instruction
+    assignment = ''
+    if len(defs) > 0:
+      assignment = f'%t{tmp} = '
+      result_var = tmp
+      tmp += 1
+    ret_type = make_llvm_type(len(defs))
+    op_name = make_llvm_op_name(len(uses), len(defs))
+    args = make_llvm_args(first_arg, len(uses))
+    print(f'  {assignment}call {ret_type} @{op_name}({args})')
+    # Update def_data
+    # (`result_var` is only read here, and only when the op has defs.)
+    for i in range(len(defs)):
+      def_data.append((ret_type, result_var, i))
+  print('  ret void')
+  print('}')
+
+
+def print_header():
+  """Print the test file preamble to stdout.
+
+  Emits the generator note, the RUN line, the target datalayout and triple,
+  and a `declare` for every op shape the generated functions may call.
+  """
+  print('; NOTE: Test functions have been generated by multivalue-stackify.py.')
+  print()
+  print('; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue',
+        '| FileCheck %s')
+  print()
+  print('; Test that the multivalue stackification works')
+  print()
+  print('target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"')
+  print('target triple = "wasm32-unknown-unknown"')
+  print()
+  # Declare one function per (num_uses, num_defs) combination, skipping the
+  # shape with no uses and no defs.
+  for num_uses in range(MAX_OP_USES + 1):
+    for num_defs in range(MAX_PROGRAM_DEFS + 1):
+      if num_uses == 0 and num_defs == 0:
+        continue
+      ret_type = make_llvm_type(num_defs)
+      op_name = make_llvm_op_name(num_uses, num_defs)
+      args = make_llvm_args(0, num_uses)
+      print(f'declare {ret_type} @{op_name}({args})')
+  print()
+
+
+# Script entry point: print the header, then one function per interesting
+# program, named 'f' followed by the program's generation id.
+if __name__ == '__main__':
+  print_header()
+  for i, program in generate_programs():
+    if is_interesting(program):
+      print_llvm_program(program, 'f' + str(i))
+      print()
--- a/llvm/test/CodeGen/WebAssembly/multivalue.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue.ll
@ -8,6 +8,10 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"

 %pair = type { i32, i64 }
+%rpair = type { i64, i32 }
+
+declare void @use_i32(i32)
+declare void @use_i64(i64)

 ; CHECK-LABEL: pair_const:
 ; CHECK-NEXT: .functype pair_const () -> (i32, i64)
@ -27,12 +31,12 @@ define %pair @pair_ident(%pair %p) {
  ret %pair %p
 }

-;; TODO: Multivalue calls are a WIP and the following test cases do
-;; not necessarily produce correct output. For now just check that
-;; they do not crash.
-
 ; CHECK-LABEL: pair_call:
 ; CHECK-NEXT: .functype pair_call () -> ()
+; CHECK-NEXT: call pair_const{{$}}
+; CHECK-NEXT: drop{{$}}
+; CHECK-NEXT: drop{{$}}
+; CHECK-NEXT: end_function{{$}}
 define void @pair_call() {
  %p = call %pair @pair_const()
  ret void
@ -40,6 +44,8 @@ define void @pair_call() {

 ; CHECK-LABEL: pair_call_return:
 ; CHECK-NEXT: .functype pair_call_return () -> (i32, i64)
+; CHECK-NEXT: call pair_const{{$}}
+; CHECK-NEXT: end_function{{$}}
 define %pair @pair_call_return() {
  %p = call %pair @pair_const()
  ret %pair %p
@ -47,7 +53,9 @@ define %pair @pair_call_return() {

 ; CHECK-LABEL: pair_call_indirect:
 ; CHECK-NEXT: .functype pair_call_indirect (i32) -> (i32, i64)
-; CHECK: call_indirect () -> (i32, i64){{$}}
+; CHECK-NEXT: local.get 0{{$}}
+; CHECK-NEXT: call_indirect () -> (i32, i64){{$}}
+; CHECK-NEXT: end_function{{$}}
 define %pair @pair_call_indirect(%pair()* %f) {
  %p = call %pair %f()
  ret %pair %p
@ -55,6 +63,8 @@ define %pair @pair_call_indirect(%pair()* %f) {

 ; CHECK-LABEL: pair_tail_call:
 ; CHECK-NEXT: .functype pair_tail_call () -> (i32, i64)
+; CHECK-NEXT: return_call pair_const{{$}}
+; CHECK-NEXT: end_function{{$}}
 define %pair @pair_tail_call() {
  %p = musttail call %pair @pair_const()
  ret %pair %p
@ -62,6 +72,9 @@ define %pair @pair_tail_call() {

 ; CHECK-LABEL: pair_call_return_first:
 ; CHECK-NEXT: .functype pair_call_return_first () -> (i32)
+; CHECK-NEXT: call pair_const{{$}}
+; CHECK-NEXT: drop{{$}}
+; CHECK-NEXT: end_function{{$}}
 define i32 @pair_call_return_first() {
  %p = call %pair @pair_const()
  %v = extractvalue %pair %p, 0
@ -70,20 +83,142 @@ define i32 @pair_call_return_first() {

 ; CHECK-LABEL: pair_call_return_second:
 ; CHECK-NEXT: .functype pair_call_return_second () -> (i64)
+; CHECK-NEXT: .local i64{{$}}
+; CHECK-NEXT: call pair_const{{$}}
+; CHECK-NEXT: local.set 0{{$}}
+; CHECK-NEXT: drop{{$}}
+; CHECK-NEXT: local.get 0{{$}}
+; CHECK-NEXT: end_function{{$}}
 define i64 @pair_call_return_second() {
  %p = call %pair @pair_const()
  %v = extractvalue %pair %p, 1
  ret i64 %v
 }

+; CHECK-LABEL: pair_call_use_first:
+; CHECK-NEXT: .functype pair_call_use_first () -> ()
+; CHECK-NEXT: call pair_const{{$}}
+; CHECK-NEXT: drop{{$}}
+; CHECK-NEXT: call use_i32{{$}}
+; CHECK-NEXT: end_function{{$}}
+define void @pair_call_use_first() {
+  %p = call %pair @pair_const()
+  %v = extractvalue %pair %p, 0
+  call void @use_i32(i32 %v)
+  ret void
+}
+
+; CHECK-LABEL: pair_call_use_second:
+; CHECK-NEXT: .functype pair_call_use_second () -> ()
+; CHECK-NEXT: .local i64
+; CHECK-NEXT: call pair_const{{$}}
+; CHECK-NEXT: local.set 0{{$}}
+; CHECK-NEXT: drop{{$}}
+; CHECK-NEXT: local.get 0{{$}}
+; CHECK-NEXT: call use_i64{{$}}
+; CHECK-NEXT: end_function{{$}}
+define void @pair_call_use_second() {
+  %p = call %pair @pair_const()
+  %v = extractvalue %pair %p, 1
+  call void @use_i64(i64 %v)
+  ret void
+}
+
+; CHECK-LABEL: pair_call_use_first_return_second:
+; CHECK-NEXT: .functype pair_call_use_first_return_second () -> (i64)
+; CHECK-NEXT: .local i64{{$}}
+; CHECK-NEXT: call pair_const{{$}}
+; CHECK-NEXT: local.set 0{{$}}
+; CHECK-NEXT: call use_i32{{$}}
+; CHECK-NEXT: local.get 0{{$}}
+; CHECK-NEXT: end_function{{$}}
+define i64 @pair_call_use_first_return_second() {
+  %p = call %pair @pair_const()
+  %v = extractvalue %pair %p, 0
+  call void @use_i32(i32 %v)
+  %r = extractvalue %pair %p, 1
+  ret i64 %r
+}
+
+; CHECK-LABEL: pair_call_use_second_return_first:
+; CHECK-NEXT: .functype pair_call_use_second_return_first () -> (i32)
+; CHECK-NEXT: .local i32, i64{{$}}
+; CHECK-NEXT: call pair_const{{$}}
+; CHECK-NEXT: local.set 1{{$}}
+; CHECK-NEXT: local.set 0{{$}}
+; CHECK-NEXT: local.get 1{{$}}
+; CHECK-NEXT: call use_i64{{$}}
+; CHECK-NEXT: local.get 0{{$}}
+; CHECK-NEXT: end_function{{$}}
+define i32 @pair_call_use_second_return_first() {
+  %p = call %pair @pair_const()
+  %v = extractvalue %pair %p, 1
+  call void @use_i64(i64 %v)
+  %r = extractvalue %pair %p, 0
+  ret i32 %r
+}

 ; CHECK-LABEL: pair_pass_through:
 ; CHECK-NEXT: .functype pair_pass_through (i32, i64) -> (i32, i64)
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: call pair_ident{{$}}
+; CHECK-NEXT: end_function{{$}}
 define %pair @pair_pass_through(%pair %p) {
  %r = call %pair @pair_ident(%pair %p)
  ret %pair %r
 }

+; CHECK-LABEL: pair_swap:
+; CHECK-NEXT: .functype pair_swap (i32, i64) -> (i64, i32)
+; CHECK-NEXT: local.get 1{{$}}
+; CHECK-NEXT: local.get 0{{$}}
+; CHECK-NEXT: end_function{{$}}
+define %rpair @pair_swap(%pair %p) {
+  %first = extractvalue %pair %p, 0
+  %second = extractvalue %pair %p, 1
+  %r1 = insertvalue %rpair undef, i32 %first, 1
+  %r2 = insertvalue %rpair %r1, i64 %second, 0
+  ret %rpair %r2
+}
+
+; CHECK-LABEL: pair_call_swap:
+; CHECK-NEXT: .functype pair_call_swap () -> (i64, i32)
+; CHECK-NEXT: .local i32, i64{{$}}
+; CHECK-NEXT: call pair_const{{$}}
+; CHECK-NEXT: local.set 1{{$}}
+; CHECK-NEXT: local.set 0{{$}}
+; CHECK-NEXT: local.get 1{{$}}
+; CHECK-NEXT: local.get 0{{$}}
+; CHECK-NEXT: end_function{{$}}
+define %rpair @pair_call_swap() {
+  %p = call %pair @pair_const()
+  %first = extractvalue %pair %p, 0
+  %second = extractvalue %pair %p, 1
+  %r1 = insertvalue %rpair undef, i32 %first, 1
+  %r2 = insertvalue %rpair %r1, i64 %second, 0
+  ret %rpair %r2
+}
+
+; CHECK-LABEL: pair_pass_through_swap:
+; CHECK-NEXT: .functype pair_pass_through_swap (i32, i64) -> (i64, i32)
+; CHECK-NEXT: local.get 0{{$}}
+; CHECK-NEXT: local.get 1{{$}}
+; CHECK-NEXT: call pair_ident{{$}}
+; CHECK-NEXT: local.set 1{{$}}
+; CHECK-NEXT: local.set 0{{$}}
+; CHECK-NEXT: local.get 1{{$}}
+; CHECK-NEXT: local.get 0{{$}}
+; CHECK-NEXT: end_function{{$}}
+define %rpair @pair_pass_through_swap(%pair %p) {
+  %p1 = call %pair @pair_ident(%pair %p)
+  %first = extractvalue %pair %p1, 0
+  %second = extractvalue %pair %p1, 1
+  %r1 = insertvalue %rpair undef, i32 %first, 1
+  %r2 = insertvalue %rpair %r1, i64 %second, 0
+  ret %rpair %r2
+}
+
 ; CHECK-LABEL: minimal_loop:
 ; CHECK-NEXT: .functype minimal_loop (i32) -> (i32, i64)
 ; CHECK-NEXT: .LBB{{[0-9]+}}_1:
@ -91,6 +226,7 @@ define %pair @pair_pass_through(%pair %p) {
 ; CHECK-NEXT: br 0{{$}}
 ; CHECK-NEXT: .LBB{{[0-9]+}}_2:
 ; CHECK-NEXT: end_loop{{$}}
+; CHECK-NEXT: end_function{{$}}
 define %pair @minimal_loop(i32* %p) {
 entry:
  br label %loop
@ -138,3 +274,23 @@ loop:
 ; OBJ-NEXT:         ParamTypes:      []
 ; OBJ-NEXT:         ReturnTypes:
 ; OBJ-NEXT:           - I64
+; OBJ-NEXT:       - Index:           6
+; OBJ-NEXT:         ParamTypes:
+; OBJ-NEXT:           - I32
+; OBJ-NEXT:         ReturnTypes:     []
+; OBJ-NEXT:       - Index:           7
+; OBJ-NEXT:         ParamTypes:
+; OBJ-NEXT:           - I64
+; OBJ-NEXT:         ReturnTypes:     []
+; OBJ-NEXT:       - Index:           8
+; OBJ-NEXT:         ParamTypes:
+; OBJ-NEXT:           - I32
+; OBJ-NEXT:           - I64
+; OBJ-NEXT:         ReturnTypes:
+; OBJ-NEXT:           - I64
+; OBJ-NEXT:           - I32
+; OBJ-NEXT:       - Index:           9
+; OBJ-NEXT:         ParamTypes:      []
+; OBJ-NEXT:         ReturnTypes:
+; OBJ-NEXT:           - I64
+; OBJ-NEXT:           - I32