[IfCvt][ARM] Optimise diamond if-conversion for code size

Currently, the heuristics the if-conversion pass uses for diamond if-conversion are based on execution time, with no consideration for code size. This adds a new set of heuristics to be used when optimising for code size. This is mostly target-independent, because the if-conversion pass can see the code size of the instructions which it is removing. For Thumb, there are a few passes (insertion of IT instructions, selection of narrow branches, and selection of CBZ instructions) which are run after if-conversion and affect these heuristics, so I've added target hooks to better predict the code-size effect of a proposed if-conversion. Differential revision: https://reviews.llvm.org/D67350 llvm-svn: 374301
2019-10-10 09:58:28 +00:00 · 2019-10-10 09:58:28 +00:00 · 4f454b2275
parent c92a75fec0
commit 4f454b2275
5 changed files with 724 additions and 17 deletions
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@ -778,6 +778,19 @@ public:
    return false;
  }

+  /// Return the increase in code size (in bytes) needed to predicate a
+  /// contiguous run of NumInsts instructions. The default assumes no cost.
+  virtual unsigned extraSizeToPredicateInstructions(const MachineFunction &MF,
+                                                    unsigned NumInsts) const {
+    return 0;
+  }
+
+  /// Return an estimate (in bytes) of the code size saved by removing branch
+  /// MI during if-conversion. The default is MI's size from getInstSizeInBytes.
+  virtual unsigned predictBranchSizeForIfCvt(MachineInstr &MI) const {
+    return getInstSizeInBytes(MI);
+  }
+
  /// Return true if it's profitable to unpredicate
  /// one side of a 'diamond', i.e. two sides of if-else predicated on mutually
  /// exclusive predicates.
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@ -285,14 +285,113 @@ namespace {
                                                   Prediction);
    }

-    bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB,
-                            unsigned TCycle, unsigned TExtra,
-                            MachineBasicBlock &FBB,
-                            unsigned FCycle, unsigned FExtra,
-                            BranchProbability Prediction) const {
-      return TCycle > 0 && FCycle > 0 &&
-        TII->isProfitableToIfCvt(TBB, TCycle, TExtra, FBB, FCycle, FExtra,
-                                 Prediction);
+    bool MeetIfcvtSizeLimit(BBInfo &TBBInfo, BBInfo &FBBInfo,
+                            MachineBasicBlock &CommBB, unsigned Dups,
+                            BranchProbability Prediction, bool Forked) const {
+      const MachineFunction &MF = *TBBInfo.BB->getParent();
+      if (MF.getFunction().hasMinSize()) { // Decide on code size, not cycles.
+        MachineBasicBlock::iterator TIB = TBBInfo.BB->begin();
+        MachineBasicBlock::iterator FIB = FBBInfo.BB->begin();
+        MachineBasicBlock::iterator TIE = TBBInfo.BB->end();
+        MachineBasicBlock::iterator FIE = FBBInfo.BB->end();
+
+        unsigned Dups1, Dups2;
+        if (!CountDuplicatedInstructions(TIB, FIB, TIE, FIE, Dups1, Dups2,
+                                         *TBBInfo.BB, *FBBInfo.BB,
+                                         /*SkipUnconditionalBranches*/ true))
+          llvm_unreachable("should already have been checked by ValidDiamond");
+
+        unsigned BranchBytes = 0; // Estimated bytes saved by removed branches.
+        unsigned CommonBytes = 0; // Size of instructions common to both blocks.
+
+        // Size the common prefix of both blocks: [begin, TIB) and [begin, FIB).
+        for (auto &I : make_range(TBBInfo.BB->begin(), TIB)) {
+          LLVM_DEBUG(dbgs() << "Common inst: " << I);
+          CommonBytes += TII->getInstSizeInBytes(I);
+        }
+        for (auto &I : make_range(FBBInfo.BB->begin(), FIB)) {
+          LLVM_DEBUG(dbgs() << "Common inst: " << I);
+          CommonBytes += TII->getInstSizeInBytes(I);
+        }
+
+        // Count instructions at the end of the true and false blocks, after
+        // the ones we plan to predicate. Analyzable branches will be removed
+        // (unless this is a forked diamond), so they count as savings; every
+        // other instruction there is common between the two blocks.
+        for (auto &I : make_range(TIE, TBBInfo.BB->end())) {
+          if (I.isBranch() && TBBInfo.IsBrAnalyzable && !Forked) {
+            LLVM_DEBUG(dbgs() << "Saving branch: " << I);
+            BranchBytes += TII->predictBranchSizeForIfCvt(I);
+          } else {
+            LLVM_DEBUG(dbgs() << "Common inst: " << I);
+            CommonBytes += TII->getInstSizeInBytes(I);
+          }
+        }
+        for (auto &I : make_range(FIE, FBBInfo.BB->end())) {
+          if (I.isBranch() && FBBInfo.IsBrAnalyzable && !Forked) {
+            LLVM_DEBUG(dbgs() << "Saving branch: " << I);
+            BranchBytes += TII->predictBranchSizeForIfCvt(I);
+          } else {
+            LLVM_DEBUG(dbgs() << "Common inst: " << I);
+            CommonBytes += TII->getInstSizeInBytes(I);
+          }
+        }
+        for (auto &I : CommBB.terminators()) { // CommBB's branches count too.
+          if (I.isBranch()) {
+            LLVM_DEBUG(dbgs() << "Saving branch: " << I);
+            BranchBytes += TII->predictBranchSizeForIfCvt(I);
+          }
+        }
+
+        // CommonBytes was summed over both blocks, but only one copy of each
+        // common instruction is eliminated, so the saving is half of that.
+        CommonBytes /= 2;
+
+        // Count the non-debug instructions which we would need to predicate.
+        unsigned NumPredicatedInstructions = 0;
+        for (auto &I : make_range(TIB, TIE)) {
+          if (!I.isDebugInstr()) {
+            LLVM_DEBUG(dbgs() << "Predicating: " << I);
+            NumPredicatedInstructions++;
+          }
+        }
+        for (auto &I : make_range(FIB, FIE)) {
+          if (!I.isDebugInstr()) {
+            LLVM_DEBUG(dbgs() << "Predicating: " << I);
+            NumPredicatedInstructions++;
+          }
+        }
+
+        // Even though we're optimising for size at the expense of performance,
+        // avoid creating really long predicated blocks.
+        if (NumPredicatedInstructions > 15)
+          return false;
+
+        // Some targets (e.g. Thumb2) need to insert extra instructions to
+        // start predicated blocks; profitable only if the savings exceed them.
+        unsigned ExtraPredicateBytes = TII->extraSizeToPredicateInstructions(
+            MF, NumPredicatedInstructions);
+
+        LLVM_DEBUG(dbgs() << "MeetIfcvtSizeLimit(BranchBytes=" << BranchBytes
+                          << ", CommonBytes=" << CommonBytes
+                          << ", NumPredicatedInstructions="
+                          << NumPredicatedInstructions
+                          << ", ExtraPredicateBytes=" << ExtraPredicateBytes
+                          << ")\n");
+        return (BranchBytes + CommonBytes) > ExtraPredicateBytes;
+      } else { // Not minsize: defer to the cycle-based target hook.
+        unsigned TCycle = TBBInfo.NonPredSize + TBBInfo.ExtraCost - Dups;
+        unsigned FCycle = FBBInfo.NonPredSize + FBBInfo.ExtraCost - Dups;
+        bool Res = TCycle > 0 && FCycle > 0 &&
+                   TII->isProfitableToIfCvt(
+                       *TBBInfo.BB, TCycle, TBBInfo.ExtraCost2, *FBBInfo.BB,
+                       FCycle, FBBInfo.ExtraCost2, Prediction);
+        LLVM_DEBUG(dbgs() << "MeetIfcvtSizeLimit(TCycle=" << TCycle
+                          << ", FCycle=" << FCycle
+                          << ", TExtra=" << TBBInfo.ExtraCost2 << ", FExtra="
+                          << FBBInfo.ExtraCost2 << ") = " << Res << "\n");
+        return Res;
+      }
    }

    /// Returns true if Block ends without a terminator.
@ -842,6 +941,8 @@ bool IfConverter::ValidForkedDiamond(

  TrueBBICalc.BB = TrueBBI.BB;
  FalseBBICalc.BB = FalseBBI.BB;
+  TrueBBICalc.IsBrAnalyzable = TrueBBI.IsBrAnalyzable;
+  FalseBBICalc.IsBrAnalyzable = FalseBBI.IsBrAnalyzable;
  if (!RescanInstructions(TIB, FIB, TIE, FIE, TrueBBICalc, FalseBBICalc))
    return false;

@ -899,6 +1000,8 @@ bool IfConverter::ValidDiamond(

  TrueBBICalc.BB = TrueBBI.BB;
  FalseBBICalc.BB = FalseBBI.BB;
+  TrueBBICalc.IsBrAnalyzable = TrueBBI.IsBrAnalyzable;
+  FalseBBICalc.IsBrAnalyzable = FalseBBI.IsBrAnalyzable;
  if (!RescanInstructions(TIB, FIB, TIE, FIE, TrueBBICalc, FalseBBICalc))
    return false;
  // The size is used to decide whether to if-convert, and the shared portions
@ -1186,13 +1289,9 @@ void IfConverter::AnalyzeBlock(

    if (CanRevCond) {
      BBInfo TrueBBICalc, FalseBBICalc;
-      auto feasibleDiamond = [&]() {
-        bool MeetsSize = MeetIfcvtSizeLimit(
-            *TrueBBI.BB, (TrueBBICalc.NonPredSize - (Dups + Dups2) +
-                          TrueBBICalc.ExtraCost), TrueBBICalc.ExtraCost2,
-            *FalseBBI.BB, (FalseBBICalc.NonPredSize - (Dups + Dups2) +
-                           FalseBBICalc.ExtraCost), FalseBBICalc.ExtraCost2,
-            Prediction);
+      auto feasibleDiamond = [&](bool Forked) {
+        bool MeetsSize = MeetIfcvtSizeLimit(TrueBBICalc, FalseBBICalc, *BB,
+                                            Dups + Dups2, Prediction, Forked);
        bool TrueFeasible = FeasibilityAnalysis(TrueBBI, BBI.BrCond,
                                                /* IsTriangle */ false, /* RevCond */ false,
                                                /* hasCommonTail */ true);
@ -1204,7 +1303,7 @@ void IfConverter::AnalyzeBlock(

      if (ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2,
                       TrueBBICalc, FalseBBICalc)) {
-        if (feasibleDiamond()) {
+        if (feasibleDiamond(false)) {
          // Diamond:
          //   EBB
          //   / \_
@ -1220,7 +1319,7 @@ void IfConverter::AnalyzeBlock(
        }
      } else if (ValidForkedDiamond(TrueBBI, FalseBBI, Dups, Dups2,
                                    TrueBBICalc, FalseBBICalc)) {
-        if (feasibleDiamond()) {
+        if (feasibleDiamond(true)) {
          // ForkedDiamond:
          // if TBB and FBB have a common tail that includes their conditional
          // branch instructions, then we can If Convert this pattern.
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@ -2079,6 +2079,38 @@ isProfitableToIfCvt(MachineBasicBlock &TBB,
  return PredCost <= UnpredCost;
 }

+unsigned
+ARMBaseInstrInfo::extraSizeToPredicateInstructions(const MachineFunction &MF,
+                                                   unsigned NumInsts) const {
+  // Thumb2 needs a 2-byte IT instruction per group of up to 4 predicated
+  // instructions, i.e. ceil(NumInsts / 4) * 2 bytes in total. ARM has a
+  // condition code field in every predicable instruction, so it costs nothing.
+  return Subtarget.isThumb2() ? divideCeil(NumInsts, 4) * 2 : 0;
+}
+
+unsigned
+ARMBaseInstrInfo::predictBranchSizeForIfCvt(MachineInstr &MI) const {
+  // If this branch is likely to be folded into the comparison to form a
+  // CB(N)Z, then removing it won't reduce code size at all, because that will
+  // just replace the CB(N)Z with a CMP. Report no saving in that case.
+  if (MI.getOpcode() == ARM::t2Bcc &&
+      findCMPToFoldIntoCBZ(&MI, &getRegisterInfo()))
+    return 0;
+
+  unsigned Size = getInstSizeInBytes(MI);
+
+  // For Thumb2, all branches are 32-bit instructions during the if conversion
+  // pass, but may be replaced with 16-bit instructions during size reduction.
+  // Since the branches considered by if conversion tend to be forward branches
+  // over small basic blocks, they are very likely to be in range for the
+  // narrow instructions, so we estimate the final code size as half of the
+  // size reported now.
+  if (Subtarget.isThumb2())
+    Size /= 2;
+
+  return Size;
+}
+
 bool
 ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
                                            MachineBasicBlock &FMBB) const {
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@ -276,6 +276,10 @@ public:
    return NumCycles == 1;
  }

+  unsigned extraSizeToPredicateInstructions(const MachineFunction &MF,
+                                            unsigned NumInsts) const override;
+  unsigned predictBranchSizeForIfCvt(MachineInstr &MI) const override;
+
  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
                                 MachineBasicBlock &FMBB) const override;

--- a/llvm/test/CodeGen/ARM/ifcvt-size.mir
+++ b/llvm/test/CodeGen/ARM/ifcvt-size.mir
@ -0,0 +1,559 @@
+# RUN: llc %s -o - -run-pass=if-converter -debug-only=if-converter 2>%t| FileCheck %s
+# RUN: FileCheck %s < %t --check-prefix=DEBUG
+# REQUIRES: asserts
+
+# When optimising for size, we use a different set of heuristics for
+# if-conversion, which take into account the size of the instructions, not the
+# time taken to execute them. This is more complicated for Thumb, where it is
+# also affected by selection of narrow branch instructions, insertion of IT
+# instructions, and selection of the CB(N)Z instructions.
+
+--- |
+  target triple = "thumbv7-unknown-linux-gnueabi"
+
+  define void @fn1() minsize {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.else:
+    unreachable
+  if.end:
+    unreachable
+  }
+
+  define void @fn2() minsize {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.else:
+    unreachable
+  if.end:
+    unreachable
+  }
+
+  define void @fn3() minsize {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.else:
+    unreachable
+  if.end:
+    unreachable
+  }
+
+  define void @fn4() minsize "target-features"="-thumb-mode" {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.else:
+    unreachable
+  if.end:
+    unreachable
+  }
+
+  define void @fn5() minsize {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.else:
+    unreachable
+  if.end:
+    unreachable
+  }
+
+  define void @fn6() minsize {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.else:
+    unreachable
+  if2.then:
+    unreachable
+  if2.else:
+    unreachable
+  }
+
+  define void @fn7() minsize "target-features"="-thumb-mode" {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.else:
+    unreachable
+  if.end:
+    unreachable
+  }
+
+  define void @fn8() minsize {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.else:
+    unreachable
+  if.end:
+    unreachable
+  }
+
+  define void @fn9() minsize {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.else:
+    unreachable
+  lab1:
+    unreachable
+  }
+...
+---
+name:            fn1
+alignment:       1
+tracksRegLiveness: true
+
+# If-conversion is profitable here because it will remove two branches of 2
+# bytes each (assuming they can become narrow branches later), and will only
+# add 2 bytes with the IT instruction.
+
+# CHECK-LABEL: name:            fn1
+# CHECK:      t2CMPri
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRSHi12
+# CHECK-NEXT: t2MOVi
+
+# DEBUG-LABEL: Ifcvt: function ({{[0-9]+}}) 'fn1'
+# DEBUG: MeetIfcvtSizeLimit(BranchBytes=4, CommonBytes=0, NumPredicatedInstructions=4, ExtraPredicateBytes=2)
+
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    t2CMPri killed renamable $r2, 5, 14, $noreg, implicit-def $cpsr
+    t2Bcc %bb.2, 11, killed $cpsr
+
+  bb.1.if.then:
+    successors: %bb.3(0x80000000)
+    liveins: $r0, $r3
+
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    t2B %bb.3, 14, $noreg
+
+  bb.2.if.else:
+    successors: %bb.3(0x80000000)
+    liveins: $r1, $r3
+
+    renamable $r0 = t2LDRi12 killed renamable $r1, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRSHi12 killed renamable $r0, 0, 14, $noreg
+
+  bb.3.if.end:
+    liveins: $r0, $r3
+
+    renamable $r1 = t2MOVi 0, 14, $noreg, $noreg
+    t2STRi12 killed renamable $r1, killed renamable $r3, 0, 14, $noreg
+    tBX_RET 14, $noreg, implicit $r0
+
+---
+name:            fn2
+alignment:       1
+tracksRegLiveness: true
+
+# If-conversion is not profitable here, because the 5 conditional instructions
+# would require 2 IT instructions.
+
+# CHECK-LABEL: name:            fn2
+# CHECK:      t2CMPri
+# CHECK-NEXT: t2Bcc
+
+# DEBUG-LABEL: Ifcvt: function ({{[0-9]+}}) 'fn2'
+# DEBUG: MeetIfcvtSizeLimit(BranchBytes=4, CommonBytes=0, NumPredicatedInstructions=5, ExtraPredicateBytes=4)
+
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    t2CMPri killed renamable $r2, 5, 14, $noreg, implicit-def $cpsr
+    t2Bcc %bb.2, 11, killed $cpsr
+
+  bb.1.if.then:
+    successors: %bb.3(0x80000000)
+    liveins: $r0, $r3
+
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    t2B %bb.3, 14, $noreg
+
+  bb.2.if.else:
+    successors: %bb.3(0x80000000)
+    liveins: $r1, $r3
+
+    renamable $r0 = t2LDRi12 killed renamable $r1, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRSHi12 killed renamable $r0, 0, 14, $noreg
+
+  bb.3.if.end:
+    liveins: $r0, $r3
+
+    renamable $r1 = t2MOVi 0, 14, $noreg, $noreg
+    t2STRi12 killed renamable $r1, killed renamable $r3, 0, 14, $noreg
+    tBX_RET 14, $noreg, implicit $r0
+
+---
+name:            fn3
+alignment:       1
+tracksRegLiveness: true
+
+# Here, the true and false blocks both end in a tBX_RET instruction. One of
+# these will be removed, saving 2 bytes, and the remaining one isn't
+# conditional, so doesn't push us over the limit of 4 instructions in an IT
+# block.
+
+# CHECK-LABEL: name:            fn3
+# CHECK:      t2CMPri
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRSHi12
+# CHECK-NEXT: tBX_RET
+
+# DEBUG-LABEL: Ifcvt: function ({{[0-9]+}}) 'fn3'
+# DEBUG: MeetIfcvtSizeLimit(BranchBytes=2, CommonBytes=2, NumPredicatedInstructions=4, ExtraPredicateBytes=2)
+
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    t2CMPri killed renamable $r2, 5, 14, $noreg, implicit-def $cpsr
+    t2Bcc %bb.2, 11, killed $cpsr
+
+  bb.1.if.then:
+    liveins: $r0, $r3
+
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    tBX_RET 14, $noreg, implicit $r0
+
+  bb.2.if.else:
+    liveins: $r1, $r3
+
+    renamable $r0 = t2LDRi12 killed renamable $r1, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRSHi12 killed renamable $r0, 0, 14, $noreg
+    tBX_RET 14, $noreg, implicit $r0
+
+---
+name:            fn4
+alignment:       1
+tracksRegLiveness: true
+
+# This is the same as fn2, but compiled for ARM, which doesn't need IT
+# instructions, so if-conversion is profitable.
+
+# CHECK-LABEL: name:            fn4
+# CHECK:      CMPri
+# CHECK-NEXT: LDRi12
+# CHECK-NEXT: LDRi12
+# CHECK-NEXT: LDRSH
+# CHECK-NEXT: LDRi12
+# CHECK-NEXT: LDRi12
+# CHECK-NEXT: MOVi
+
+# DEBUG-LABEL: Ifcvt: function ({{[0-9]+}}) 'fn4'
+# DEBUG: MeetIfcvtSizeLimit(BranchBytes=8, CommonBytes=0, NumPredicatedInstructions=5, ExtraPredicateBytes=0)
+
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    CMPri killed renamable $r2, 5, 14, $noreg, implicit-def $cpsr
+    Bcc %bb.2, 11, killed $cpsr
+
+  bb.1.if.then:
+    successors: %bb.3(0x80000000)
+    liveins: $r0, $r3
+
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    B %bb.3
+
+  bb.2.if.else:
+    successors: %bb.3(0x80000000)
+    liveins: $r1, $r3
+
+    renamable $r0 = LDRi12 killed renamable $r1, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRSH killed renamable $r0, $noreg, 0, 14, $noreg
+
+  bb.3.if.end:
+    liveins: $r0, $r3
+
+    renamable $r1 = MOVi 0, 14, $noreg, $noreg
+    STRi12 killed renamable $r1, killed renamable $r3, 0, 14, $noreg
+    BX_RET 14, $noreg, implicit $r0
+
+---
+name:            fn5
+alignment:       1
+tracksRegLiveness: true
+
+# Here, the compare and conditional branch can be turned into a CBZ, so we
+# don't want to if-convert.
+
+# CHECK-LABEL: name:            fn5
+# CHECK: t2CMPri
+# CHECK: t2Bcc
+
+# DEBUG-LABEL: Ifcvt: function ({{[0-9]+}}) 'fn5'
+# DEBUG: MeetIfcvtSizeLimit(BranchBytes=0, CommonBytes=2, NumPredicatedInstructions=4, ExtraPredicateBytes=2)
+
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x30000000), %bb.2(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    t2CMPri killed renamable $r2, 0, 14, $noreg, implicit-def $cpsr
+    t2Bcc %bb.2, 1, killed $cpsr
+
+  bb.1.if.then:
+    liveins: $r0
+
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    tBX_RET 14, $noreg, implicit $r0
+
+  bb.2.if.else:
+    liveins: $r1
+
+    renamable $r0 = t2LDRi12 killed renamable $r1, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRSHi12 killed renamable $r0, 0, 14, $noreg
+    tBX_RET 14, $noreg, implicit $r0
+
+---
+name:            fn6
+alignment:       1
+tracksRegLiveness: true
+
+# This is a forked-diamond pattern, we recognise that the conditional branches
+# at the ends of the true and false blocks are the same, and can be shared.
+
+# CHECK-LABEL: name:            fn6
+# CHECK:      t2CMPri
+# CHECK-NEXT: t2LDRSHi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2CMPri
+# CHECK-NEXT: t2Bcc
+
+# DEBUG-LABEL: Ifcvt: function ({{[0-9]+}}) 'fn6'
+# DEBUG: MeetIfcvtSizeLimit(BranchBytes=2, CommonBytes=12, NumPredicatedInstructions=4, ExtraPredicateBytes=2)
+
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x30000000), %bb.2(0x50000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    t2CMPri killed renamable $r2, 4, 14, $noreg, implicit-def $cpsr
+    t2Bcc %bb.2, 1, killed $cpsr
+
+  bb.1.if.then:
+    successors: %bb.3(0x30000000), %bb.4(0x50000000)
+    liveins: $r0, $r3
+
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    t2CMPri renamable $r0, 0, 14, $noreg, implicit-def $cpsr
+    t2Bcc %bb.3.if2.then, 1, killed $cpsr
+    t2B %bb.4.if2.else, 14, $noreg
+
+  bb.2.if.else:
+    successors: %bb.3(0x30000000), %bb.4(0x50000000)
+    liveins: $r0, $r1, $r3
+
+    renamable $r0 = t2LDRSHi12 killed renamable $r0, 0, 14, $noreg
+    t2CMPri renamable $r0, 0, 14, $noreg, implicit-def $cpsr
+    t2Bcc %bb.3.if2.then, 1, killed $cpsr
+    t2B %bb.4.if2.else, 14, $noreg
+
+  bb.3.if2.then:
+    liveins: $r0, $r1, $r3
+
+    t2STRi12 killed renamable $r1, killed renamable $r3, 0, 14, $noreg
+    tBX_RET 14, $noreg, implicit $r0
+
+  bb.4.if2.else:
+    liveins: $r0
+
+    tBX_RET 14, $noreg, implicit $r0
+
+---
+name:            fn7
+alignment:       1
+tracksRegLiveness: true
+
+# When compiling for ARM, it would be good for code size to generate very long
+# runs of conditional instructions, but we put an (arbitrary) limit on this to
+# avoid generating code which is very bad for performance, and only saves a few
+# bytes of code size.
+
+# CHECK-LABEL: name:            fn7
+# CHECK:      CMPri
+# CHECK-NEXT: Bcc
+
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    CMPri killed renamable $r2, 5, 14, $noreg, implicit-def $cpsr
+    Bcc %bb.2, 11, killed $cpsr
+
+  bb.1.if.then:
+    successors: %bb.3(0x80000000)
+    liveins: $r0, $r3
+
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    B %bb.3
+
+  bb.2.if.else:
+    successors: %bb.3(0x80000000)
+    liveins: $r1, $r3
+
+    renamable $r0 = LDRi12 killed renamable $r1, 0, 14, $noreg
+    renamable $r0 = LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = LDRSH killed renamable $r0, $noreg, 0, 14, $noreg
+
+  bb.3.if.end:
+    liveins: $r0, $r3
+
+    renamable $r1 = MOVi 0, 14, $noreg, $noreg
+    STRi12 killed renamable $r1, killed renamable $r3, 0, 14, $noreg
+    BX_RET 14, $noreg, implicit $r0
+
+---
+name:            fn8
+alignment:       1
+tracksRegLiveness: true
+
+# The first t2LDRi12 instruction in each branch is the same, so one copy of it
+# will be removed, and it doesn't need to be predicated, keeping us under the 4
+# instruction IT block limit.
+
+# CHECK-LABEL: name:            fn8
+# CHECK:      t2CMPri
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRi12
+# CHECK-NEXT: t2LDRSHi12
+# CHECK-NEXT: t2MOVi
+
+# DEBUG-LABEL: Ifcvt: function ({{[0-9]+}}) 'fn8'
+# DEBUG: MeetIfcvtSizeLimit(BranchBytes=4, CommonBytes=4, NumPredicatedInstructions=4, ExtraPredicateBytes=2)
+
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $r0, $r1, $r2, $r3
+
+    t2CMPri killed renamable $r2, 5, 14, $noreg, implicit-def $cpsr
+    t2Bcc %bb.2, 11, killed $cpsr
+
+  bb.1.if.then:
+    successors: %bb.3(0x80000000)
+    liveins: $r0, $r3
+
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 4, 14, $noreg
+    t2B %bb.3, 14, $noreg
+
+  bb.2.if.else:
+    successors: %bb.3(0x80000000)
+    liveins: $r0, $r3
+
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRi12 killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRSHi12 killed renamable $r0, 0, 14, $noreg
+
+  bb.3.if.end:
+    liveins: $r0, $r3
+
+    renamable $r1 = t2MOVi 0, 14, $noreg, $noreg
+    t2STRi12 killed renamable $r1, killed renamable $r3, 0, 14, $noreg
+    tBX_RET 14, $noreg, implicit $r0
+
+---
+name:            fn9
+alignment:       2
+tracksRegLiveness: true
+
+# The INLINEASM_BR instructions aren't analyzable, but they are identical so we
+# can still do diamond if-conversion. From a code-size POV, they are common
+# instructions, so one will be removed, and they don't need an IT block slot.
+
+# CHECK-LABEL: name:            fn9
+# CHECK:      tCMPi8
+# CHECK-NEXT: tLDRi
+# CHECK-NEXT: tLDRi
+# CHECK-NEXT: tLDRi
+# CHECK-NEXT: t2LDRSHi12
+# CHECK-NEXT: INLINEASM_BR
+
+# DEBUG-LABEL: Ifcvt: function ({{[0-9]+}}) 'fn9'
+# DEBUG: MeetIfcvtSizeLimit(BranchBytes=2, CommonBytes=6, NumPredicatedInstructions=4, ExtraPredicateBytes=2)
+
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x30000000), %bb.3(0x50000000)
+    liveins: $r0, $r1, $r2
+
+    tCMPi8 killed renamable $r2, 42, 14, $noreg, implicit-def $cpsr
+    t2Bcc %bb.3, 1, killed $cpsr
+
+  bb.1.if.then:
+    successors:  %bb.5(0x7fffffff)
+    liveins: $r0
+
+    renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg
+    INLINEASM_BR &"b ${0:l}", 1, 13, blockaddress(@fn9, %ir-block.lab1)
+
+  bb.3.if.else:
+    successors: %bb.5(0x7fffffff)
+    liveins: $r1
+
+    renamable $r0 = tLDRi killed renamable $r1, 0, 14, $noreg
+    renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg
+    renamable $r0 = t2LDRSHi12 killed renamable $r0, 0, 14, $noreg
+    INLINEASM_BR &"b ${0:l}", 1, 13, blockaddress(@fn9, %ir-block.lab1)
+
+  bb.5.lab1 (address-taken):
+    liveins: $r0
+
+    renamable $r0, dead $cpsr = nsw tADDi8 killed renamable $r0, 5, 14, $noreg
+    tBX_RET 14, $noreg, implicit $r0
+...