[AMDGPU][CostModel] Refine cost model for control-flow instructions.

Added cost estimation for switch instruction, updated costs of branches, fixed
phi cost.
Had to increase the `-amdgpu-unroll-threshold-if` default value since the
conditional branch cost (size) was corrected to a higher value.
Test renamed to "control-flow.ll".

Removed redundant code in `X86TTIImpl::getCFInstrCost()` and
`PPCTTIImpl::getCFInstrCost()`.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D96805
This commit is contained in:
dfukalov 2021-02-16 22:20:06 +03:00
parent 4f173c0c42
commit 8f4b7e94a2
19 changed files with 139 additions and 90 deletions

View File

@ -1103,9 +1103,10 @@ public:
unsigned Index = -1) const;
/// \return The expected cost of control-flow related instructions such as
/// Phi, Ret, Br.
/// Phi, Ret, Br, Switch.
int getCFInstrCost(unsigned Opcode,
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) const;
TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
const Instruction *I = nullptr) const;
/// \returns The expected cost of compare and select instructions. If there
/// is an existing instruction that holds Opcode, it may be passed in the
@ -1573,8 +1574,8 @@ public:
const Instruction *I) = 0;
virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
VectorType *VecTy, unsigned Index) = 0;
virtual int getCFInstrCost(unsigned Opcode,
TTI::TargetCostKind CostKind) = 0;
virtual int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) = 0;
virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
@ -2040,8 +2041,9 @@ public:
unsigned Index) override {
return Impl.getExtractWithExtendCost(Opcode, Dst, VecTy, Index);
}
int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) override {
return Impl.getCFInstrCost(Opcode, CostKind);
int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) override {
return Impl.getCFInstrCost(Opcode, CostKind, I);
}
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
CmpInst::Predicate VecPred,

View File

@ -512,7 +512,8 @@ public:
return 1;
}
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) const {
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const {
// A phi would be free, unless we're costing the throughput because it
// will require a register.
if (Opcode == Instruction::PHI && CostKind != TTI::TCK_RecipThroughput)
@ -933,7 +934,8 @@ public:
case Instruction::Br:
case Instruction::Ret:
case Instruction::PHI:
return TargetTTI->getCFInstrCost(Opcode, CostKind);
case Instruction::Switch:
return TargetTTI->getCFInstrCost(Opcode, CostKind, I);
case Instruction::ExtractValue:
case Instruction::Freeze:
return TTI::TCC_Free;

View File

@ -897,8 +897,9 @@ public:
TTI::CastContextHint::None, TTI::TCK_RecipThroughput);
}
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
return BaseT::getCFInstrCost(Opcode, CostKind);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) {
return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,

View File

@ -783,8 +783,11 @@ int TargetTransformInfo::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
}
int TargetTransformInfo::getCFInstrCost(unsigned Opcode,
TTI::TargetCostKind CostKind) const {
int Cost = TTIImpl->getCFInstrCost(Opcode, CostKind);
TTI::TargetCostKind CostKind,
const Instruction *I) const {
assert((I == nullptr || I->getOpcode() == Opcode) &&
"Opcode should reflect passed instruction.");
int Cost = TTIImpl->getCFInstrCost(Opcode, CostKind, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@ -1374,6 +1377,7 @@ TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
case Instruction::ExtractValue:
case Instruction::ShuffleVector:
case Instruction::Call:
case Instruction::Switch:
return getUserCost(I, CostKind);
default:
// We don't have any information on this instruction.

View File

@ -653,7 +653,8 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
}
unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
TTI::TargetCostKind CostKind) {
TTI::TargetCostKind CostKind,
const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
return Opcode == Instruction::PHI ? 0 : 1;
assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");

View File

@ -139,7 +139,8 @@ public:
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
unsigned Index);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);

View File

@ -39,7 +39,7 @@ static cl::opt<unsigned> UnrollThresholdLocal(
static cl::opt<unsigned> UnrollThresholdIf(
"amdgpu-unroll-threshold-if",
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(150), cl::Hidden);
cl::init(200), cl::Hidden);
static cl::opt<bool> UnrollRuntimeLocal(
"amdgpu-unroll-runtime-local",
@ -106,6 +106,10 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.MaxCount = std::numeric_limits<unsigned>::max();
UP.Partial = true;
// Conditional branch in a loop back edge needs 3 additional exec
// manipulations on average.
UP.BEInsns += 3;
// TODO: Do we want runtime unrolling?
// Maximum alloca size than can fit registers. Reserve 16 registers.
@ -809,18 +813,37 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
TTI::TargetCostKind CostKind) {
if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
return Opcode == Instruction::PHI ? 0 : 1;
// XXX - For some reason this isn't called for switch.
TTI::TargetCostKind CostKind,
const Instruction *I) {
assert((I == nullptr || I->getOpcode() == Opcode) &&
"Opcode should reflect passed instruction.");
const bool SCost =
(CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
const int CBrCost = SCost ? 5 : 7;
switch (Opcode) {
case Instruction::Br:
case Instruction::Ret:
return 10;
default:
return BaseT::getCFInstrCost(Opcode, CostKind);
case Instruction::Br: {
// Branch instruction takes about 4 slots on gfx900.
auto BI = dyn_cast_or_null<BranchInst>(I);
if (BI && BI->isUnconditional())
return SCost ? 1 : 4;
// Assume a conditional branch takes 3 additional exec-manipulation
// instructions on average.
return CBrCost;
}
case Instruction::Switch: {
auto SI = dyn_cast_or_null<SwitchInst>(I);
// Each case (including the default) takes 1 cmp + 1 cbr instruction on
// average.
return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
}
case Instruction::Ret:
return SCost ? 1 : 10;
case Instruction::PHI:
// TODO: 1. A prediction phi won't be eliminated?
// 2. Estimate data copy instructions in this case.
return 1;
}
return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
@ -1292,7 +1315,8 @@ unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
TTI::TargetCostKind CostKind) {
TTI::TargetCostKind CostKind,
const Instruction *I) {
if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
return Opcode == Instruction::PHI ? 0 : 1;
@ -1302,7 +1326,7 @@ unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
case Instruction::Ret:
return 10;
default:
return BaseT::getCFInstrCost(Opcode, CostKind);
return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
}

View File

@ -163,7 +163,8 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
bool isInlineAsmSourceOfDivergence(const CallInst *CI,
ArrayRef<unsigned> Indices = {}) const;
@ -253,7 +254,8 @@ public:
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
unsigned getMaxInterleaveFactor(unsigned VF);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};

View File

@ -379,7 +379,8 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
return getIntImmCost(Imm, Ty, CostKind);
}
int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I) {
if (CostKind == TTI::TCK_RecipThroughput &&
(ST->hasNEON() || ST->hasMVEIntegerOps())) {
// FIXME: The vectorizer is highly sensistive to the cost of these
@ -388,7 +389,7 @@ int ARMTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
// vector targets.
return 0;
}
return BaseT::getCFInstrCost(Opcode, CostKind);
return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,

View File

@ -198,8 +198,8 @@ public:
bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
int getCFInstrCost(unsigned Opcode,
TTI::TargetCostKind CostKind);
int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,

View File

@ -153,7 +153,8 @@ public:
const Instruction *I = nullptr);
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) {
return 1;
}

View File

@ -1000,11 +1000,12 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
nullptr);
}
int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
return Opcode == Instruction::PHI ? 0 : 1;
// Branches are assumed to be predicted.
return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
return 0;
}
int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,

View File

@ -112,7 +112,8 @@ public:
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,

View File

@ -4076,12 +4076,13 @@ int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
unsigned
X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
unsigned X86TTIImpl::getCFInstrCost(unsigned Opcode,
TTI::TargetCostKind CostKind,
const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
return Opcode == Instruction::PHI ? 0 : 1;
// Branches are assumed to be predicted.
return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
return 0;
}
int X86TTIImpl::getGatherOverhead() const {

View File

@ -203,7 +203,8 @@ public:
int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
Type *Ty, TTI::TargetCostKind CostKind,

View File

@ -1,45 +0,0 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
; CHECK: 'test_br_cost'
; CHECK: estimated cost of 10 for instruction: br i1
; CHECK: estimated cost of 10 for instruction: br label
; CHECK: estimated cost of 10 for instruction: ret void
define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
bb0:
br i1 undef, label %bb1, label %bb2
bb1:
%vec = load i32, i32 addrspace(1)* %vaddr
%add = add i32 %vec, %b
store i32 %add, i32 addrspace(1)* %out
br label %bb2
bb2:
ret void
}
; CHECK: 'test_switch_cost'
; CHECK: estimated cost of -1 for instruction: switch
define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
entry:
switch i32 %a, label %default [
i32 0, label %case0
i32 1, label %case1
]
case0:
store volatile i32 undef, i32 addrspace(1)* undef
ret void
case1:
store volatile i32 undef, i32 addrspace(1)* undef
ret void
default:
store volatile i32 undef, i32 addrspace(1)* undef
ret void
end:
ret void
}

View File

@ -0,0 +1,52 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck --check-prefixes=ALL,SPEED %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck --check-prefixes=ALL,SIZE %s
; ALL-LABEL: 'test_br_cost'
; SPEED: estimated cost of 7 for instruction: br i1
; SPEED: estimated cost of 4 for instruction: br label
; SPEED: estimated cost of 1 for instruction: %phi = phi i32 [
; SPEED: estimated cost of 10 for instruction: ret void
; SIZE: estimated cost of 5 for instruction: br i1
; SIZE: estimated cost of 1 for instruction: br label
; SIZE: estimated cost of 1 for instruction: %phi = phi i32 [
; SIZE: estimated cost of 1 for instruction: ret void
; Exercises the AMDGPU control-flow cost model for branch, phi and ret:
; a conditional branch (bb0), an unconditional branch (bb1), a phi and a
; ret (bb2). Expected per-instruction costs are pinned by the SPEED/SIZE
; CHECK lines above.
define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
bb0:
; Conditional branch — costed higher than an unconditional branch
; (7 speed / 5 size per the CHECK lines).
br i1 undef, label %bb1, label %bb2
bb1:
%vec = load i32, i32 addrspace(1)* %vaddr
%add = add i32 %vec, %b
store i32 %add, i32 addrspace(1)* %out
; Unconditional branch — cheaper (4 speed / 1 size per the CHECK lines).
br label %bb2
bb2:
; Phi merging the two paths — costed 1 in both kinds per the CHECK lines.
%phi = phi i32 [ %b, %bb0 ], [ %add, %bb1 ]
ret void
}
; ALL-LABEL: 'test_switch_cost'
; SPEED: estimated cost of 24 for instruction: switch
; SIZE: estimated cost of 18 for instruction: switch
; Exercises the new switch cost estimation: 2 cases + default = 3 targets.
; Per the CHECK lines above this yields 3 * (cbr + cmp) = 24 (speed,
; 3 * (7 + 1)) and 18 (size, 3 * (5 + 1)) — no longer the invalid -1 the
; old model reported.
define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
entry:
switch i32 %a, label %default [
i32 0, label %case0
i32 1, label %case1
]
case0:
store volatile i32 undef, i32 addrspace(1)* undef
ret void
case1:
store volatile i32 undef, i32 addrspace(1)* undef
ret void
default:
store volatile i32 undef, i32 addrspace(1)* undef
ret void
; %end has no predecessors; it is unreachable filler, not part of the
; costed control flow.
end:
ret void
}

View File

@ -81,8 +81,7 @@ entry:
for.body: ; preds = %entry, %for.inc
%i1 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
%and = and i32 %i1, 1
%tobool = icmp eq i32 %and, 0
%tobool = icmp eq i32 %i1, 0
br i1 %tobool, label %for.inc, label %if.then
if.then: ; preds = %for.body
@ -93,7 +92,7 @@ if.then: ; preds = %for.body
for.inc: ; preds = %for.body, %if.then
%inc = add nuw nsw i32 %i1, 1
%cmp = icmp ult i32 %inc, 48
%cmp = icmp ult i32 %inc, 38
br i1 %cmp, label %for.body, label %for.end
for.end: ; preds = %for.cond

View File

@ -1,4 +1,4 @@
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -unroll-threshold=75 -unroll-peel-count=0 -unroll-allow-partial=false -unroll-max-iteration-count-to-analyze=16 < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -unroll-threshold=49 -unroll-peel-count=0 -unroll-allow-partial=false -unroll-max-iteration-count-to-analyze=16 < %s | FileCheck %s
; CHECK-LABEL: @test_func_addrspacecast_cost_noop(
; CHECK-NOT: br i1