llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp

//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains an optimization for div and rem on architectures that
// execute short instructions significantly faster than longer instructions.
// For example, on Intel Atom, 32-bit divides are slow enough that it is
// profitable to check the operand values at run time and, if both are positive
// and less than 256, to use an unsigned 8-bit divide instead.
//
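//
// As a rough before/after sketch (illustrative IR only; value and block names
// are made up, and the unused quotient or remainder is cleaned up afterwards),
// a 32-bit udiv bypassed down to 8 bits becomes:
//
//     %or   = or i32 %dividend, %divisor
//     %hi   = and i32 %or, -256            ; any bits above the low 8 set?
//     %fits = icmp eq i32 %hi, 0
//     br i1 %fits, label %fast, label %slow
//   fast:
//     %d8 = trunc i32 %dividend to i8
//     %v8 = trunc i32 %divisor to i8
//     %q8 = udiv i8 %d8, %v8               ; urem is emitted alongside
//     %qf = zext i8 %q8 to i32
//     br label %join
//   slow:
//     %qs = udiv i32 %dividend, %divisor
//     br label %join
//   join:
//     %q = phi i32 [ %qf, %fast ], [ %qs, %slow ]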
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>
#include <cstdint>
using namespace llvm;

#define DEBUG_TYPE "bypass-slow-division"

namespace {

struct QuotRemPair {
Value *Quotient;
Value *Remainder;
QuotRemPair(Value *InQuotient, Value *InRemainder)
: Quotient(InQuotient), Remainder(InRemainder) {}
};
/// A quotient and remainder, plus a BB from which they logically "originate".
/// If you use Quotient or Remainder in a Phi node, you should use BB as its
/// corresponding predecessor.
struct QuotRemWithBB {
BasicBlock *BB = nullptr;
Value *Quotient = nullptr;
Value *Remainder = nullptr;
};
using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
using BypassWidthsTy = DenseMap<unsigned, unsigned>;
using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
enum ValueRange {
/// Operand definitely fits into BypassType. No runtime checks are needed.
VALRNG_KNOWN_SHORT,
/// A runtime check is required, as value range is unknown.
VALRNG_UNKNOWN,
/// Operand is unlikely to fit into BypassType. The bypassing should be
/// disabled.
VALRNG_LIKELY_LONG
};
class FastDivInsertionTask {
bool IsValidTask = false;
Instruction *SlowDivOrRem = nullptr;
IntegerType *BypassType = nullptr;
BasicBlock *MainBB = nullptr;
bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
QuotRemWithBB createSlowBB(BasicBlock *Successor);
QuotRemWithBB createFastBB(BasicBlock *Successor);
QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
BasicBlock *PhiBB);
Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
Optional<QuotRemPair> insertFastDivAndRem();
bool isSignedOp() {
return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
SlowDivOrRem->getOpcode() == Instruction::SRem;
}
bool isDivisionOp() {
return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
SlowDivOrRem->getOpcode() == Instruction::UDiv;
}
Type *getSlowType() { return SlowDivOrRem->getType(); }
public:
FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
Value *getReplacement(DivCacheTy &Cache);
};
} // end anonymous namespace

FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
const BypassWidthsTy &BypassWidths) {
switch (I->getOpcode()) {
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
SlowDivOrRem = I;
break;
default:
// I is not a div/rem operation.
return;
}
// Skip division on vector types. Only optimize integer instructions.
IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
if (!SlowType)
return;
// Skip if this bitwidth is not bypassed.
auto BI = BypassWidths.find(SlowType->getBitWidth());
if (BI == BypassWidths.end())
return;
// Get type for div/rem instruction with bypass bitwidth.
IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
BypassType = BT;
// The original basic block.
MainBB = I->getParent();
// The instruction is indeed a slow div or rem operation.
IsValidTask = true;
}

/// Reuses a previously-computed quotient or remainder from the current BB if
/// the operands and operation are identical. Otherwise calls
/// insertFastDivAndRem to perform the optimization and caches the resulting
/// quotient and remainder. If no replacement can be generated, nullptr is
/// returned.
Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
// First, make sure that the task is valid.
if (!IsValidTask)
return nullptr;
// Then, look for a value in Cache.
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
auto CacheI = Cache.find(Key);
if (CacheI == Cache.end()) {
// If previous instance does not exist, try to insert fast div.
Optional<QuotRemPair> OptResult = insertFastDivAndRem();
// Bail out if insertFastDivAndRem has failed.
if (!OptResult)
return nullptr;
CacheI = Cache.insert({Key, *OptResult}).first;
}
QuotRemPair &Value = CacheI->second;
return isDivisionOp() ? Value.Quotient : Value.Remainder;
}

/// Check if a value looks like a hash.
///
/// The routine is expected to detect values computed using the most common hash
/// algorithms. Typically, hash computations end with one of the following
/// instructions:
///
/// 1) MUL with a constant wider than BypassType
/// 2) XOR instruction
///
/// And even if we are wrong and the value is not a hash, it is still quite
/// unlikely that such values will fit into BypassType.
///
/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
/// It is implemented as a depth-first search for values that look neither long
/// nor hash-like.
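///
/// As a hedged illustration (not code from this file), a string hash in the
/// spirit of FNV-1a feeds the eventual remainder operation through a chain of
/// XORs and multiplies by a constant much wider than any bypass width:
///
///   uint64_t H = 14695981039346656037ULL;   // FNV offset basis
///   for (unsigned char C : Buf) {
///     H ^= C;                               // XOR step
///     H *= 1099511628211ULL;                // 64-bit FNV prime (41 bits)
///   }
///   Bucket = H % NumBuckets;                // division we decline to bypass
///
/// The PHI feeding H sees only XORs and wide multiplies, so this routine
/// classifies it as hash-like and the bypass is disabled.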
bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I)
return false;
switch (I->getOpcode()) {
case Instruction::Xor:
return true;
case Instruction::Mul: {
// After Constant Hoisting pass, long constants may be represented as
// bitcast instructions. As a result, some constants may look like an
// instruction at first, and an additional check is necessary to find out if
// an operand is actually a constant.
Value *Op1 = I->getOperand(1);
ConstantInt *C = dyn_cast<ConstantInt>(Op1);
if (!C && isa<BitCastInst>(Op1))
C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
}
case Instruction::PHI:
// Stop IR traversal on pathological input; this bounds the recursion
// depth.
if (Visited.size() >= 16)
return false;
// Do not visit nodes that have been visited already. We return true because
// it means that we couldn't find any value that doesn't look hash-like.
if (Visited.find(I) != Visited.end())
return true;
Visited.insert(I);
return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
// Ignore undef values as they probably don't affect the division
// operands.
return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
isa<UndefValue>(V);
});
default:
return false;
}
}

/// Check if an integer value fits into our bypass type.
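/// For instance, a value produced by 'zext i8 %x to i32' has at least 24 known
/// leading zero bits, so with an 8-bit BypassType it is classified as
/// VALRNG_KNOWN_SHORT and needs no runtime check.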
ValueRange FastDivInsertionTask::getValueRange(Value *V,
VisitedSetTy &Visited) {
unsigned ShortLen = BypassType->getBitWidth();
unsigned LongLen = V->getType()->getIntegerBitWidth();
assert(LongLen > ShortLen && "Value type must be wider than BypassType");
unsigned HiBits = LongLen - ShortLen;
const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
KnownBits Known(LongLen);
computeKnownBits(V, Known, DL);
if (Known.countMinLeadingZeros() >= HiBits)
return VALRNG_KNOWN_SHORT;
if (Known.countMaxLeadingZeros() < HiBits)
return VALRNG_LIKELY_LONG;
// Long integer divisions are often used in hashtable implementations. It's
// not worth bypassing such divisions because hash values are extremely
// unlikely to have enough leading zeros. The call below tries to detect
// values that are unlikely to fit BypassType (including hashes).
if (isHashLikeValue(V, Visited))
return VALRNG_LIKELY_LONG;
return VALRNG_UNKNOWN;
}

/// Add a new basic block for the slow div and rem operations and insert it
/// before SuccessorBB.
QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
QuotRemWithBB DivRemPair;
DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
MainBB->getParent(), SuccessorBB);
IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
if (isSignedOp()) {
DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
} else {
DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
}
Builder.CreateBr(SuccessorBB);
return DivRemPair;
}

/// Add a new basic block for the fast div and rem operations and insert it
/// before SuccessorBB.
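/// For a 32-bit operation bypassed to 8 bits, the block built here contains,
/// roughly (names are illustrative):
///
///   %d8 = trunc i32 %dividend to i8
///   %v8 = trunc i32 %divisor to i8
///   %q8 = udiv i8 %d8, %v8
///   %r8 = urem i8 %d8, %v8
///   %q  = zext i8 %q8 to i32
///   %r  = zext i8 %r8 to i32
///   br label %successor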
QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
QuotRemWithBB DivRemPair;
DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
MainBB->getParent(), SuccessorBB);
IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
Value *ShortDivisorV =
Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
Value *ShortDividendV =
Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
// udiv/urem because this optimization only handles positive numbers.
Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
DivRemPair.Quotient =
Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
DivRemPair.Remainder =
Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
Builder.CreateBr(SuccessorBB);
return DivRemPair;
}

/// Creates PHI nodes in PhiBB for the div and rem results of the two blocks.
QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
QuotRemWithBB &RHS,
BasicBlock *PhiBB) {
IRBuilder<> Builder(PhiBB, PhiBB->begin());
PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
RemPhi->addIncoming(LHS.Remainder, LHS.BB);
RemPhi->addIncoming(RHS.Remainder, RHS.BB);
return QuotRemPair(QuoPhi, RemPhi);
}

/// Creates a runtime check to test whether both the divisor and dividend fit
/// into BypassType. The check is inserted at the end of MainBB. A true result
/// means that the operands fit. Either operand may be null if it does not need
/// a runtime check.
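/// For example, with a 32-bit type bypassed to 8 bits, the emitted check is
/// equivalent to '((Op1 | Op2) & 0xFFFFFF00) == 0', i.e. neither operand has
/// any bits set above the low 8.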
Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
assert((Op1 || Op2) && "Nothing to check");
IRBuilder<> Builder(MainBB, MainBB->end());
Value *OrV;
if (Op1 && Op2)
OrV = Builder.CreateOr(Op1, Op2);
else
OrV = Op1 ? Op1 : Op2;
// Invert BypassType's bit mask so that the AND below keeps only the bits
// above the bypass width; the operands fit iff none of those bits are set.
uint64_t BitMask = ~BypassType->getBitMask();
Value *AndV = Builder.CreateAnd(OrV, BitMask);
// The operands fit into BypassType iff the masked high bits are all zero.
Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
return Builder.CreateICmpEQ(AndV, ZeroV);
}

/// Substitutes the div/rem instruction with code that checks the value of the
/// operands and uses a shorter, faster div/rem instruction when possible.
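/// In outline: if both operands are provably short, the division is narrowed
/// in place with no branches; if the divisor is a constant (possibly hidden
/// behind a bitcast by Constant Hoisting), we bail out and leave it to
/// DAGCombiner; otherwise we split the block and guard a fast path with either
/// a Dividend >= Divisor test (unsigned, short dividend) or the generic
/// operand-width check.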
Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
VisitedSetTy SetL;
ValueRange DividendRange = getValueRange(Dividend, SetL);
if (DividendRange == VALRNG_LIKELY_LONG)
return None;
VisitedSetTy SetR;
ValueRange DivisorRange = getValueRange(Divisor, SetR);
if (DivisorRange == VALRNG_LIKELY_LONG)
return None;
bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
if (DividendShort && DivisorShort) {
// If both operands are known to be short then just replace the long
// division with a short one in-place. Since we're not introducing control
// flow in this case, narrowing the division is always a win, even if the
// divisor is a constant (and will later get replaced by a multiplication).
IRBuilder<> Builder(SlowDivOrRem);
Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
return QuotRemPair(ExtDiv, ExtRem);
}
if (isa<ConstantInt>(Divisor)) {
// The divisor is a compile-time constant, so DAGCombiner will convert the
// division into a multiplication by a magic constant anyway. It isn't clear
// that introducing control flow just to get a narrower multiply is worth it.
return None;
}
// After Constant Hoisting pass, long constants may be represented as
// bitcast instructions. As a result, some constants may look like an
// instruction at first, and an additional check is necessary to find out if
// an operand is actually a constant.
if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
if (BCI->getParent() == SlowDivOrRem->getParent() &&
isa<ConstantInt>(BCI->getOperand(0)))
return None;
if (DividendShort && !isSignedOp()) {
// If the division is unsigned and Dividend is known to be short, then
// either
// 1) Divisor is less than or equal to Dividend, and the result can be
//    computed with a short division, or
// 2) Divisor is greater than Dividend, in which case no division is needed
//    at all: the quotient is 0 and the remainder is equal to Dividend.
//
// So instead of checking at runtime whether Divisor fits into BypassType,
// we emit a runtime check to differentiate between these two cases. This
// lets us entirely avoid a long div.
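//
// For instance (illustrative, assuming an 8-bit BypassType): with a dividend
// known to fit in 8 bits, 37 / 100 fails the UGE check and falls through with
// quotient 0 and remainder 37, while 200 / 3 passes it and runs the 8-bit
// udiv/urem.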
// Split the basic block before the div/rem.
BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
// Remove the unconditional branch from MainBB to SuccessorBB.
MainBB->getInstList().back().eraseFromParent();
QuotRemWithBB Long;
Long.BB = MainBB;
Long.Quotient = ConstantInt::get(getSlowType(), 0);
Long.Remainder = Dividend;
QuotRemWithBB Fast = createFastBB(SuccessorBB);
QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
IRBuilder<> Builder(MainBB, MainBB->end());
Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
return Result;
} else {
// General case. Create both slow and fast div/rem pairs and choose one of
// them at runtime.
// Split the basic block before the div/rem.
BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
// Remove the unconditional branch from MainBB to SuccessorBB.
MainBB->getInstList().back().eraseFromParent();
QuotRemWithBB Fast = createFastBB(SuccessorBB);
QuotRemWithBB Slow = createSlowBB(SuccessorBB);
QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
DivisorShort ? nullptr : Divisor);
IRBuilder<> Builder(MainBB, MainBB->end());
Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
return Result;
}
}

/// This optimization identifies DIV/REM instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
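///
/// A hedged sketch of how a caller might drive this (in-tree, CodeGenPrepare
/// consults the target's bypass table; the width map below is hypothetical):
///
///   DenseMap<unsigned, unsigned> BypassWidths;
///   BypassWidths[32] = 8; // try i8 udiv/urem in place of i32 div/rem
///   bool Changed = false;
///   for (BasicBlock *BB = &F.front(); BB;) {
///     BasicBlock *Next = BB->getNextNode(); // skip blocks created below
///     Changed |= bypassSlowDivision(BB, BypassWidths);
///     BB = Next;
///   }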
bool llvm::bypassSlowDivision(BasicBlock *BB,
const BypassWidthsTy &BypassWidths) {
DivCacheTy PerBBDivCache;
bool MadeChange = false;
Instruction *Next = &*BB->begin();
while (Next != nullptr) {
// We may add instructions immediately after I, but we want to skip over
// them.
Instruction *I = Next;
Next = Next->getNextNode();
FastDivInsertionTask Task(I, BypassWidths);
if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
I->replaceAllUsesWith(Replacement);
I->eraseFromParent();
MadeChange = true;
}
}
// Above we eagerly create divs and rems, as pairs, so that we can efficiently
// create divrem machine instructions. Now erase any unused divs / rems so we
// don't leave extra instructions sitting around.
for (auto &KV : PerBBDivCache)
for (Value *V : {KV.second.Quotient, KV.second.Remainder})
RecursivelyDeleteTriviallyDeadInstructions(V);
return MadeChange;
}