llvm-project/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp

//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass identifies loops where we can generate the PPC branch instructions
// that decrement and test the count register (CTR) (bdnz and friends).
//
// The pattern that defines the induction variable can change depending on
// prior optimizations.  For example, the IndVarSimplify phase run by 'opt'
// normalizes induction variables, and the Loop Strength Reduction pass
// run by 'llc' may also make changes to the induction variable.
//
// Criteria for CTR loops:
//  - Countable loops (w/ ind. var for a trip count)
//  - Try inner-most loops first
//  - No nested CTR loops.
//  - No function calls in loops.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "ctrloops"

#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "PPCTargetMachine.h"
#include "PPC.h"

#include <algorithm>
#include <vector>

using namespace llvm;

#ifndef NDEBUG
// Debugging aid, only present in builds without NDEBUG. When set to a
// non-negative value, convertToCTRLoop stops considering further loops once
// that many loops have reached its limit check. The default of -1 means
// "no limit".
static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
#endif

// Incremented once for every loop that convertToCTRLoop rewrites.
STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");

// Forward declaration of the pass-registration hook invoked from the
// PPCCTRLoops constructors.
namespace llvm {
  void initializePPCCTRLoopsPass(PassRegistry&);
}

namespace {
  struct PPCCTRLoops : public FunctionPass {

#ifndef NDEBUG
    static int Counter;
#endif

  public:
    static char ID;

    PPCCTRLoops() : FunctionPass(ID), TM(0) {
      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
    }
    PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
    }

    virtual bool runOnFunction(Function &F);

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequired<LoopInfo>();
      AU.addPreserved<LoopInfo>();
      AU.addRequired<DominatorTree>();
      AU.addPreserved<DominatorTree>();
      AU.addRequired<ScalarEvolution>();
    }

  private:
    // FIXME: Copied from LoopSimplify.
    BasicBlock *InsertPreheaderForLoop(Loop *L);
    void PlaceSplitBlockCarefully(BasicBlock *NewBB,
                                  SmallVectorImpl<BasicBlock*> &SplitPreds,
                                  Loop *L);

    bool convertToCTRLoop(Loop *L);
  private:
    PPCTargetMachine *TM;
    LoopInfo *LI;
    ScalarEvolution *SE;
    DataLayout *TD;
    DominatorTree *DT;
    const TargetLibraryInfo *LibInfo;
  };

  char PPCCTRLoops::ID = 0;
#ifndef NDEBUG
  int PPCCTRLoops::Counter = 0;
#endif
} // end anonymous namespace

// Pass registration for PPCCTRLoops under the name "ppc-ctr-loops". The
// dependencies listed here are the same three analyses that
// PPCCTRLoops::getAnalysisUsage marks as required.
INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
                    false, false)

/// Allocate a new PPCCTRLoops pass bound to the target machine \p TM and hand
/// it back through the generic FunctionPass interface.
FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) {
  FunctionPass *CTRLoopsPass = new PPCCTRLoops(TM);
  return CTRLoopsPass;
}

/// Pass entry point: cache the analyses for this function, then offer every
/// top-level loop to convertToCTRLoop (which recurses into sub-loops itself).
/// Returns true if any loop reported a change.
bool PPCCTRLoops::runOnFunction(Function &F) {
  // Required analyses.
  LI = &getAnalysis<LoopInfo>();
  SE = &getAnalysis<ScalarEvolution>();
  DT = &getAnalysis<DominatorTree>();
  // Optional analyses; either of these may be null.
  TD = getAnalysisIfAvailable<DataLayout>();
  LibInfo = getAnalysisIfAvailable<TargetLibraryInfo>();

  bool Changed = false;

  for (LoopInfo::iterator LoopIt = LI->begin(), LoopEnd = LI->end();
       LoopIt != LoopEnd; ++LoopIt) {
    Loop *Candidate = *LoopIt;

    // Only start from loops that have no parent.
    if (Candidate->getParentLoop())
      continue;

    Changed |= convertToCTRLoop(Candidate);
  }

  return Changed;
}

/// Try to convert \p L into a loop driven by the PowerPC count register.
///
/// Sub-loops are processed first; if any of them was changed this loop is
/// left alone. Otherwise the loop body is scanned for instructions that are
/// treated as incompatible with holding a value in CTR (calls, ppc_fp128
/// arithmetic, certain int<->fp conversions, indirect branches, invokes and
/// switches that may become jump tables). If the scan passes, an exiting
/// block is selected that
///   - has a computable, loop-invariant, non-constant-zero exit count whose
///     type is no wider than the count register,
///   - dominates every in-loop predecessor of the header, and
///   - ends in a conditional branch.
/// The exit count plus one is then materialised in the preheader and passed
/// to ppc_mtctr, and the selected branch is rewritten to test
/// ppc_is_decremented_ctr_nonzero.
///
/// \returns true if the IR was modified (by this loop or by a sub-loop).
bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
  bool MadeChange = false;

  Triple TT = Triple(L->getHeader()->getParent()->getParent()->
                     getTargetTriple());
  if (!TT.isArch32Bit() && !TT.isArch64Bit())
    return MadeChange; // Unknown arch. type.

  // Process nested loops first.
  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
    MadeChange |= convertToCTRLoop(*I);
  }

  // If a nested loop has been converted, then we can't convert this loop.
  if (MadeChange)
    return MadeChange;

#ifndef NDEBUG
  // Stop trying after reaching the limit (if any).
  int Limit = CTRLoopLimit;
  if (Limit >= 0) {
    if (Counter >= Limit)
      return false;
    Counter++;
  }
#endif

  // We don't want to spill/restore the counter register, and so we don't
  // want to use the counter register if the loop contains calls.
  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
       I != IE; ++I) {
    for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end();
         J != JE; ++J) {
      if (CallInst *CI = dyn_cast<CallInst>(J)) {
        // Without a target machine we cannot tell which calls are lowered
        // inline, so any call disqualifies the loop.
        if (!TM)
          return MadeChange;
        const TargetLowering *TLI = TM->getTargetLowering();

        if (Function *F = CI->getCalledFunction()) {
          // Most intrinsics don't become function calls, but some might.
          // sin, cos, exp and log are always calls.
          //
          // NOTE(review): the opcode selected for an intrinsic below is only
          // consumed inside the library-function block further down. If that
          // block is not entered for an intrinsic callee, control reaches the
          // conservative return after it and the loop is rejected -- confirm
          // whether that is intended.
          unsigned Opcode = 0;
          if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
            switch (F->getIntrinsicID()) {
            default: continue;

// VisualStudio defines setjmp as _setjmp
#if defined(_MSC_VER) && defined(setjmp) && \
                         !defined(setjmp_undefined_for_msvc)
#  pragma push_macro("setjmp")
#  undef setjmp
#  define setjmp_undefined_for_msvc
#endif

            case Intrinsic::setjmp:

#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
   // let's return it to _setjmp state
#  pragma pop_macro("setjmp")
#  undef setjmp_undefined_for_msvc
#endif

            case Intrinsic::longjmp:
            case Intrinsic::memcpy:
            case Intrinsic::memmove:
            case Intrinsic::memset:
            case Intrinsic::powi:
            case Intrinsic::log:
            case Intrinsic::log2:
            case Intrinsic::log10:
            case Intrinsic::exp:
            case Intrinsic::exp2:
            case Intrinsic::pow:
            case Intrinsic::sin:
            case Intrinsic::cos:
              return MadeChange;
            case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
            case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
            case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
            case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
            case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
            case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
            }
          }

          // PowerPC does not use [US]DIVREM or other library calls for
          // operations on regular types which are not otherwise library calls
          // (i.e. soft float or atomics). If adapting for targets that do,
          // additional care is required here.

          LibFunc::Func Func;
          if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
              LibInfo->getLibFunc(F->getName(), Func) &&
              LibInfo->hasOptimizedCodeGen(Func)) {
            // Non-read-only functions are never treated as intrinsics.
            if (!CI->onlyReadsMemory())
              return MadeChange;

            // Conversion happens only for FP calls.
            if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
              return MadeChange;

            switch (Func) {
            default: return MadeChange;
            case LibFunc::copysign:
            case LibFunc::copysignf:
            case LibFunc::copysignl:
              continue; // ISD::FCOPYSIGN is never a library call.
            case LibFunc::fabs:
            case LibFunc::fabsf:
            case LibFunc::fabsl:
              continue; // ISD::FABS is never a library call.
            case LibFunc::sqrt:
            case LibFunc::sqrtf:
            case LibFunc::sqrtl:
              Opcode = ISD::FSQRT; break;
            case LibFunc::floor:
            case LibFunc::floorf:
            case LibFunc::floorl:
              Opcode = ISD::FFLOOR; break;
            case LibFunc::nearbyint:
            case LibFunc::nearbyintf:
            case LibFunc::nearbyintl:
              Opcode = ISD::FNEARBYINT; break;
            case LibFunc::ceil:
            case LibFunc::ceilf:
            case LibFunc::ceill:
              Opcode = ISD::FCEIL; break;
            case LibFunc::rint:
            case LibFunc::rintf:
            case LibFunc::rintl:
              Opcode = ISD::FRINT; break;
            case LibFunc::trunc:
            case LibFunc::truncf:
            case LibFunc::truncl:
              Opcode = ISD::FTRUNC; break;
            }

            // The call is acceptable only if the target reports the
            // operation as legal or custom for the argument type (or, for
            // vectors, for its scalar element type).
            MVT VTy =
              TLI->getSimpleValueType(CI->getArgOperand(0)->getType(), true);
            if (VTy == MVT::Other)
              return MadeChange;

            if (TLI->isOperationLegalOrCustom(Opcode, VTy))
              continue;
            else if (VTy.isVector() &&
                     TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType()))
              continue;

            return MadeChange;
          }
        }

        // Indirect calls and every callee not accepted above.
        return MadeChange;
      } else if (isa<BinaryOperator>(J) &&
                 J->getType()->getScalarType()->isPPC_FP128Ty()) {
        // Most operations on ppc_f128 values become calls.
        return MadeChange;
      } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
                 isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
        // Reject int<->fp conversions that involve ppc_fp128, and on 32-bit
        // targets also those that involve 64-bit integers.
        CastInst *CI = cast<CastInst>(J);
        if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
            CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
            (TT.isArch32Bit() &&
             (CI->getSrcTy()->getScalarType()->isIntegerTy(64) ||
              CI->getDestTy()->getScalarType()->isIntegerTy(64))
            ))
          return MadeChange;
      } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
        // On PowerPC, indirect jumps use the counter register.
        return MadeChange;
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
        if (!TM)
          return MadeChange;
        const TargetLowering *TLI = TM->getTargetLowering();

        // A switch with enough cases to meet the target's jump-table
        // threshold disqualifies the loop.
        if (TLI->supportJumpTables() &&
            SI->getNumCases()+1 >= (unsigned) TLI->getMinimumJumpTableEntries())
          return MadeChange;
      }
    }
  }

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // Width, in bits, of the integer type used for the count register value
  // (matches CountType chosen below).
  const unsigned CountBits = TT.isArch64Bit() ? 64 : 32;

  BasicBlock *CountedExitBlock = 0;
  const SCEV *ExitCount = 0;
  BranchInst *CountedExitBranch = 0;
  for (SmallVector<BasicBlock*, 4>::iterator I = ExitingBlocks.begin(),
       IE = ExitingBlocks.end(); I != IE; ++I) {
    const SCEV *EC = SE->getExitCount(L, *I);
    DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
                    (*I)->getName() << ": " << *EC << "\n");
    if (isa<SCEVCouldNotCompute>(EC))
      continue;
    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
      if (ConstEC->getValue()->isZero())
        continue;
    } else if (!SE->isLoopInvariant(EC, L))
      continue;

    // An exit count wider than the count register (e.g. an i64 count on a
    // 32-bit target) cannot be used: the code below only ever zero-extends
    // the count to CountType and has no way to narrow it.
    if (SE->getTypeSizeInBits(EC->getType()) > CountBits)
      continue;

    // We now have a loop-invariant count of loop iterations (which is not the
    // constant zero) for which we know that this loop will not exit via this
    // exiting block.

    // We need to make sure that this block will run on every loop iteration.
    // For this to be true, we must dominate all blocks with backedges. Such
    // blocks are in-loop predecessors to the header block.
    bool NotAlways = false;
    for (pred_iterator PI = pred_begin(L->getHeader()),
         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
      if (!L->contains(*PI))
        continue;

      if (!DT->dominates(*I, *PI)) {
        NotAlways = true;
        break;
      }
    }

    if (NotAlways)
      continue;

    // Make sure this blocks ends with a conditional branch.
    Instruction *TI = (*I)->getTerminator();
    if (!TI)
      continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (!BI->isConditional())
        continue;

      CountedExitBranch = BI;
    } else
      continue;

    // Note that this block may not be the loop latch block, even if the loop
    // has a latch block.
    CountedExitBlock = *I;
    ExitCount = EC;
    break;
  }

  if (!CountedExitBlock)
    return MadeChange;

  // The count must be set up in a preheader; create one if the loop has none.
  BasicBlock *Preheader = L->getLoopPreheader();
  if (!Preheader)
    Preheader = InsertPreheaderForLoop(L);
  if (!Preheader)
    return MadeChange;

  DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");

  // Insert the count into the preheader and replace the condition used by the
  // selected branch.
  MadeChange = true;

  SCEVExpander SCEVE(*SE, "loopcnt");
  LLVMContext &C = SE->getContext();
  Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) :
                                       Type::getInt32Ty(C);
  if (!ExitCount->getType()->isPointerTy() &&
      ExitCount->getType() != CountType)
    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
  // The value placed in CTR is the exit count plus one.
  ExitCount = SE->getAddExpr(ExitCount,
                             SE->getConstant(CountType, 1));
  Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType,
                                       Preheader->getTerminator());

  IRBuilder<> CountBuilder(Preheader->getTerminator());
  Module *M = Preheader->getParent()->getParent();
  Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr,
                                               CountType);
  CountBuilder.CreateCall(MTCTRFunc, ECValue);

  IRBuilder<> CondBuilder(CountedExitBranch);
  Value *DecFunc =
    Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
  Value *NewCond = CondBuilder.CreateCall(DecFunc);
  Value *OldCond = CountedExitBranch->getCondition();
  CountedExitBranch->setCondition(NewCond);

  // The false branch must exit the loop.
  if (!L->contains(CountedExitBranch->getSuccessor(0)))
    CountedExitBranch->swapSuccessors();

  // The old condition may be dead now, and may have even created a dead PHI
  // (the original induction variable).
  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
  DeleteDeadPHIs(CountedExitBlock);

  ++NumCTRLoops;
  return MadeChange;
}

// FIXME: Copied from LoopSimplify.
/// Give \p L a dedicated preheader by splitting all edges that enter the
/// header from outside the loop into a fresh block.
///
/// \returns the new preheader, or null when one of the outside predecessors
/// ends in an indirect branch (such edges cannot be split).
BasicBlock *PPCCTRLoops::InsertPreheaderForLoop(Loop *L) {
  BasicBlock *LoopHeader = L->getHeader();

  // Gather the header's predecessors that are not part of the loop.
  SmallVector<BasicBlock*, 8> EntryPreds;
  for (pred_iterator PredIt = pred_begin(LoopHeader),
       PredEnd = pred_end(LoopHeader); PredIt != PredEnd; ++PredIt) {
    BasicBlock *Pred = *PredIt;

    // Back edges and other in-loop predecessors are left untouched.
    if (L->contains(Pred))
      continue;

    // An entry through an indirect branch prohibits edge splitting, so the
    // loop cannot be given a preheader.
    if (isa<IndirectBrInst>(Pred->getTerminator()))
      return 0;

    EntryPreds.push_back(Pred);
  }

  // Perform the split; landing-pad headers need the dedicated utility.
  BasicBlock *NewPreheader;
  if (LoopHeader->isLandingPad()) {
    SmallVector<BasicBlock*, 2> SplitBlocks;
    SplitLandingPadPredecessors(LoopHeader, EntryPreds, ".preheader",
                                ".split-lp", this, SplitBlocks);
    NewPreheader = SplitBlocks[0];
  } else {
    NewPreheader = SplitBlockPredecessors(LoopHeader, EntryPreds,
                                          ".preheader", this);
  }

  // The new branch inherits the debug location of the header's first
  // non-PHI instruction.
  Instruction *PreheaderBranch = NewPreheader->getTerminator();
  PreheaderBranch->setDebugLoc(LoopHeader->getFirstNonPHI()->getDebugLoc());
  DEBUG(dbgs() << "Creating pre-header "
               << NewPreheader->getName() << "\n");

  // Move the new block to a sensible position in the function layout.
  PlaceSplitBlockCarefully(NewPreheader, EntryPreds, L);

  return NewPreheader;
}

/// Choose a layout position for the freshly split block \p NewBB.
///
/// If the block laid out immediately before \p NewBB is already one of
/// \p SplitPreds, nothing is moved. Otherwise \p NewBB is moved directly after
/// one of the split predecessors, preferring the first one whose layout
/// successor belongs to \p L and falling back to SplitPreds[0].
void PPCCTRLoops::PlaceSplitBlockCarefully(BasicBlock *NewBB,
                                       SmallVectorImpl<BasicBlock*> &SplitPreds,
                                            Loop *L) {
  // Already well placed if the preceding block is one of the predecessors.
  Function::iterator PrevIt = NewBB;
  --PrevIt;
  BasicBlock *PrevBB = &*PrevIt;
  if (std::find(SplitPreds.begin(), SplitPreds.end(), PrevBB) !=
      SplitPreds.end())
    return;

  // Otherwise move it after an outside block, which turns that block's
  // unconditional branch into a fall-through. Prefer an outside block that
  // is laid out right before a block of the loop.
  Function *ParentFn = NewBB->getParent();
  BasicBlock *Anchor = 0;
  for (SmallVectorImpl<BasicBlock*>::iterator PredIt = SplitPreds.begin(),
       PredEnd = SplitPreds.end(); PredIt != PredEnd; ++PredIt) {
    Function::iterator NextIt = *PredIt;
    ++NextIt;
    if (NextIt == ParentFn->end())
      continue;
    if (!L->contains(NextIt))
      continue;

    Anchor = *PredIt;
    break;
  }

  // No preferred candidate: settle for the first predecessor, which is
  // likely still better than leaving the block inside the loop.
  if (!Anchor)
    Anchor = SplitPreds[0];

  NewBB->moveAfter(Anchor);
}