forked from OSchip/llvm-project
[MergeICmps] MergeICmps is a new optimization pass that turns chains of integer
comparisons into memcmp. Thanks to recent improvements in the LLVM codegen, the memcmp is typically inlined as a chain of efficient hardware comparisons. This typically benefits C++ member or nonmember operator==(). For now this is disabled by default until: - https://bugs.llvm.org/show_bug.cgi?id=33329 is complete - Benchmarks show that this is always useful. Differential Revision: https://reviews.llvm.org/D33987 llvm-svn: 312315
This commit is contained in:
parent
b16598d54f
commit
9473c01e96
|
@ -256,6 +256,7 @@ void initializeMemorySSAPrinterLegacyPassPass(PassRegistry&);
|
|||
void initializeMemorySSAWrapperPassPass(PassRegistry&);
|
||||
void initializeMemorySanitizerPass(PassRegistry&);
|
||||
void initializeMergeFunctionsPass(PassRegistry&);
|
||||
void initializeMergeICmpsPass(PassRegistry&);
|
||||
void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
|
||||
void initializeMetaRenamerPass(PassRegistry&);
|
||||
void initializeModuleDebugInfoPrinterPass(PassRegistry&);
|
||||
|
|
|
@ -179,6 +179,7 @@ namespace {
|
|||
(void) llvm::createPostOrderFunctionAttrsLegacyPass();
|
||||
(void) llvm::createReversePostOrderFunctionAttrsPass();
|
||||
(void) llvm::createMergeFunctionsPass();
|
||||
(void) llvm::createMergeICmpsPass();
|
||||
std::string buf;
|
||||
llvm::raw_string_ostream os(buf);
|
||||
(void) llvm::createPrintModulePass(os);
|
||||
|
|
|
@ -420,6 +420,12 @@ Pass *createLowerAtomicPass();
|
|||
//
|
||||
Pass *createLowerGuardIntrinsicPass();
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// MergeICmps - Merge integer comparison chains
|
||||
//
|
||||
Pass *createMergeICmpsPass();
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// ValuePropagation - Propagate CFG-derived value information
|
||||
|
|
|
@ -94,6 +94,10 @@ static cl::opt<bool> EnableImplicitNullChecks(
|
|||
"enable-implicit-null-checks",
|
||||
cl::desc("Fold null checks into faulting memory operations"),
|
||||
cl::init(false));
|
||||
static cl::opt<bool> EnableMergeICmps(
|
||||
"enable-mergeicmps",
|
||||
cl::desc("Merge ICmp chains into a single memcmp"),
|
||||
cl::init(false));
|
||||
static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
|
||||
cl::desc("Print LLVM IR produced by the loop-reduce pass"));
|
||||
static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
|
||||
|
@ -591,6 +595,10 @@ void TargetPassConfig::addIRPasses() {
|
|||
addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
|
||||
}
|
||||
|
||||
if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
|
||||
addPass(createMergeICmpsPass());
|
||||
}
|
||||
|
||||
// Run GC lowering passes for builtin collectors
|
||||
// TODO: add a pass insertion point here
|
||||
addPass(createGCLoweringPass());
|
||||
|
|
|
@ -42,6 +42,7 @@ add_llvm_library(LLVMScalarOpts
|
|||
LowerExpectIntrinsic.cpp
|
||||
LowerGuardIntrinsic.cpp
|
||||
MemCpyOptimizer.cpp
|
||||
MergeICmps.cpp
|
||||
MergedLoadStoreMotion.cpp
|
||||
NaryReassociate.cpp
|
||||
NewGVN.cpp
|
||||
|
|
|
@ -0,0 +1,641 @@
|
|||
//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This pass turns chains of integer comparisons into memcmp (the memcmp is
|
||||
// later typically inlined as a chain of efficient hardware comparisons). This
|
||||
// typically benefits c++ member or nonmember operator==().
|
||||
//
|
||||
// The basic idea is to replace a larger chain of integer comparisons loaded
|
||||
// from contiguous memory locations into a smaller chain of such integer
|
||||
// comparisons. Benefits are double:
|
||||
// - There are less jumps, and therefore less opportunities for mispredictions
|
||||
// and I-cache misses.
|
||||
// - Code size is smaller, both because jumps are removed and because the
|
||||
// encoding of a 2*n byte compare is smaller than that of two n-byte
|
||||
// compares.
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/ADT/APSInt.h"
|
||||
#include "llvm/Analysis/Loads.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/IntrinsicInst.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Transforms/Scalar.h"
|
||||
#include "llvm/Transforms/Utils/BuildLibCalls.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
||||
#define DEBUG_TYPE "mergeicmps"
|
||||
|
||||
#define MERGEICMPS_DOT_ON
|
||||
|
||||
// A BCE atom.
|
||||
struct BCEAtom {
|
||||
const Value *Base() const { return GEP ? GEP->getPointerOperand() : nullptr; }
|
||||
|
||||
bool operator<(const BCEAtom &O) const {
|
||||
return Base() == O.Base() ? Offset.slt(O.Offset) : Base() < O.Base();
|
||||
}
|
||||
|
||||
GetElementPtrInst *GEP = nullptr;
|
||||
LoadInst *LoadI = nullptr;
|
||||
APInt Offset;
|
||||
};
|
||||
|
||||
// If this value is a load from a constant offset w.r.t. a base address, and
|
||||
// there are no othe rusers of the load or address, returns the base address and
|
||||
// the offset.
|
||||
BCEAtom visitICmpLoadOperand(Value *const Val) {
|
||||
BCEAtom Result;
|
||||
if (auto *const LoadI = dyn_cast<LoadInst>(Val)) {
|
||||
DEBUG(dbgs() << "load\n");
|
||||
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
|
||||
DEBUG(dbgs() << "used outside of block\n");
|
||||
return {};
|
||||
}
|
||||
if (LoadI->isVolatile()) {
|
||||
DEBUG(dbgs() << "volatile\n");
|
||||
return {};
|
||||
}
|
||||
Value *const Addr = LoadI->getOperand(0);
|
||||
if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
|
||||
DEBUG(dbgs() << "GEP\n");
|
||||
if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
|
||||
DEBUG(dbgs() << "used outside of block\n");
|
||||
return {};
|
||||
}
|
||||
const auto &DL = GEP->getModule()->getDataLayout();
|
||||
if (!isDereferenceablePointer(GEP, DL)) {
|
||||
DEBUG(dbgs() << "not dereferenceable\n");
|
||||
// We need to make sure that we can do comparison in any order, so we
|
||||
// require memory to be unconditionnally dereferencable.
|
||||
return {};
|
||||
}
|
||||
Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
|
||||
if (GEP->accumulateConstantOffset(DL, Result.Offset)) {
|
||||
Result.GEP = GEP;
|
||||
Result.LoadI = LoadI;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Result;
|
||||
}
|
||||
|
||||
// A basic block with a comparison between two BCE atoms.
|
||||
// Note: the terminology is misleading: the comparison is symmetric, so there
|
||||
// is no real {l/r}hs. To break the symmetry, we use the smallest atom as Lhs.
|
||||
class BCECmpBlock {
|
||||
public:
|
||||
BCECmpBlock() {}
|
||||
|
||||
BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
|
||||
: Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
|
||||
if (Rhs_ < Lhs_)
|
||||
std::swap(Rhs_, Lhs_);
|
||||
}
|
||||
|
||||
bool IsValid() const {
|
||||
return Lhs_.Base() != nullptr && Rhs_.Base() != nullptr;
|
||||
}
|
||||
|
||||
// Assert the the block is consistent: If valid, it should also have
|
||||
// non-null members besides Lhs_ and Rhs_.
|
||||
void AssertConsistent() const {
|
||||
if (IsValid()) {
|
||||
assert(BB);
|
||||
assert(CmpI);
|
||||
assert(BranchI);
|
||||
}
|
||||
}
|
||||
|
||||
const BCEAtom &Lhs() const { return Lhs_; }
|
||||
const BCEAtom &Rhs() const { return Rhs_; }
|
||||
int SizeBits() const { return SizeBits_; }
|
||||
|
||||
// Returns true if the block does other works besides comparison.
|
||||
bool doesOtherWork() const;
|
||||
|
||||
// The basic block where this comparison happens.
|
||||
BasicBlock *BB = nullptr;
|
||||
// The ICMP for this comparison.
|
||||
ICmpInst *CmpI = nullptr;
|
||||
// The terminating branch.
|
||||
BranchInst *BranchI = nullptr;
|
||||
|
||||
private:
|
||||
BCEAtom Lhs_;
|
||||
BCEAtom Rhs_;
|
||||
int SizeBits_ = 0;
|
||||
};
|
||||
|
||||
bool BCECmpBlock::doesOtherWork() const {
|
||||
AssertConsistent();
|
||||
// TODO(courbet): Can we allow some other things ? This is very conservative.
|
||||
// We might be able to get away with anything does does not have any side
|
||||
// effects outside of the basic block.
|
||||
// Note: The GEPs and/or loads are not necessarily in the same block.
|
||||
for (const Instruction &Inst : *BB) {
|
||||
if (const auto *const GEP = dyn_cast<GetElementPtrInst>(&Inst)) {
|
||||
if (!(Lhs_.GEP == GEP || Rhs_.GEP == GEP))
|
||||
return true;
|
||||
} else if (const auto *const L = dyn_cast<LoadInst>(&Inst)) {
|
||||
if (!(Lhs_.LoadI == L || Rhs_.LoadI == L))
|
||||
return true;
|
||||
} else if (const auto *const C = dyn_cast<ICmpInst>(&Inst)) {
|
||||
if (C != CmpI)
|
||||
return true;
|
||||
} else if (const auto *const Br = dyn_cast<BranchInst>(&Inst)) {
|
||||
if (Br != BranchI)
|
||||
return true;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Visit the given comparison. If this is a comparison between two valid
|
||||
// BCE atoms, returns the comparison.
|
||||
BCECmpBlock visitICmp(const ICmpInst *const CmpI,
|
||||
const ICmpInst::Predicate ExpectedPredicate) {
|
||||
if (CmpI->getPredicate() == ExpectedPredicate) {
|
||||
DEBUG(dbgs() << "cmp "
|
||||
<< (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
|
||||
<< "\n");
|
||||
auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0));
|
||||
if (!Lhs.Base())
|
||||
return {};
|
||||
auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
|
||||
if (!Rhs.Base())
|
||||
return {};
|
||||
return BCECmpBlock(std::move(Lhs), std::move(Rhs),
|
||||
CmpI->getOperand(0)->getType()->getScalarSizeInBits());
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
// Visit the given comparison block. If this is a comparison between two valid
|
||||
// BCE atoms, returns the comparison.
|
||||
BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
|
||||
const BasicBlock *const PhiBlock) {
|
||||
if (Block->empty())
|
||||
return {};
|
||||
auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
|
||||
if (!BranchI)
|
||||
return {};
|
||||
DEBUG(dbgs() << "branch\n");
|
||||
if (BranchI->isUnconditional()) {
|
||||
// In this case, we expect an incoming value which is the result of the
|
||||
// comparison. This is the last link in the chain of comparisons (note
|
||||
// that this does not mean that this is the last incoming value, blocks
|
||||
// can be reordered).
|
||||
auto *const CmpI = dyn_cast<ICmpInst>(Val);
|
||||
if (!CmpI)
|
||||
return {};
|
||||
DEBUG(dbgs() << "icmp\n");
|
||||
auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
|
||||
Result.CmpI = CmpI;
|
||||
Result.BranchI = BranchI;
|
||||
return Result;
|
||||
} else {
|
||||
// In this case, we expect a constant incoming value (the comparison is
|
||||
// chained).
|
||||
const auto *const Const = dyn_cast<ConstantInt>(Val);
|
||||
DEBUG(dbgs() << "const\n");
|
||||
if (!Const->isZero())
|
||||
return {};
|
||||
DEBUG(dbgs() << "false\n");
|
||||
auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
|
||||
if (!CmpI)
|
||||
return {};
|
||||
DEBUG(dbgs() << "icmp\n");
|
||||
assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
|
||||
BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
|
||||
auto Result = visitICmp(
|
||||
CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
|
||||
Result.CmpI = CmpI;
|
||||
Result.BranchI = BranchI;
|
||||
return Result;
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
// A chain of comparisons.
|
||||
class BCECmpChain {
|
||||
public:
|
||||
BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
|
||||
|
||||
int size() const { return Comparisons_.size(); }
|
||||
|
||||
#ifdef MERGEICMPS_DOT_ON
|
||||
void dump() const;
|
||||
#endif // MERGEICMPS_DOT_ON
|
||||
|
||||
bool simplify(const TargetLibraryInfo *const TLI);
|
||||
|
||||
private:
|
||||
static bool IsContiguous(const BCECmpBlock &First,
|
||||
const BCECmpBlock &Second) {
|
||||
return First.Lhs().Base() == Second.Lhs().Base() &&
|
||||
First.Rhs().Base() == Second.Rhs().Base() &&
|
||||
First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
|
||||
First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
|
||||
}
|
||||
|
||||
// Merges the given comparison blocks into one memcmp block and update
|
||||
// branches. Comparisons are assumed to be continguous. If NextBBInChain is
|
||||
// null, the merged block will link to the phi block.
|
||||
static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
|
||||
BasicBlock *const NextBBInChain, PHINode &Phi,
|
||||
const TargetLibraryInfo *const TLI);
|
||||
|
||||
PHINode &Phi_;
|
||||
std::vector<BCECmpBlock> Comparisons_;
|
||||
// The original entry block (before sorting);
|
||||
BasicBlock *EntryBlock_;
|
||||
};
|
||||
|
||||
BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
|
||||
: Phi_(Phi) {
|
||||
// Now look inside blocks to check for BCE comparisons.
|
||||
std::vector<BCECmpBlock> Comparisons;
|
||||
for (BasicBlock *Block : Blocks) {
|
||||
BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
|
||||
Block, Phi.getParent());
|
||||
Comparison.BB = Block;
|
||||
if (!Comparison.IsValid()) {
|
||||
DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
|
||||
return;
|
||||
}
|
||||
if (Comparison.doesOtherWork()) {
|
||||
DEBUG(dbgs() << "block does extra work besides compare\n");
|
||||
if (Comparisons.empty()) { // First block.
|
||||
// TODO(courbet): The first block can do other things, and we should
|
||||
// split them apart in a separate block before the comparison chain.
|
||||
// Right now we just discard it and make the chain shorter.
|
||||
DEBUG(dbgs()
|
||||
<< "ignoring first block that does extra work besides compare\n");
|
||||
continue;
|
||||
}
|
||||
// TODO(courbet): Right now we abort the whole chain. We could be
|
||||
// merging only the blocks that don't do other work and resume the
|
||||
// chain from there. For example:
|
||||
// if (a[0] == b[0]) { // bb1
|
||||
// if (a[1] == b[1]) { // bb2
|
||||
// some_value = 3; //bb3
|
||||
// if (a[2] == b[2]) { //bb3
|
||||
// do a ton of stuff //bb4
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// This is:
|
||||
//
|
||||
// bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
|
||||
// \ \ \ \
|
||||
// ne ne ne \
|
||||
// \ \ \ v
|
||||
// +------------+-----------+----------> bb_phi
|
||||
//
|
||||
// We can only merge the first two comparisons, because bb3* does
|
||||
// "other work" (setting some_value to 3).
|
||||
// We could still merge bb1 and bb2 though.
|
||||
return;
|
||||
}
|
||||
DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
|
||||
<< " bits between " << Comparison.Lhs().Base() << " + "
|
||||
<< Comparison.Lhs().Offset << " and "
|
||||
<< Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
|
||||
<< "\n");
|
||||
DEBUG(dbgs() << "\n");
|
||||
Comparisons.push_back(Comparison);
|
||||
}
|
||||
EntryBlock_ = Comparisons[0].BB;
|
||||
Comparisons_ = std::move(Comparisons);
|
||||
#ifdef MERGEICMPS_DOT_ON
|
||||
errs() << "BEFORE REORDERING:\n\n";
|
||||
dump();
|
||||
#endif // MERGEICMPS_DOT_ON
|
||||
// Reorder blocks by LHS. We can do that without changing the
|
||||
// semantics because we are only accessing dereferencable memory.
|
||||
std::sort(Comparisons_.begin(), Comparisons_.end(),
|
||||
[](const BCECmpBlock &a, const BCECmpBlock &b) {
|
||||
return a.Lhs() < b.Lhs();
|
||||
});
|
||||
#ifdef MERGEICMPS_DOT_ON
|
||||
errs() << "AFTER REORDERING:\n\n";
|
||||
dump();
|
||||
#endif // MERGEICMPS_DOT_ON
|
||||
}
|
||||
|
||||
#ifdef MERGEICMPS_DOT_ON
|
||||
void BCECmpChain::dump() const {
|
||||
errs() << "digraph dag {\n";
|
||||
errs() << " graph [bgcolor=transparent];\n";
|
||||
errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
|
||||
errs() << " edge [color=black];\n";
|
||||
for (size_t I = 0; I < Comparisons_.size(); ++I) {
|
||||
const auto &Comparison = Comparisons_[I];
|
||||
errs() << " \"" << I << "\" [label=\"%"
|
||||
<< Comparison.Lhs().Base()->getName() << " + "
|
||||
<< Comparison.Lhs().Offset << " == %"
|
||||
<< Comparison.Rhs().Base()->getName() << " + "
|
||||
<< Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
|
||||
<< " bytes)\"];\n";
|
||||
const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
|
||||
if (I > 0)
|
||||
errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
|
||||
errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
|
||||
}
|
||||
errs() << " \"Phi\" [label=\"Phi\"];\n";
|
||||
errs() << "}\n\n";
|
||||
}
|
||||
#endif // MERGEICMPS_DOT_ON
|
||||
|
||||
bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
|
||||
// First pass to check if there is at least one merge. If not, we don't do
|
||||
// anything and we keep analysis passes intact.
|
||||
{
|
||||
bool AtLeastOneMerged = false;
|
||||
for (size_t I = 1; I < Comparisons_.size(); ++I) {
|
||||
if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
|
||||
AtLeastOneMerged = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!AtLeastOneMerged)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Remove phi references to comparison blocks, they will be rebuilt as we
|
||||
// merge the blocks.
|
||||
for (const auto &Comparison : Comparisons_) {
|
||||
Phi_.removeIncomingValue(Comparison.BB, false);
|
||||
}
|
||||
|
||||
// Point the predecessors of the chain to the first comparison block (which is
|
||||
// the new entry point).
|
||||
if (EntryBlock_ != Comparisons_[0].BB)
|
||||
EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
|
||||
|
||||
// Effectively merge blocks.
|
||||
int NumMerged = 1;
|
||||
for (size_t I = 1; I < Comparisons_.size(); ++I) {
|
||||
if (IsContiguous(Comparisons_[I - 1], Comparisons_[I])) {
|
||||
++NumMerged;
|
||||
} else {
|
||||
// Merge all previous comparisons and start a new merge block.
|
||||
mergeComparisons(
|
||||
makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged),
|
||||
Comparisons_[I].BB, Phi_, TLI);
|
||||
NumMerged = 1;
|
||||
}
|
||||
}
|
||||
mergeComparisons(makeArrayRef(Comparisons_)
|
||||
.slice(Comparisons_.size() - NumMerged, NumMerged),
|
||||
nullptr, Phi_, TLI);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
|
||||
BasicBlock *const NextBBInChain,
|
||||
PHINode &Phi,
|
||||
const TargetLibraryInfo *const TLI) {
|
||||
assert(!Comparisons.empty());
|
||||
const auto &FirstComparison = *Comparisons.begin();
|
||||
BasicBlock *const BB = FirstComparison.BB;
|
||||
LLVMContext &Context = BB->getContext();
|
||||
|
||||
if (Comparisons.size() >= 2) {
|
||||
DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
|
||||
const auto TotalSize =
|
||||
std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
|
||||
[](int Size, const BCECmpBlock &C) {
|
||||
return Size + C.SizeBits();
|
||||
}) /
|
||||
8;
|
||||
|
||||
// Incoming edges do not need to be updated, and both GEPs are already
|
||||
// computing the right address, we just need to:
|
||||
// - replace the two loads and the icmp with the memcmp
|
||||
// - update the branch
|
||||
// - update the incoming values in the phi.
|
||||
FirstComparison.BranchI->eraseFromParent();
|
||||
FirstComparison.CmpI->eraseFromParent();
|
||||
FirstComparison.Lhs().LoadI->eraseFromParent();
|
||||
FirstComparison.Rhs().LoadI->eraseFromParent();
|
||||
|
||||
IRBuilder<> Builder(BB);
|
||||
const auto &DL = Phi.getModule()->getDataLayout();
|
||||
Value *const MemCmpCall =
|
||||
emitMemCmp(FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
|
||||
ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
|
||||
Builder, DL, TLI);
|
||||
Value *const MemCmpIsZero = Builder.CreateICmpEQ(
|
||||
MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
|
||||
|
||||
// Add a branch to the next basic block in the chain.
|
||||
if (NextBBInChain) {
|
||||
Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent());
|
||||
Phi.addIncoming(ConstantInt::getFalse(Context), BB);
|
||||
} else {
|
||||
Builder.CreateBr(Phi.getParent());
|
||||
Phi.addIncoming(MemCmpIsZero, BB);
|
||||
}
|
||||
|
||||
// Delete merged blocks.
|
||||
for (size_t I = 1; I < Comparisons.size(); ++I) {
|
||||
BasicBlock *CBB = Comparisons[I].BB;
|
||||
CBB->replaceAllUsesWith(BB);
|
||||
CBB->eraseFromParent();
|
||||
}
|
||||
} else {
|
||||
assert(Comparisons.size() == 1);
|
||||
// There are no blocks to merge, but we still need to update the branches.
|
||||
DEBUG(dbgs() << "Only one comparison, updating branches\n");
|
||||
if (NextBBInChain) {
|
||||
if (FirstComparison.BranchI->isConditional()) {
|
||||
DEBUG(dbgs() << "conditional -> conditional\n");
|
||||
// Just update the "true" target, the "false" target should already be
|
||||
// the phi block.
|
||||
assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
|
||||
FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
|
||||
Phi.addIncoming(ConstantInt::getFalse(Context), BB);
|
||||
} else {
|
||||
DEBUG(dbgs() << "unconditional -> conditional\n");
|
||||
// Replace the unconditional branch by a conditional one.
|
||||
FirstComparison.BranchI->eraseFromParent();
|
||||
IRBuilder<> Builder(BB);
|
||||
Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
|
||||
Phi.getParent());
|
||||
Phi.addIncoming(FirstComparison.CmpI, BB);
|
||||
}
|
||||
} else {
|
||||
if (FirstComparison.BranchI->isConditional()) {
|
||||
DEBUG(dbgs() << "conditional -> unconditional\n");
|
||||
// Replace the conditional branch by an unconditional one.
|
||||
FirstComparison.BranchI->eraseFromParent();
|
||||
IRBuilder<> Builder(BB);
|
||||
Builder.CreateBr(Phi.getParent());
|
||||
Phi.addIncoming(FirstComparison.CmpI, BB);
|
||||
} else {
|
||||
DEBUG(dbgs() << "unconditional -> unconditional\n");
|
||||
Phi.addIncoming(FirstComparison.CmpI, BB);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
|
||||
BasicBlock *const LastBlock,
|
||||
int NumBlocks) {
|
||||
// Walk up from the last block to find other blocks.
|
||||
std::vector<BasicBlock *> Blocks(NumBlocks);
|
||||
BasicBlock *CurBlock = LastBlock;
|
||||
for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
|
||||
if (CurBlock->hasAddressTaken()) {
|
||||
// Somebody is jumping to the block through an address, all bets are
|
||||
// off.
|
||||
DEBUG(dbgs() << "skip: block " << BlockIndex
|
||||
<< " has its address taken\n");
|
||||
return {};
|
||||
}
|
||||
Blocks[BlockIndex] = CurBlock;
|
||||
auto *SinglePredecessor = CurBlock->getSinglePredecessor();
|
||||
if (!SinglePredecessor) {
|
||||
// The block has two or more predecessors.
|
||||
DEBUG(dbgs() << "skip: block " << BlockIndex
|
||||
<< " has two or more predecessors\n");
|
||||
return {};
|
||||
}
|
||||
if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
|
||||
// The block does not link back to the phi.
|
||||
DEBUG(dbgs() << "skip: block " << BlockIndex
|
||||
<< " does not link back to the phi\n");
|
||||
return {};
|
||||
}
|
||||
CurBlock = SinglePredecessor;
|
||||
}
|
||||
Blocks[0] = CurBlock;
|
||||
return Blocks;
|
||||
}
|
||||
|
||||
bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
|
||||
DEBUG(dbgs() << "processPhi()\n");
|
||||
if (Phi.getNumIncomingValues() <= 1) {
|
||||
DEBUG(dbgs() << "skip: only one incoming value in phi\n");
|
||||
return false;
|
||||
}
|
||||
// We are looking for something that has the following structure:
|
||||
// bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
|
||||
// \ \ \ \
|
||||
// ne ne ne \
|
||||
// \ \ \ v
|
||||
// +------------+-----------+----------> bb_phi
|
||||
//
|
||||
// - The last basic block (bb4 here) must branch unconditionally to bb_phi.
|
||||
// It's the only block that contributes a non-constant value to the Phi.
|
||||
// - All other blocks (b1, b2, b3) must have exactly two successors, one of
|
||||
// them being the the phi block.
|
||||
// - All intermediate blocks (bb2, bb3) must have only one predecessor.
|
||||
// - Blocks cannot do other work besides the comparison, see doesOtherWork()
|
||||
|
||||
// The blocks are not necessarily ordered in the phi, so we start from the
|
||||
// last block and reconstruct the order.
|
||||
BasicBlock *LastBlock = nullptr;
|
||||
for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
|
||||
if (isa<ConstantInt>(Phi.getIncomingValue(I)))
|
||||
continue;
|
||||
if (LastBlock) {
|
||||
// There are several non-constant values.
|
||||
DEBUG(dbgs() << "skip: several non-constant values\n");
|
||||
return false;
|
||||
}
|
||||
LastBlock = Phi.getIncomingBlock(I);
|
||||
}
|
||||
if (!LastBlock) {
|
||||
// There is no non-constant block.
|
||||
DEBUG(dbgs() << "skip: no non-constant block\n");
|
||||
return false;
|
||||
}
|
||||
if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
|
||||
DEBUG(dbgs() << "skip: last block non-phi successor\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto Blocks =
|
||||
getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
|
||||
if (Blocks.empty())
|
||||
return false;
|
||||
BCECmpChain CmpChain(Blocks, Phi);
|
||||
|
||||
if (CmpChain.size() < 2) {
|
||||
DEBUG(dbgs() << "skip: only one compare block\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
return CmpChain.simplify(TLI);
|
||||
}
|
||||
|
||||
class MergeICmps : public FunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
MergeICmps() : FunctionPass(ID) {
|
||||
initializeMergeICmpsPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
bool runOnFunction(Function &F) override {
|
||||
if (skipFunction(F)) return false;
|
||||
const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
|
||||
auto PA = runImpl(F, &TLI);
|
||||
return !PA.areAllPreserved();
|
||||
}
|
||||
|
||||
private:
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<TargetLibraryInfoWrapperPass>();
|
||||
}
|
||||
|
||||
PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI);
|
||||
};
|
||||
|
||||
PreservedAnalyses MergeICmps::runImpl(Function &F,
|
||||
const TargetLibraryInfo *TLI) {
|
||||
DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
|
||||
|
||||
bool MadeChange = false;
|
||||
|
||||
for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
|
||||
// A Phi operation is always first in a basic block.
|
||||
if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
|
||||
MadeChange |= processPhi(*Phi, TLI);
|
||||
}
|
||||
|
||||
if (MadeChange)
|
||||
return PreservedAnalyses::none();
|
||||
return PreservedAnalyses::all();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
char MergeICmps::ID = 0;
|
||||
INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps",
|
||||
"Merge contiguous icmps into a memcmp", false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
|
||||
INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
|
||||
"Merge contiguous icmps into a memcmp", false, false)
|
||||
|
||||
Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); }
|
||||
|
|
@ -73,6 +73,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
|
|||
initializeLowerExpectIntrinsicPass(Registry);
|
||||
initializeLowerGuardIntrinsicLegacyPassPass(Registry);
|
||||
initializeMemCpyOptLegacyPassPass(Registry);
|
||||
initializeMergeICmpsPass(Registry);
|
||||
initializeMergedLoadStoreMotionLegacyPassPass(Registry);
|
||||
initializeNaryReassociateLegacyPassPass(Registry);
|
||||
initializePartiallyInlineLibCallsLegacyPassPass(Registry);
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
|
||||
|
||||
%"struct.std::pair" = type { i32, i32 }
|
||||
|
||||
define zeroext i1 @opeq1(
|
||||
%"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
|
||||
%"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
|
||||
entry:
|
||||
%first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
|
||||
%0 = load i32, i32* %first.i, align 4
|
||||
%first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
|
||||
%1 = load i32, i32* %first1.i, align 4
|
||||
%cmp.i = icmp eq i32 %0, %1
|
||||
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
|
||||
|
||||
land.rhs.i:
|
||||
%second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
|
||||
%2 = load i32, i32* %second.i, align 4
|
||||
%second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
|
||||
%3 = load i32, i32* %second2.i, align 4
|
||||
%cmp3.i = icmp eq i32 %2, %3
|
||||
br label %opeq1.exit
|
||||
|
||||
opeq1.exit:
|
||||
%4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
|
||||
ret i1 %4
|
||||
; CHECK-LABEL: @opeq1(
|
||||
; The entry block with zero-offset GEPs is kept, loads are removed.
|
||||
; CHECK: entry
|
||||
; CHECK: getelementptr {{.*}} i32 0
|
||||
; CHECK-NOT: load
|
||||
; CHECK: getelementptr {{.*}} i32 0
|
||||
; CHECK-NOT: load
|
||||
; The two 4 byte loads and compares are replaced with a single 8-byte memcmp.
|
||||
; CHECK: @memcmp({{.*}}8)
|
||||
; CHECK: icmp eq {{.*}} 0
|
||||
; The branch is now a direct branch; the other block has been removed.
|
||||
; CHECK: br label %opeq1.exit
|
||||
; CHECK-NOT: br
|
||||
; The phi is updated.
|
||||
; CHECK: phi i1 [ %{{[^,]*}}, %entry ]
|
||||
; CHECK-NEXT: ret
|
||||
}
|
||||
|
||||
; Same as above, but the two blocks are in inverse order.
|
||||
define zeroext i1 @opeq1_inverse(
|
||||
%"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
|
||||
%"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
|
||||
entry:
|
||||
%first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
|
||||
%0 = load i32, i32* %first.i, align 4
|
||||
%first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
|
||||
%1 = load i32, i32* %first1.i, align 4
|
||||
%cmp.i = icmp eq i32 %0, %1
|
||||
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
|
||||
|
||||
land.rhs.i:
|
||||
%second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
|
||||
%2 = load i32, i32* %second.i, align 4
|
||||
%second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
|
||||
%3 = load i32, i32* %second2.i, align 4
|
||||
%cmp3.i = icmp eq i32 %2, %3
|
||||
br label %opeq1.exit
|
||||
|
||||
opeq1.exit:
|
||||
%4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
|
||||
ret i1 %4
|
||||
; CHECK-LABEL: @opeq1_inverse(
|
||||
; The second block with zero-offset GEPs is kept, loads are removed.
|
||||
; CHECK: land.rhs.i
|
||||
; CHECK: getelementptr {{.*}} i32 0
|
||||
; CHECK-NOT: load
|
||||
; CHECK: getelementptr {{.*}} i32 0
|
||||
; CHECK-NOT: load
|
||||
; The two 4 byte loads and compares are replaced with a single 8-byte memcmp.
|
||||
; CHECK: @memcmp({{.*}}8)
|
||||
; CHECK: icmp eq {{.*}} 0
|
||||
; The branch is now a direct branch; the other block has been removed.
|
||||
; CHECK: br label %opeq1.exit
|
||||
; CHECK-NOT: br
|
||||
; The phi is updated.
|
||||
; CHECK: phi i1 [ %{{[^,]*}}, %land.rhs.i ]
|
||||
; CHECK-NEXT: ret
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
|
||||
|
||||
; This is a more involved test: clang generates this weird pattern for
|
||||
; tuple<uint8_t, uint8_t, uint8_t, uint8_t>. Right now we skip the entry block
|
||||
; (which defines the base pointer for other blocks) and the last one (which
|
||||
; does not have the expected structure). Only middle blocks (bytes [1,2]) are
|
||||
; merged.
|
||||
|
||||
%"class.std::tuple" = type { %"struct.std::_Tuple_impl" }
|
||||
%"struct.std::_Tuple_impl" = type { %"struct.std::_Tuple_impl.0", %"struct.std::_Head_base.6" }
|
||||
%"struct.std::_Tuple_impl.0" = type { %"struct.std::_Tuple_impl.1", %"struct.std::_Head_base.5" }
|
||||
%"struct.std::_Tuple_impl.1" = type { %"struct.std::_Tuple_impl.2", %"struct.std::_Head_base.4" }
|
||||
%"struct.std::_Tuple_impl.2" = type { %"struct.std::_Head_base" }
|
||||
%"struct.std::_Head_base" = type { i8 }
|
||||
%"struct.std::_Head_base.4" = type { i8 }
|
||||
%"struct.std::_Head_base.5" = type { i8 }
|
||||
%"struct.std::_Head_base.6" = type { i8 }
|
||||
|
||||
define zeroext i1 @opeq(
|
||||
%"class.std::tuple"* nocapture readonly dereferenceable(4) %a,
|
||||
%"class.std::tuple"* nocapture readonly dereferenceable(4) %b) local_unnamed_addr #1 {
|
||||
entry:
|
||||
%0 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %a, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
|
||||
%add.ptr.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 3
|
||||
%1 = load i8, i8* %add.ptr.i.i.i.i.i, align 1
|
||||
%2 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %b, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
|
||||
%add.ptr.i.i.i6.i.i = getelementptr inbounds i8, i8* %2, i64 3
|
||||
%3 = load i8, i8* %add.ptr.i.i.i6.i.i, align 1
|
||||
%cmp.i.i = icmp eq i8 %1, %3
|
||||
br i1 %cmp.i.i, label %land.rhs.i.i, label %opeq.exit
|
||||
|
||||
land.rhs.i.i:
|
||||
%add.ptr.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 2
|
||||
%4 = load i8, i8* %add.ptr.i.i.i.i.i.i, align 1
|
||||
%add.ptr.i.i.i6.i.i.i = getelementptr inbounds i8, i8* %2, i64 2
|
||||
%5 = load i8, i8* %add.ptr.i.i.i6.i.i.i, align 1
|
||||
%cmp.i.i.i = icmp eq i8 %4, %5
|
||||
br i1 %cmp.i.i.i, label %land.rhs.i.i.i, label %opeq.exit
|
||||
|
||||
land.rhs.i.i.i:
|
||||
%add.ptr.i.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 1
|
||||
%6 = load i8, i8* %add.ptr.i.i.i.i.i.i.i, align 1
|
||||
%add.ptr.i.i.i6.i.i.i.i = getelementptr inbounds i8, i8* %2, i64 1
|
||||
%7 = load i8, i8* %add.ptr.i.i.i6.i.i.i.i, align 1
|
||||
%cmp.i.i.i.i = icmp eq i8 %6, %7
|
||||
br i1 %cmp.i.i.i.i, label %land.rhs.i.i.i.i, label %opeq.exit
|
||||
|
||||
land.rhs.i.i.i.i:
|
||||
%8 = load i8, i8* %0, align 1
|
||||
%9 = load i8, i8* %2, align 1
|
||||
%cmp.i.i.i.i.i = icmp eq i8 %8, %9
|
||||
br label %opeq.exit
|
||||
|
||||
opeq.exit:
|
||||
%10 = phi i1 [ false, %entry ], [ false, %land.rhs.i.i ], [ false, %land.rhs.i.i.i ], [ %cmp.i.i.i.i.i, %land.rhs.i.i.i.i ]
|
||||
ret i1 %10
|
||||
; CHECK-LABEL: @opeq(
|
||||
; The entry block is kept as is, but the next block is now the merged comparison
|
||||
; block for bytes [1,2] or the block for the head.
|
||||
; CHECK: entry
|
||||
; CHECK: br i1 %cmp.i.i, label %land.rhs.i.i.i{{(.i)?}}, label %opeq.exit
|
||||
; The two 1 byte loads and compares at offset 1 are replaced with a single
|
||||
; 2-byte memcmp.
|
||||
; CHECK: land.rhs.i.i.i
|
||||
; CHECK: @memcmp({{.*}}2)
|
||||
; CHECK: icmp eq {{.*}} 0
|
||||
; In the end we have three blocks.
|
||||
; CHECK: phi i1
|
||||
; CHECK-SAME %entry
|
||||
; CHECK-SAME %land.rhs.i.i.i.i
|
||||
; CHECK-SAME %land.rhs.i.i.i
|
||||
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
|
||||
|
||||
%"struct.std::pair" = type { i32, i32 }
|
||||
|
||||
define zeroext i1 @opeq(
|
||||
%"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
|
||||
%"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
|
||||
entry:
|
||||
%first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
|
||||
%0 = load i32, i32* %first.i, align 4
|
||||
%first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
|
||||
%1 = load i32, i32* %first1.i, align 4
|
||||
%cmp.i = icmp eq i32 %0, %1
|
||||
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
|
||||
|
||||
land.rhs.i:
|
||||
%second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
|
||||
%2 = load volatile i32, i32* %second.i, align 4
|
||||
%second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
|
||||
%3 = load i32, i32* %second2.i, align 4
|
||||
%cmp3.i = icmp eq i32 %2, %3
|
||||
br label %opeq1.exit
|
||||
|
||||
opeq1.exit:
|
||||
%4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
|
||||
ret i1 %4
|
||||
; CHECK-LABEL: @opeq(
|
||||
; CHECK-NOT: memcmp
|
||||
}
|
||||
|
Loading…
Reference in New Issue