forked from OSchip/llvm-project
[clangd] Dex: FALSE iterator, peephole optimizations, fix AND bug
Summary: The FALSE iterator will be used in a followup patch to fix a logic bug in Dex (currently, tokens that don't have posting lists in the index are simply dropped from the query, changing semantics). It can usually be optimized away, so added the following opmitizations: - simplify booleans inside AND/OR - replace effectively-empty AND/OR with booleans - flatten nested AND/ORs While working on this, found a bug in the AND iterator: its constructor sync() assumes that ReachedEnd is set if applicable, but the constructor never sets it. This crashes if a non-first iterator is nonempty. Reviewers: ilya-biryukov Subscribers: ioeric, MaskRay, jkorous, arphaman, kadircet, cfe-commits Differential Revision: https://reviews.llvm.org/D52789 llvm-svn: 343774
This commit is contained in:
parent
82a3b1c687
commit
87f69eaf4e
|
@ -8,6 +8,7 @@
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "Iterator.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <numeric>
|
||||
|
@ -26,9 +27,11 @@ namespace {
|
|||
class AndIterator : public Iterator {
|
||||
public:
|
||||
explicit AndIterator(std::vector<std::unique_ptr<Iterator>> AllChildren)
|
||||
: Children(std::move(AllChildren)) {
|
||||
: Iterator(Kind::And), Children(std::move(AllChildren)) {
|
||||
assert(!Children.empty() && "AND iterator should have at least one child.");
|
||||
// Establish invariants.
|
||||
for (const auto &Child : Children)
|
||||
ReachedEnd |= Child->reachedEnd();
|
||||
sync();
|
||||
// When children are sorted by the estimateSize(), sync() calls are more
|
||||
// effective. Each sync() starts with the first child and makes sure all
|
||||
|
@ -122,6 +125,7 @@ private:
|
|||
/// update the field, rather than traversing the whole subtree in each
|
||||
/// reachedEnd() call.
|
||||
bool ReachedEnd = false;
|
||||
friend Corpus; // For optimizations.
|
||||
};
|
||||
|
||||
/// Implements Iterator over the union of other iterators.
|
||||
|
@ -133,7 +137,7 @@ private:
|
|||
class OrIterator : public Iterator {
|
||||
public:
|
||||
explicit OrIterator(std::vector<std::unique_ptr<Iterator>> AllChildren)
|
||||
: Children(std::move(AllChildren)) {
|
||||
: Iterator(Kind::Or), Children(std::move(AllChildren)) {
|
||||
assert(!Children.empty() && "OR iterator should have at least one child.");
|
||||
}
|
||||
|
||||
|
@ -208,6 +212,7 @@ private:
|
|||
|
||||
// FIXME(kbobyrev): Would storing Children in min-heap be faster?
|
||||
std::vector<std::unique_ptr<Iterator>> Children;
|
||||
friend Corpus; // For optimizations.
|
||||
};
|
||||
|
||||
/// TrueIterator handles PostingLists which contain all items of the index. It
|
||||
|
@ -215,7 +220,7 @@ private:
|
|||
/// in O(1).
|
||||
class TrueIterator : public Iterator {
|
||||
public:
|
||||
explicit TrueIterator(DocID Size) : Size(Size) {}
|
||||
explicit TrueIterator(DocID Size) : Iterator(Kind::True), Size(Size) {}
|
||||
|
||||
bool reachedEnd() const override { return Index >= Size; }
|
||||
|
||||
|
@ -251,12 +256,35 @@ private:
|
|||
DocID Size;
|
||||
};
|
||||
|
||||
/// FalseIterator yields no results.
|
||||
class FalseIterator : public Iterator {
|
||||
public:
|
||||
FalseIterator() : Iterator(Kind::False) {}
|
||||
bool reachedEnd() const override { return true; }
|
||||
void advance() override { assert(false); }
|
||||
void advanceTo(DocID ID) override { assert(false); }
|
||||
DocID peek() const override {
|
||||
assert(false);
|
||||
return 0;
|
||||
}
|
||||
float consume() override {
|
||||
assert(false);
|
||||
return 1;
|
||||
}
|
||||
size_t estimateSize() const override { return 0; }
|
||||
|
||||
private:
|
||||
llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
|
||||
return OS << "false";
|
||||
}
|
||||
};
|
||||
|
||||
/// Boost iterator is a wrapper around its child which multiplies scores of
|
||||
/// each retrieved item by a given factor.
|
||||
class BoostIterator : public Iterator {
|
||||
public:
|
||||
BoostIterator(std::unique_ptr<Iterator> Child, float Factor)
|
||||
: Child(move(Child)), Factor(Factor) {}
|
||||
: Child(std::move(Child)), Factor(Factor) {}
|
||||
|
||||
bool reachedEnd() const override { return Child->reachedEnd(); }
|
||||
|
||||
|
@ -286,7 +314,7 @@ private:
|
|||
class LimitIterator : public Iterator {
|
||||
public:
|
||||
LimitIterator(std::unique_ptr<Iterator> Child, size_t Limit)
|
||||
: Child(move(Child)), Limit(Limit), ItemsLeft(Limit) {}
|
||||
: Child(std::move(Child)), Limit(Limit), ItemsLeft(Limit) {}
|
||||
|
||||
bool reachedEnd() const override {
|
||||
return ItemsLeft == 0 || Child->reachedEnd();
|
||||
|
@ -331,30 +359,86 @@ std::vector<std::pair<DocID, float>> consume(Iterator &It) {
|
|||
|
||||
std::unique_ptr<Iterator>
|
||||
Corpus::intersect(std::vector<std::unique_ptr<Iterator>> Children) const {
|
||||
// If there is exactly one child, pull it one level up: AND(Child) -> Child.
|
||||
return Children.size() == 1 ? std::move(Children.front())
|
||||
: llvm::make_unique<AndIterator>(move(Children));
|
||||
std::vector<std::unique_ptr<Iterator>> RealChildren;
|
||||
for (auto &Child : Children) {
|
||||
switch (Child->kind()) {
|
||||
case Iterator::Kind::True:
|
||||
break; // No effect, drop the iterator.
|
||||
case Iterator::Kind::False:
|
||||
return std::move(Child); // Intersection is empty.
|
||||
case Iterator::Kind::And: {
|
||||
// Inline nested AND into parent AND.
|
||||
auto &NewChildren = static_cast<AndIterator *>(Child.get())->Children;
|
||||
std::move(NewChildren.begin(), NewChildren.end(),
|
||||
std::back_inserter(RealChildren));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
RealChildren.push_back(std::move(Child));
|
||||
}
|
||||
}
|
||||
switch (RealChildren.size()) {
|
||||
case 0:
|
||||
return all();
|
||||
case 1:
|
||||
return std::move(RealChildren.front());
|
||||
default:
|
||||
return llvm::make_unique<AndIterator>(std::move(RealChildren));
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<Iterator>
|
||||
Corpus::unionOf(std::vector<std::unique_ptr<Iterator>> Children) const {
|
||||
// If there is exactly one child, pull it one level up: OR(Child) -> Child.
|
||||
return Children.size() == 1 ? std::move(Children.front())
|
||||
: llvm::make_unique<OrIterator>(move(Children));
|
||||
std::vector<std::unique_ptr<Iterator>> RealChildren;
|
||||
for (auto &Child : Children) {
|
||||
switch (Child->kind()) {
|
||||
case Iterator::Kind::False:
|
||||
break; // No effect, drop the iterator.
|
||||
case Iterator::Kind::Or: {
|
||||
// Inline nested OR into parent OR.
|
||||
auto &NewChildren = static_cast<OrIterator *>(Child.get())->Children;
|
||||
std::move(NewChildren.begin(), NewChildren.end(),
|
||||
std::back_inserter(RealChildren));
|
||||
break;
|
||||
}
|
||||
case Iterator::Kind::True:
|
||||
// Don't return all(), which would discard sibling boosts.
|
||||
default:
|
||||
RealChildren.push_back(std::move(Child));
|
||||
}
|
||||
}
|
||||
switch (RealChildren.size()) {
|
||||
case 0:
|
||||
return none();
|
||||
case 1:
|
||||
return std::move(RealChildren.front());
|
||||
default:
|
||||
return llvm::make_unique<OrIterator>(std::move(RealChildren));
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<Iterator> Corpus::all() const {
|
||||
return llvm::make_unique<TrueIterator>(Size);
|
||||
}
|
||||
|
||||
std::unique_ptr<Iterator> Corpus::none() const {
|
||||
return llvm::make_unique<FalseIterator>();
|
||||
}
|
||||
|
||||
std::unique_ptr<Iterator> Corpus::boost(std::unique_ptr<Iterator> Child,
|
||||
float Factor) const {
|
||||
return llvm::make_unique<BoostIterator>(move(Child), Factor);
|
||||
if (Factor == 1)
|
||||
return Child;
|
||||
if (Child->kind() == Iterator::Kind::False)
|
||||
return Child;
|
||||
return llvm::make_unique<BoostIterator>(std::move(Child), Factor);
|
||||
}
|
||||
|
||||
std::unique_ptr<Iterator> Corpus::limit(std::unique_ptr<Iterator> Child,
|
||||
size_t Limit) const {
|
||||
return llvm::make_unique<LimitIterator>(move(Child), Limit);
|
||||
if (Child->kind() == Iterator::Kind::False)
|
||||
return Child;
|
||||
return llvm::make_unique<LimitIterator>(std::move(Child), Limit);
|
||||
}
|
||||
|
||||
} // namespace dex
|
||||
|
|
|
@ -98,8 +98,16 @@ public:
|
|||
return Iterator.dump(OS);
|
||||
}
|
||||
|
||||
/// Inspect iterator type, used internally for optimizing query trees.
|
||||
enum class Kind { And, Or, True, False, Other };
|
||||
Kind kind() const { return MyKind; }
|
||||
|
||||
protected:
|
||||
Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {}
|
||||
|
||||
private:
|
||||
virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
|
||||
Kind MyKind;
|
||||
};
|
||||
|
||||
/// Advances the iterator until it is exhausted. Returns pairs of document IDs
|
||||
|
@ -150,6 +158,9 @@ public:
|
|||
/// containing all items in range [0, Size) in an efficient manner.
|
||||
std::unique_ptr<Iterator> all() const;
|
||||
|
||||
/// Returns FALSE Iterator which iterates over no documents.
|
||||
std::unique_ptr<Iterator> none() const;
|
||||
|
||||
/// Returns BOOST iterator which multiplies the score of each item by given
|
||||
/// factor. Boosting can be used as a computationally inexpensive filtering.
|
||||
/// Users can return significantly more items using consumeAndBoost() and
|
||||
|
|
|
@ -109,6 +109,18 @@ TEST(DexIterators, AndThreeLists) {
|
|||
EXPECT_TRUE(And->reachedEnd());
|
||||
}
|
||||
|
||||
TEST(DexIterators, AndEmpty) {
|
||||
Corpus C{10000};
|
||||
const PostingList L1({1});
|
||||
const PostingList L2({2});
|
||||
// These iterators are empty, but the optimizer can't tell.
|
||||
auto Empty1 = C.intersect(L1.iterator(), L2.iterator());
|
||||
auto Empty2 = C.intersect(L1.iterator(), L2.iterator());
|
||||
// And syncs iterators on construction, and used to fail on empty children.
|
||||
auto And = C.intersect(std::move(Empty1), std::move(Empty2));
|
||||
EXPECT_TRUE(And->reachedEnd());
|
||||
}
|
||||
|
||||
TEST(DexIterators, OrTwoLists) {
|
||||
Corpus C{10000};
|
||||
const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
|
||||
|
@ -270,18 +282,8 @@ TEST(DexIterators, Limit) {
|
|||
}
|
||||
|
||||
TEST(DexIterators, True) {
|
||||
Corpus C{0};
|
||||
auto TrueIterator = C.all();
|
||||
EXPECT_TRUE(TrueIterator->reachedEnd());
|
||||
EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre());
|
||||
|
||||
C = Corpus{7};
|
||||
const PostingList L0({1, 2, 5, 7});
|
||||
TrueIterator = C.all();
|
||||
EXPECT_THAT(TrueIterator->peek(), 0);
|
||||
auto AndIterator = C.intersect(L0.iterator(), move(TrueIterator));
|
||||
EXPECT_FALSE(AndIterator->reachedEnd());
|
||||
EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5));
|
||||
EXPECT_TRUE(Corpus{0}.all()->reachedEnd());
|
||||
EXPECT_THAT(consumeIDs(*Corpus{4}.all()), ElementsAre(0, 1, 2, 3));
|
||||
}
|
||||
|
||||
TEST(DexIterators, Boost) {
|
||||
|
@ -313,6 +315,38 @@ TEST(DexIterators, Boost) {
|
|||
EXPECT_THAT(ElementBoost, 3);
|
||||
}
|
||||
|
||||
TEST(DexIterators, Optimizations) {
|
||||
Corpus C{5};
|
||||
const PostingList L1({1});
|
||||
const PostingList L2({2});
|
||||
const PostingList L3({3});
|
||||
|
||||
// empty and/or yield true/false
|
||||
EXPECT_EQ(llvm::to_string(*C.intersect()), "true");
|
||||
EXPECT_EQ(llvm::to_string(*C.unionOf()), "false");
|
||||
|
||||
// true/false inside and/or short-circuit
|
||||
EXPECT_EQ(llvm::to_string(*C.intersect(L1.iterator(), C.all())), "[1]");
|
||||
EXPECT_EQ(llvm::to_string(*C.intersect(L1.iterator(), C.none())), "false");
|
||||
// Not optimized to avoid breaking boosts.
|
||||
EXPECT_EQ(llvm::to_string(*C.unionOf(L1.iterator(), C.all())),
|
||||
"(| [1] true)");
|
||||
EXPECT_EQ(llvm::to_string(*C.unionOf(L1.iterator(), C.none())), "[1]");
|
||||
|
||||
// and/or nested inside and/or are flattened
|
||||
EXPECT_EQ(llvm::to_string(*C.intersect(
|
||||
L1.iterator(), C.intersect(L1.iterator(), L1.iterator()))),
|
||||
"(& [1] [1] [1])");
|
||||
EXPECT_EQ(llvm::to_string(*C.unionOf(
|
||||
L1.iterator(), C.unionOf(L2.iterator(), L3.iterator()))),
|
||||
"(| [1] [2] [3])");
|
||||
|
||||
// optimizations combine over multiple levels
|
||||
EXPECT_EQ(llvm::to_string(*C.intersect(
|
||||
C.intersect(L1.iterator(), C.intersect()), C.unionOf(C.all()))),
|
||||
"[1]");
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Search token tests.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
Loading…
Reference in New Issue