[clangd] Dex: add Corpus factory for iterators, rename, fold constant. NFC

Summary:
- Corpus avoids having to pass size to the true iterator, and (soon) any
  iterator that might optimize down to true.
- Shorten names of factory functions now they're scoped to the Corpus.
  intersect() and unionOf() rather than createAnd() or createOr() as this
  seems to read better to me, and fits with other short names. Opinion wanted!
- DEFAULT_BOOST_SCORE --> 1. This is a multiplier, don't obfuscate identity.
- Simplify variadic templates in Iterator.h

Reviewers: ioeric

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, cfe-commits

Differential Revision: https://reviews.llvm.org/D52711

llvm-svn: 343589
This commit is contained in:
Sam McCall 2018-10-02 13:44:26 +00:00
parent ea2046bea9
commit a1e7385d5c
6 changed files with 127 additions and 111 deletions

View File

@ -56,7 +56,8 @@ std::vector<Token> generateSearchTokens(const Symbol &Sym) {
std::vector<std::unique_ptr<Iterator>> createFileProximityIterators(
llvm::ArrayRef<std::string> ProximityPaths,
llvm::ArrayRef<std::string> URISchemes,
const llvm::DenseMap<Token, PostingList> &InvertedIndex) {
const llvm::DenseMap<Token, PostingList> &InvertedIndex,
const Corpus &Corpus) {
std::vector<std::unique_ptr<Iterator>> BoostingIterators;
// Deduplicate parent URIs extracted from the ProximityPaths.
llvm::StringSet<> ParentURIs;
@ -91,8 +92,8 @@ std::vector<std::unique_ptr<Iterator>> createFileProximityIterators(
if (It != InvertedIndex.end()) {
// FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
PathProximitySignals.SymbolURI = ParentURI;
BoostingIterators.push_back(createBoost(It->second.iterator(&It->first),
PathProximitySignals.evaluate()));
BoostingIterators.push_back(Corpus.boost(
It->second.iterator(&It->first), PathProximitySignals.evaluate()));
}
}
return BoostingIterators;
@ -101,6 +102,7 @@ std::vector<std::unique_ptr<Iterator>> createFileProximityIterators(
} // namespace
void Dex::buildIndex() {
this->Corpus = dex::Corpus(Symbols.size());
std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());
for (size_t I = 0; I < Symbols.size(); ++I) {
@ -159,7 +161,7 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
TrigramIterators.push_back(It->second.iterator(&It->first));
}
if (!TrigramIterators.empty())
TopLevelChildren.push_back(createAnd(move(TrigramIterators)));
TopLevelChildren.push_back(Corpus.intersect(move(TrigramIterators)));
// Generate scope tokens for search query.
std::vector<std::unique_ptr<Iterator>> ScopeIterators;
@ -170,22 +172,22 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
ScopeIterators.push_back(It->second.iterator(&It->first));
}
if (Req.AnyScope)
ScopeIterators.push_back(createBoost(createTrue(Symbols.size()),
ScopeIterators.empty() ? 1.0 : 0.2));
ScopeIterators.push_back(
Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
// Add OR iterator for scopes if there are any Scope Iterators.
if (!ScopeIterators.empty())
TopLevelChildren.push_back(createOr(move(ScopeIterators)));
TopLevelChildren.push_back(Corpus.unionOf(move(ScopeIterators)));
// Add proximity paths boosting.
auto BoostingIterators = createFileProximityIterators(
Req.ProximityPaths, URISchemes, InvertedIndex);
Req.ProximityPaths, URISchemes, InvertedIndex, Corpus);
// Boosting iterators do not actually filter symbols. In order to preserve
// the validity of resulting query, TRUE iterator should be added along
// BOOSTs.
if (!BoostingIterators.empty()) {
BoostingIterators.push_back(createTrue(Symbols.size()));
TopLevelChildren.push_back(createOr(move(BoostingIterators)));
BoostingIterators.push_back(Corpus.all());
TopLevelChildren.push_back(Corpus.unionOf(move(BoostingIterators)));
}
if (Req.RestrictForCodeCompletion)
@ -196,14 +198,14 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
// Use TRUE iterator if both trigrams and scopes from the query are not
// present in the symbol index.
auto QueryIterator = TopLevelChildren.empty()
? createTrue(Symbols.size())
: createAnd(move(TopLevelChildren));
? Corpus.all()
: Corpus.intersect(move(TopLevelChildren));
// Retrieve more items than it was requested: some of the items with high
// final score might not be retrieved otherwise.
// FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as
// using 100x of the requested number might not be good in practice, e.g.
// when the requested number of items is small.
auto Root = Req.Limit ? createLimit(move(QueryIterator), *Req.Limit * 100)
auto Root = Req.Limit ? Corpus.limit(move(QueryIterator), *Req.Limit * 100)
: move(QueryIterator);
SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
vlog("Dex query tree: {0}", *Root);

View File

@ -44,7 +44,7 @@ public:
// All symbols must outlive this index.
template <typename Range>
Dex(Range &&Symbols, llvm::ArrayRef<std::string> Schemes)
: URISchemes(Schemes) {
: Corpus(0), URISchemes(Schemes) {
// If Schemes don't contain any items, fall back to SymbolCollector's
// default URI schemes.
if (URISchemes.empty()) {
@ -101,6 +101,7 @@ private:
/// std. Inverted index is used to retrieve posting lists which are processed
/// during the fuzzyFind process.
llvm::DenseMap<Token, PostingList> InvertedIndex;
Corpus Corpus;
std::shared_ptr<void> KeepAlive; // poor man's move-only std::any
// Size of memory retained by KeepAlive.
size_t BackingDataSize = 0;

View File

@ -64,7 +64,7 @@ public:
float consume() override {
assert(!reachedEnd() && "AND iterator can't consume() at the end.");
float Boost = DEFAULT_BOOST_SCORE;
float Boost = 1;
for (const auto &Child : Children)
Boost *= Child->consume();
return Boost;
@ -175,12 +175,12 @@ public:
return Result;
}
// Returns the maximum boosting score among all Children when iterator is not
// exhausted and points to the given ID, DEFAULT_BOOST_SCORE otherwise.
// Returns the maximum boosting score among all Children when iterator
// points to the current ID.
float consume() override {
assert(!reachedEnd() && "OR iterator can't consume() at the end.");
const DocID ID = peek();
float Boost = DEFAULT_BOOST_SCORE;
float Boost = 1;
for (const auto &Child : Children)
if (!Child->reachedEnd() && Child->peek() == ID)
Boost = std::max(Boost, Child->consume());
@ -236,7 +236,7 @@ public:
float consume() override {
assert(!reachedEnd() && "TRUE iterator can't consume() at the end.");
return DEFAULT_BOOST_SCORE;
return 1;
}
size_t estimateSize() const override { return Size; }
@ -330,30 +330,30 @@ std::vector<std::pair<DocID, float>> consume(Iterator &It) {
}
std::unique_ptr<Iterator>
createAnd(std::vector<std::unique_ptr<Iterator>> Children) {
Corpus::intersect(std::vector<std::unique_ptr<Iterator>> Children) const {
// If there is exactly one child, pull it one level up: AND(Child) -> Child.
return Children.size() == 1 ? std::move(Children.front())
: llvm::make_unique<AndIterator>(move(Children));
}
std::unique_ptr<Iterator>
createOr(std::vector<std::unique_ptr<Iterator>> Children) {
Corpus::unionOf(std::vector<std::unique_ptr<Iterator>> Children) const {
// If there is exactly one child, pull it one level up: OR(Child) -> Child.
return Children.size() == 1 ? std::move(Children.front())
: llvm::make_unique<OrIterator>(move(Children));
}
std::unique_ptr<Iterator> createTrue(DocID Size) {
std::unique_ptr<Iterator> Corpus::all() const {
return llvm::make_unique<TrueIterator>(Size);
}
std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,
float Factor) {
std::unique_ptr<Iterator> Corpus::boost(std::unique_ptr<Iterator> Child,
float Factor) const {
return llvm::make_unique<BoostIterator>(move(Child), Factor);
}
std::unique_ptr<Iterator> createLimit(std::unique_ptr<Iterator> Child,
size_t Limit) {
std::unique_ptr<Iterator> Corpus::limit(std::unique_ptr<Iterator> Child,
size_t Limit) const {
return llvm::make_unique<LimitIterator>(move(Child), Limit);
}

View File

@ -101,8 +101,6 @@ public:
return Iterator.dump(OS);
}
constexpr static float DEFAULT_BOOST_SCORE = 1;
private:
virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
};
@ -117,69 +115,74 @@ private:
/// to acquire preliminary scores of requested items.
std::vector<std::pair<DocID, float>> consume(Iterator &It);
/// Returns AND Iterator which performs the intersection of the PostingLists of
/// its children.
///
/// consume(): AND Iterator returns the product of Childrens' boosting scores
/// when not exhausted and DEFAULT_BOOST_SCORE otherwise.
std::unique_ptr<Iterator>
createAnd(std::vector<std::unique_ptr<Iterator>> Children);
/// Returns OR Iterator which performs the union of the PostingLists of its
/// children.
///
/// consume(): OR Iterator returns the highest boost value among children
/// pointing to requested item when not exhausted and DEFAULT_BOOST_SCORE
/// otherwise.
std::unique_ptr<Iterator>
createOr(std::vector<std::unique_ptr<Iterator>> Children);
/// Returns TRUE Iterator which iterates over "virtual" PostingList containing
/// all items in range [0, Size) in an efficient manner.
///
/// TRUE returns DEFAULT_BOOST_SCORE for each processed item.
std::unique_ptr<Iterator> createTrue(DocID Size);
/// Returns BOOST iterator which multiplies the score of each item by given
/// factor. Boosting can be used as a computationally inexpensive filtering.
/// Users can return significantly more items using consumeAndBoost() and then
/// trim Top K using retrieval score.
std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,
float Factor);
/// Returns LIMIT iterator, which yields up to N elements of its child iterator.
/// Elements only count towards the limit if they are part of the final result
/// set. Therefore the following iterator (AND (2) (LIMIT (1 2) 1)) yields (2),
/// not ().
std::unique_ptr<Iterator> createLimit(std::unique_ptr<Iterator> Child,
size_t Limit);
/// This allows createAnd(create(...), create(...)) syntax.
template <typename... Args> std::unique_ptr<Iterator> createAnd(Args... args) {
std::vector<std::unique_ptr<Iterator>> Children;
populateChildren(Children, args...);
return createAnd(move(Children));
}
/// This allows createOr(create(...), create(...)) syntax.
template <typename... Args> std::unique_ptr<Iterator> createOr(Args... args) {
std::vector<std::unique_ptr<Iterator>> Children;
populateChildren(Children, args...);
return createOr(move(Children));
}
template <typename HeadT, typename... TailT>
namespace detail {
// Variadic template machinery.
inline void populateChildren(std::vector<std::unique_ptr<Iterator>> &) {}
template <typename... TailT>
void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children,
HeadT &Head, TailT &... Tail) {
std::unique_ptr<Iterator> Head, TailT... Tail) {
Children.push_back(move(Head));
populateChildren(Children, Tail...);
populateChildren(Children, move(Tail)...);
}
} // namespace detail
template <typename HeadT>
void populateChildren(std::vector<std::unique_ptr<Iterator>> &Children,
HeadT &Head) {
Children.push_back(move(Head));
}
// A corpus is a set of documents, and a factory for iterators over them.
class Corpus {
DocID Size;
public:
explicit Corpus(DocID Size) : Size(Size) {}
/// Returns AND Iterator which performs the intersection of the PostingLists
/// of its children.
///
/// consume(): AND Iterator returns the product of Childrens' boosting
/// scores.
std::unique_ptr<Iterator>
intersect(std::vector<std::unique_ptr<Iterator>> Children) const;
/// Returns OR Iterator which performs the union of the PostingLists of its
/// children.
///
/// consume(): OR Iterator returns the highest boost value among children
/// containing the requested item.
std::unique_ptr<Iterator>
unionOf(std::vector<std::unique_ptr<Iterator>> Children) const;
/// Returns TRUE Iterator which iterates over "virtual" PostingList
/// containing all items in range [0, Size) in an efficient manner.
std::unique_ptr<Iterator> all() const;
/// Returns BOOST iterator which multiplies the score of each item by given
/// factor. Boosting can be used as a computationally inexpensive filtering.
/// Users can return significantly more items using consumeAndBoost() and
/// then trim Top K using retrieval score.
std::unique_ptr<Iterator> boost(std::unique_ptr<Iterator> Child,
float Factor) const;
/// Returns LIMIT iterator, which yields up to N elements of its child
/// iterator. Elements only count towards the limit if they are part of the
/// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2)
/// 1)) yields (2), not ().
std::unique_ptr<Iterator> limit(std::unique_ptr<Iterator> Child,
size_t Limit) const;
/// This allows intersect(create(...), create(...)) syntax.
template <typename... Args>
std::unique_ptr<Iterator> intersect(Args... args) const {
std::vector<std::unique_ptr<Iterator>> Children;
detail::populateChildren(Children, std::forward<Args>(args)...);
return intersect(move(Children));
}
/// This allows unionOf(create(...), create(...)) syntax.
template <typename... Args>
std::unique_ptr<Iterator> unionOf(Args... args) const {
std::vector<std::unique_ptr<Iterator>> Children;
detail::populateChildren(Children, std::forward<Args>(args)...);
return unionOf(move(Children));
}
};
} // namespace dex
} // namespace clangd

View File

@ -63,7 +63,7 @@ public:
float consume() override {
assert(!reachedEnd() &&
"Posting List iterator can't consume() at the end.");
return DEFAULT_BOOST_SCORE;
return 1;
}
size_t estimateSize() const override {

View File

@ -70,15 +70,16 @@ TEST(DexIterators, DocumentIterator) {
}
TEST(DexIterators, AndTwoLists) {
Corpus C{10000};
const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000});
auto And = createAnd(L1.iterator(), L0.iterator());
auto And = C.intersect(L1.iterator(), L0.iterator());
EXPECT_FALSE(And->reachedEnd());
EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U));
And = createAnd(L0.iterator(), L1.iterator());
And = C.intersect(L0.iterator(), L1.iterator());
And->advanceTo(0);
EXPECT_EQ(And->peek(), 0U);
@ -94,11 +95,12 @@ TEST(DexIterators, AndTwoLists) {
}
TEST(DexIterators, AndThreeLists) {
Corpus C{10000};
const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000});
const PostingList L2({1, 4, 7, 11, 30, 60, 320, 9000});
auto And = createAnd(L0.iterator(), L1.iterator(), L2.iterator());
auto And = C.intersect(L0.iterator(), L1.iterator(), L2.iterator());
EXPECT_EQ(And->peek(), 7U);
And->advanceTo(300);
EXPECT_EQ(And->peek(), 320U);
@ -108,10 +110,11 @@ TEST(DexIterators, AndThreeLists) {
}
TEST(DexIterators, OrTwoLists) {
Corpus C{10000};
const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000});
auto Or = createOr(L0.iterator(), L1.iterator());
auto Or = C.unionOf(L0.iterator(), L1.iterator());
EXPECT_FALSE(Or->reachedEnd());
EXPECT_EQ(Or->peek(), 0U);
@ -134,18 +137,19 @@ TEST(DexIterators, OrTwoLists) {
Or->advanceTo(9001);
EXPECT_TRUE(Or->reachedEnd());
Or = createOr(L0.iterator(), L1.iterator());
Or = C.unionOf(L0.iterator(), L1.iterator());
EXPECT_THAT(consumeIDs(*Or),
ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U));
}
TEST(DexIterators, OrThreeLists) {
Corpus C{10000};
const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000});
const PostingList L2({1, 4, 7, 11, 30, 60, 320, 9000});
auto Or = createOr(L0.iterator(), L1.iterator(), L2.iterator());
auto Or = C.unionOf(L0.iterator(), L1.iterator(), L2.iterator());
EXPECT_FALSE(Or->reachedEnd());
EXPECT_EQ(Or->peek(), 0U);
@ -194,17 +198,18 @@ TEST(DexIterators, QueryTree) {
// |1, 5, 7, 9| |1, 5| |0, 3, 5|
// +----------+ +----+ +-------+
//
Corpus C{10};
const PostingList L0({1, 3, 5, 8, 9});
const PostingList L1({1, 5, 7, 9});
const PostingList L2({1, 5});
const PostingList L3({0, 3, 5});
// Root of the query tree: [1, 5]
auto Root = createAnd(
auto Root = C.intersect(
// Lower And Iterator: [1, 5, 9]
createAnd(L0.iterator(), createBoost(L1.iterator(), 2U)),
C.intersect(L0.iterator(), C.boost(L1.iterator(), 2U)),
// Lower Or Iterator: [0, 1, 5]
createOr(createBoost(L2.iterator(), 3U), createBoost(L3.iterator(), 4U)));
C.unionOf(C.boost(L2.iterator(), 3U), C.boost(L3.iterator(), 4U)));
EXPECT_FALSE(Root->reachedEnd());
EXPECT_EQ(Root->peek(), 1U);
@ -226,6 +231,7 @@ TEST(DexIterators, QueryTree) {
}
TEST(DexIterators, StringRepresentation) {
Corpus C{10};
const PostingList L1({1, 3, 5});
const PostingList L2({1, 7, 9});
@ -238,56 +244,60 @@ TEST(DexIterators, StringRepresentation) {
auto I2 = L1.iterator(&Tok);
EXPECT_EQ(llvm::to_string(*I2), "T=L2");
auto Tree = createLimit(createAnd(move(I1), move(I2)), 10);
auto Tree = C.limit(C.intersect(move(I1), move(I2)), 10);
EXPECT_EQ(llvm::to_string(*Tree), "(LIMIT 10 (& [1 3 5] T=L2))");
}
TEST(DexIterators, Limit) {
Corpus C{10000};
const PostingList L0({3, 6, 7, 20, 42, 100});
const PostingList L1({1, 3, 5, 6, 7, 30, 100});
const PostingList L2({0, 3, 5, 7, 8, 100});
auto DocIterator = createLimit(L0.iterator(), 42);
auto DocIterator = C.limit(L0.iterator(), 42);
EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7, 20, 42, 100));
DocIterator = createLimit(L0.iterator(), 3);
DocIterator = C.limit(L0.iterator(), 3);
EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7));
DocIterator = createLimit(L0.iterator(), 0);
DocIterator = C.limit(L0.iterator(), 0);
EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre());
auto AndIterator = createAnd(
createLimit(createTrue(9000), 343), createLimit(L0.iterator(), 2),
createLimit(L1.iterator(), 3), createLimit(L2.iterator(), 42));
auto AndIterator =
C.intersect(C.limit(C.all(), 343), C.limit(L0.iterator(), 2),
C.limit(L1.iterator(), 3), C.limit(L2.iterator(), 42));
EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(3, 7));
}
TEST(DexIterators, True) {
auto TrueIterator = createTrue(0U);
Corpus C{0};
auto TrueIterator = C.all();
EXPECT_TRUE(TrueIterator->reachedEnd());
EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre());
C = Corpus{7};
const PostingList L0({1, 2, 5, 7});
TrueIterator = createTrue(7U);
TrueIterator = C.all();
EXPECT_THAT(TrueIterator->peek(), 0);
auto AndIterator = createAnd(L0.iterator(), move(TrueIterator));
auto AndIterator = C.intersect(L0.iterator(), move(TrueIterator));
EXPECT_FALSE(AndIterator->reachedEnd());
EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5));
}
TEST(DexIterators, Boost) {
auto BoostIterator = createBoost(createTrue(5U), 42U);
Corpus C{5};
auto BoostIterator = C.boost(C.all(), 42U);
EXPECT_FALSE(BoostIterator->reachedEnd());
auto ElementBoost = BoostIterator->consume();
EXPECT_THAT(ElementBoost, 42U);
const PostingList L0({2, 4});
const PostingList L1({1, 4});
auto Root = createOr(createTrue(5U), createBoost(L0.iterator(), 2U),
createBoost(L1.iterator(), 3U));
auto Root = C.unionOf(C.all(), C.boost(L0.iterator(), 2U),
C.boost(L1.iterator(), 3U));
ElementBoost = Root->consume();
EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE);
EXPECT_THAT(ElementBoost, 1);
Root->advance();
EXPECT_THAT(Root->peek(), 1U);
ElementBoost = Root->consume();