From a1e7385d5c8b7ae8f2368b0a7cb5102dd4a9ab1a Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Tue, 2 Oct 2018 13:44:26 +0000 Subject: [PATCH] [clangd] Dex: add Corpus factory for iterators, rename, fold constant. NFC Summary: - Corpus avoids having to pass size to the true iterator, and (soon) any iterator that might optimize down to true. - Shorten names of factory functions now they're scoped to the Corpus. intersect() and unionOf() rather than createAnd() or createOr() as this seems to read better to me, and fits with other short names. Opinion wanted! - DEFAULT_BOOST_SCORE --> 1. This is a multiplier, don't obfuscate identity. - Simplify variadic templates in Iterator.h Reviewers: ioeric Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, cfe-commits Differential Revision: https://reviews.llvm.org/D52711 llvm-svn: 343589 --- clang-tools-extra/clangd/index/dex/Dex.cpp | 28 ++-- clang-tools-extra/clangd/index/dex/Dex.h | 3 +- .../clangd/index/dex/Iterator.cpp | 24 ++-- clang-tools-extra/clangd/index/dex/Iterator.h | 125 +++++++++--------- .../clangd/index/dex/PostingList.cpp | 2 +- .../unittests/clangd/DexTests.cpp | 56 ++++---- 6 files changed, 127 insertions(+), 111 deletions(-) diff --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp index 2c0c3d738776..3b236f8918a3 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.cpp +++ b/clang-tools-extra/clangd/index/dex/Dex.cpp @@ -56,7 +56,8 @@ std::vector generateSearchTokens(const Symbol &Sym) { std::vector> createFileProximityIterators( llvm::ArrayRef ProximityPaths, llvm::ArrayRef URISchemes, - const llvm::DenseMap &InvertedIndex) { + const llvm::DenseMap &InvertedIndex, + const Corpus &Corpus) { std::vector> BoostingIterators; // Deduplicate parent URIs extracted from the ProximityPaths. llvm::StringSet<> ParentURIs; @@ -91,8 +92,8 @@ std::vector> createFileProximityIterators( if (It != InvertedIndex.end()) { // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. PathProximitySignals.SymbolURI = ParentURI; - BoostingIterators.push_back(createBoost(It->second.iterator(&It->first), - PathProximitySignals.evaluate())); + BoostingIterators.push_back(Corpus.boost( + It->second.iterator(&It->first), PathProximitySignals.evaluate())); } } return BoostingIterators; @@ -101,6 +102,7 @@ std::vector> createFileProximityIterators( } // namespace void Dex::buildIndex() { + this->Corpus = dex::Corpus(Symbols.size()); std::vector> ScoredSymbols(Symbols.size()); for (size_t I = 0; I < Symbols.size(); ++I) { @@ -159,7 +161,7 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req, TrigramIterators.push_back(It->second.iterator(&It->first)); } if (!TrigramIterators.empty()) - TopLevelChildren.push_back(createAnd(move(TrigramIterators))); + TopLevelChildren.push_back(Corpus.intersect(move(TrigramIterators))); // Generate scope tokens for search query. std::vector> ScopeIterators; @@ -170,22 +172,22 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req, ScopeIterators.push_back(It->second.iterator(&It->first)); } if (Req.AnyScope) - ScopeIterators.push_back(createBoost(createTrue(Symbols.size()), - ScopeIterators.empty() ? 1.0 : 0.2)); + ScopeIterators.push_back( + Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2)); // Add OR iterator for scopes if there are any Scope Iterators. if (!ScopeIterators.empty()) - TopLevelChildren.push_back(createOr(move(ScopeIterators))); + TopLevelChildren.push_back(Corpus.unionOf(move(ScopeIterators))); // Add proximity paths boosting. auto BoostingIterators = createFileProximityIterators( - Req.ProximityPaths, URISchemes, InvertedIndex); + Req.ProximityPaths, URISchemes, InvertedIndex, Corpus); // Boosting iterators do not actually filter symbols. In order to preserve // the validity of resulting query, TRUE iterator should be added along // BOOSTs. if (!BoostingIterators.empty()) { - BoostingIterators.push_back(createTrue(Symbols.size())); - TopLevelChildren.push_back(createOr(move(BoostingIterators))); + BoostingIterators.push_back(Corpus.all()); + TopLevelChildren.push_back(Corpus.unionOf(move(BoostingIterators))); } if (Req.RestrictForCodeCompletion) @@ -196,14 +198,14 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req, // Use TRUE iterator if both trigrams and scopes from the query are not // present in the symbol index. auto QueryIterator = TopLevelChildren.empty() - ? createTrue(Symbols.size()) - : createAnd(move(TopLevelChildren)); + ? Corpus.all() + : Corpus.intersect(move(TopLevelChildren)); // Retrieve more items than it was requested: some of the items with high // final score might not be retrieved otherwise. // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as // using 100x of the requested number might not be good in practice, e.g. // when the requested number of items is small. - auto Root = Req.Limit ? createLimit(move(QueryIterator), *Req.Limit * 100) + auto Root = Req.Limit ? Corpus.limit(move(QueryIterator), *Req.Limit * 100) : move(QueryIterator); SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root)); vlog("Dex query tree: {0}", *Root); diff --git a/clang-tools-extra/clangd/index/dex/Dex.h b/clang-tools-extra/clangd/index/dex/Dex.h index 4b2d856588d7..d8360ba3fe42 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.h +++ b/clang-tools-extra/clangd/index/dex/Dex.h @@ -44,7 +44,7 @@ public: // All symbols must outlive this index. template Dex(Range &&Symbols, llvm::ArrayRef Schemes) - : URISchemes(Schemes) { + : Corpus(0), URISchemes(Schemes) { // If Schemes don't contain any items, fall back to SymbolCollector's // default URI schemes. if (URISchemes.empty()) { @@ -101,6 +101,7 @@ private: /// std. Inverted index is used to retrieve posting lists which are processed /// during the fuzzyFind process. llvm::DenseMap InvertedIndex; + Corpus Corpus; std::shared_ptr KeepAlive; // poor man's move-only std::any // Size of memory retained by KeepAlive. size_t BackingDataSize = 0; diff --git a/clang-tools-extra/clangd/index/dex/Iterator.cpp b/clang-tools-extra/clangd/index/dex/Iterator.cpp index de6232974777..e7794d4aa239 100644 --- a/clang-tools-extra/clangd/index/dex/Iterator.cpp +++ b/clang-tools-extra/clangd/index/dex/Iterator.cpp @@ -64,7 +64,7 @@ public: float consume() override { assert(!reachedEnd() && "AND iterator can't consume() at the end."); - float Boost = DEFAULT_BOOST_SCORE; + float Boost = 1; for (const auto &Child : Children) Boost *= Child->consume(); return Boost; @@ -175,12 +175,12 @@ public: return Result; } - // Returns the maximum boosting score among all Children when iterator is not - // exhausted and points to the given ID, DEFAULT_BOOST_SCORE otherwise. + // Returns the maximum boosting score among all Children when iterator + // points to the current ID. float consume() override { assert(!reachedEnd() && "OR iterator can't consume() at the end."); const DocID ID = peek(); - float Boost = DEFAULT_BOOST_SCORE; + float Boost = 1; for (const auto &Child : Children) if (!Child->reachedEnd() && Child->peek() == ID) Boost = std::max(Boost, Child->consume()); @@ -236,7 +236,7 @@ public: float consume() override { assert(!reachedEnd() && "TRUE iterator can't consume() at the end."); - return DEFAULT_BOOST_SCORE; + return 1; } size_t estimateSize() const override { return Size; } @@ -330,30 +330,30 @@ std::vector> consume(Iterator &It) { } std::unique_ptr -createAnd(std::vector> Children) { +Corpus::intersect(std::vector> Children) const { // If there is exactly one child, pull it one level up: AND(Child) -> Child. return Children.size() == 1 ? std::move(Children.front()) : llvm::make_unique(move(Children)); } std::unique_ptr -createOr(std::vector> Children) { +Corpus::unionOf(std::vector> Children) const { // If there is exactly one child, pull it one level up: OR(Child) -> Child. return Children.size() == 1 ? std::move(Children.front()) : llvm::make_unique(move(Children)); } -std::unique_ptr createTrue(DocID Size) { +std::unique_ptr Corpus::all() const { return llvm::make_unique(Size); } -std::unique_ptr createBoost(std::unique_ptr Child, - float Factor) { +std::unique_ptr Corpus::boost(std::unique_ptr Child, + float Factor) const { return llvm::make_unique(move(Child), Factor); } -std::unique_ptr createLimit(std::unique_ptr Child, - size_t Limit) { +std::unique_ptr Corpus::limit(std::unique_ptr Child, + size_t Limit) const { return llvm::make_unique(move(Child), Limit); } diff --git a/clang-tools-extra/clangd/index/dex/Iterator.h b/clang-tools-extra/clangd/index/dex/Iterator.h index 3066a8c78460..ba1efbf813fd 100644 --- a/clang-tools-extra/clangd/index/dex/Iterator.h +++ b/clang-tools-extra/clangd/index/dex/Iterator.h @@ -101,8 +101,6 @@ public: return Iterator.dump(OS); } - constexpr static float DEFAULT_BOOST_SCORE = 1; - private: virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0; }; @@ -117,69 +115,74 @@ private: /// to acquire preliminary scores of requested items. std::vector> consume(Iterator &It); -/// Returns AND Iterator which performs the intersection of the PostingLists of -/// its children. -/// -/// consume(): AND Iterator returns the product of Childrens' boosting scores -/// when not exhausted and DEFAULT_BOOST_SCORE otherwise. -std::unique_ptr -createAnd(std::vector> Children); - -/// Returns OR Iterator which performs the union of the PostingLists of its -/// children. -/// -/// consume(): OR Iterator returns the highest boost value among children -/// pointing to requested item when not exhausted and DEFAULT_BOOST_SCORE -/// otherwise. -std::unique_ptr -createOr(std::vector> Children); - -/// Returns TRUE Iterator which iterates over "virtual" PostingList containing -/// all items in range [0, Size) in an efficient manner. -/// -/// TRUE returns DEFAULT_BOOST_SCORE for each processed item. -std::unique_ptr createTrue(DocID Size); - -/// Returns BOOST iterator which multiplies the score of each item by given -/// factor. Boosting can be used as a computationally inexpensive filtering. -/// Users can return significantly more items using consumeAndBoost() and then -/// trim Top K using retrieval score. -std::unique_ptr createBoost(std::unique_ptr Child, - float Factor); - -/// Returns LIMIT iterator, which yields up to N elements of its child iterator. -/// Elements only count towards the limit if they are part of the final result -/// set. Therefore the following iterator (AND (2) (LIMIT (1 2) 1)) yields (2), -/// not (). -std::unique_ptr createLimit(std::unique_ptr Child, - size_t Limit); - -/// This allows createAnd(create(...), create(...)) syntax. -template std::unique_ptr createAnd(Args... args) { - std::vector> Children; - populateChildren(Children, args...); - return createAnd(move(Children)); -} - -/// This allows createOr(create(...), create(...)) syntax. -template std::unique_ptr createOr(Args... args) { - std::vector> Children; - populateChildren(Children, args...); - return createOr(move(Children)); -} - -template +namespace detail { +// Variadic template machinery. +inline void populateChildren(std::vector> &) {} +template void populateChildren(std::vector> &Children, - HeadT &Head, TailT &... Tail) { + std::unique_ptr Head, TailT... Tail) { Children.push_back(move(Head)); - populateChildren(Children, Tail...); + populateChildren(Children, move(Tail)...); } +} // namespace detail -template -void populateChildren(std::vector> &Children, - HeadT &Head) { - Children.push_back(move(Head)); -} +// A corpus is a set of documents, and a factory for iterators over them. +class Corpus { + DocID Size; + +public: + explicit Corpus(DocID Size) : Size(Size) {} + + /// Returns AND Iterator which performs the intersection of the PostingLists + /// of its children. + /// + /// consume(): AND Iterator returns the product of Childrens' boosting + /// scores. + std::unique_ptr + intersect(std::vector> Children) const; + + /// Returns OR Iterator which performs the union of the PostingLists of its + /// children. + /// + /// consume(): OR Iterator returns the highest boost value among children + /// containing the requested item. + std::unique_ptr + unionOf(std::vector> Children) const; + + /// Returns TRUE Iterator which iterates over "virtual" PostingList + /// containing all items in range [0, Size) in an efficient manner. + std::unique_ptr all() const; + + /// Returns BOOST iterator which multiplies the score of each item by given + /// factor. Boosting can be used as a computationally inexpensive filtering. + /// Users can return significantly more items using consumeAndBoost() and + /// then trim Top K using retrieval score. + std::unique_ptr boost(std::unique_ptr Child, + float Factor) const; + + /// Returns LIMIT iterator, which yields up to N elements of its child + /// iterator. Elements only count towards the limit if they are part of the + /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2) + /// 1)) yields (2), not (). + std::unique_ptr limit(std::unique_ptr Child, + size_t Limit) const; + + /// This allows intersect(create(...), create(...)) syntax. + template + std::unique_ptr intersect(Args... args) const { + std::vector> Children; + detail::populateChildren(Children, std::forward(args)...); + return intersect(move(Children)); + } + + /// This allows unionOf(create(...), create(...)) syntax. + template + std::unique_ptr unionOf(Args... args) const { + std::vector> Children; + detail::populateChildren(Children, std::forward(args)...); + return unionOf(move(Children)); + } +}; } // namespace dex } // namespace clangd diff --git a/clang-tools-extra/clangd/index/dex/PostingList.cpp b/clang-tools-extra/clangd/index/dex/PostingList.cpp index cab869f6fc4c..4cb6395aab1c 100644 --- a/clang-tools-extra/clangd/index/dex/PostingList.cpp +++ b/clang-tools-extra/clangd/index/dex/PostingList.cpp @@ -63,7 +63,7 @@ public: float consume() override { assert(!reachedEnd() && "Posting List iterator can't consume() at the end."); - return DEFAULT_BOOST_SCORE; + return 1; } size_t estimateSize() const override { diff --git a/clang-tools-extra/unittests/clangd/DexTests.cpp b/clang-tools-extra/unittests/clangd/DexTests.cpp index 6d761f513f91..28ec9c5c8344 100644 --- a/clang-tools-extra/unittests/clangd/DexTests.cpp +++ b/clang-tools-extra/unittests/clangd/DexTests.cpp @@ -70,15 +70,16 @@ TEST(DexIterators, DocumentIterator) { } TEST(DexIterators, AndTwoLists) { + Corpus C{10000}; const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); - auto And = createAnd(L1.iterator(), L0.iterator()); + auto And = C.intersect(L1.iterator(), L0.iterator()); EXPECT_FALSE(And->reachedEnd()); EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U)); - And = createAnd(L0.iterator(), L1.iterator()); + And = C.intersect(L0.iterator(), L1.iterator()); And->advanceTo(0); EXPECT_EQ(And->peek(), 0U); @@ -94,11 +95,12 @@ TEST(DexIterators, AndTwoLists) { } TEST(DexIterators, AndThreeLists) { + Corpus C{10000}; const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); const PostingList L2({1, 4, 7, 11, 30, 60, 320, 9000}); - auto And = createAnd(L0.iterator(), L1.iterator(), L2.iterator()); + auto And = C.intersect(L0.iterator(), L1.iterator(), L2.iterator()); EXPECT_EQ(And->peek(), 7U); And->advanceTo(300); EXPECT_EQ(And->peek(), 320U); @@ -108,10 +110,11 @@ TEST(DexIterators, AndThreeLists) { } TEST(DexIterators, OrTwoLists) { + Corpus C{10000}; const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); - auto Or = createOr(L0.iterator(), L1.iterator()); + auto Or = C.unionOf(L0.iterator(), L1.iterator()); EXPECT_FALSE(Or->reachedEnd()); EXPECT_EQ(Or->peek(), 0U); @@ -134,18 +137,19 @@ TEST(DexIterators, OrTwoLists) { Or->advanceTo(9001); EXPECT_TRUE(Or->reachedEnd()); - Or = createOr(L0.iterator(), L1.iterator()); + Or = C.unionOf(L0.iterator(), L1.iterator()); EXPECT_THAT(consumeIDs(*Or), ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U)); } TEST(DexIterators, OrThreeLists) { + Corpus C{10000}; const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); const PostingList L2({1, 4, 7, 11, 30, 60, 320, 9000}); - auto Or = createOr(L0.iterator(), L1.iterator(), L2.iterator()); + auto Or = C.unionOf(L0.iterator(), L1.iterator(), L2.iterator()); EXPECT_FALSE(Or->reachedEnd()); EXPECT_EQ(Or->peek(), 0U); @@ -194,17 +198,18 @@ TEST(DexIterators, QueryTree) { // |1, 5, 7, 9| |1, 5| |0, 3, 5| // +----------+ +----+ +-------+ // + Corpus C{10}; const PostingList L0({1, 3, 5, 8, 9}); const PostingList L1({1, 5, 7, 9}); const PostingList L2({1, 5}); const PostingList L3({0, 3, 5}); // Root of the query tree: [1, 5] - auto Root = createAnd( + auto Root = C.intersect( // Lower And Iterator: [1, 5, 9] - createAnd(L0.iterator(), createBoost(L1.iterator(), 2U)), + C.intersect(L0.iterator(), C.boost(L1.iterator(), 2U)), // Lower Or Iterator: [0, 1, 5] - createOr(createBoost(L2.iterator(), 3U), createBoost(L3.iterator(), 4U))); + C.unionOf(C.boost(L2.iterator(), 3U), C.boost(L3.iterator(), 4U))); EXPECT_FALSE(Root->reachedEnd()); EXPECT_EQ(Root->peek(), 1U); @@ -226,6 +231,7 @@ TEST(DexIterators, QueryTree) { } TEST(DexIterators, StringRepresentation) { + Corpus C{10}; const PostingList L1({1, 3, 5}); const PostingList L2({1, 7, 9}); @@ -238,56 +244,60 @@ TEST(DexIterators, StringRepresentation) { auto I2 = L1.iterator(&Tok); EXPECT_EQ(llvm::to_string(*I2), "T=L2"); - auto Tree = createLimit(createAnd(move(I1), move(I2)), 10); + auto Tree = C.limit(C.intersect(move(I1), move(I2)), 10); EXPECT_EQ(llvm::to_string(*Tree), "(LIMIT 10 (& [1 3 5] T=L2))"); } TEST(DexIterators, Limit) { + Corpus C{10000}; const PostingList L0({3, 6, 7, 20, 42, 100}); const PostingList L1({1, 3, 5, 6, 7, 30, 100}); const PostingList L2({0, 3, 5, 7, 8, 100}); - auto DocIterator = createLimit(L0.iterator(), 42); + auto DocIterator = C.limit(L0.iterator(), 42); EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7, 20, 42, 100)); - DocIterator = createLimit(L0.iterator(), 3); + DocIterator = C.limit(L0.iterator(), 3); EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7)); - DocIterator = createLimit(L0.iterator(), 0); + DocIterator = C.limit(L0.iterator(), 0); EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre()); - auto AndIterator = createAnd( - createLimit(createTrue(9000), 343), createLimit(L0.iterator(), 2), - createLimit(L1.iterator(), 3), createLimit(L2.iterator(), 42)); + auto AndIterator = + C.intersect(C.limit(C.all(), 343), C.limit(L0.iterator(), 2), + C.limit(L1.iterator(), 3), C.limit(L2.iterator(), 42)); EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(3, 7)); } TEST(DexIterators, True) { - auto TrueIterator = createTrue(0U); + Corpus C{0}; + auto TrueIterator = C.all(); EXPECT_TRUE(TrueIterator->reachedEnd()); EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre()); + C = Corpus{7}; const PostingList L0({1, 2, 5, 7}); - TrueIterator = createTrue(7U); + TrueIterator = C.all(); EXPECT_THAT(TrueIterator->peek(), 0); - auto AndIterator = createAnd(L0.iterator(), move(TrueIterator)); + auto AndIterator = C.intersect(L0.iterator(), move(TrueIterator)); EXPECT_FALSE(AndIterator->reachedEnd()); EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5)); } TEST(DexIterators, Boost) { - auto BoostIterator = createBoost(createTrue(5U), 42U); + Corpus C{5}; + auto BoostIterator = C.boost(C.all(), 42U); EXPECT_FALSE(BoostIterator->reachedEnd()); auto ElementBoost = BoostIterator->consume(); EXPECT_THAT(ElementBoost, 42U); const PostingList L0({2, 4}); const PostingList L1({1, 4}); - auto Root = createOr(createTrue(5U), createBoost(L0.iterator(), 2U), - createBoost(L1.iterator(), 3U)); + auto Root = C.unionOf(C.all(), C.boost(L0.iterator(), 2U), + C.boost(L1.iterator(), 3U)); ElementBoost = Root->consume(); - EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE); + EXPECT_THAT(ElementBoost, 1); Root->advance(); EXPECT_THAT(Root->peek(), 1U); ElementBoost = Root->consume();