From a659d779f8b47342fc72735cb4ed5d095658abd8 Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Tue, 2 Oct 2018 19:59:23 +0000 Subject: [PATCH] Reland r343589 "[clangd] Dex: add Corpus factory for iterators, rename, fold constant. NFC"" This reverts commit r343610. llvm-svn: 343622 --- clang-tools-extra/clangd/index/dex/Dex.cpp | 28 ++-- clang-tools-extra/clangd/index/dex/Dex.h | 3 +- .../clangd/index/dex/Iterator.cpp | 24 ++-- clang-tools-extra/clangd/index/dex/Iterator.h | 125 +++++++++--------- .../clangd/index/dex/PostingList.cpp | 2 +- .../unittests/clangd/DexTests.cpp | 56 ++++---- 6 files changed, 127 insertions(+), 111 deletions(-) diff --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp index 2c0c3d738776..3b236f8918a3 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.cpp +++ b/clang-tools-extra/clangd/index/dex/Dex.cpp @@ -56,7 +56,8 @@ std::vector generateSearchTokens(const Symbol &Sym) { std::vector> createFileProximityIterators( llvm::ArrayRef ProximityPaths, llvm::ArrayRef URISchemes, - const llvm::DenseMap &InvertedIndex) { + const llvm::DenseMap &InvertedIndex, + const Corpus &Corpus) { std::vector> BoostingIterators; // Deduplicate parent URIs extracted from the ProximityPaths. llvm::StringSet<> ParentURIs; @@ -91,8 +92,8 @@ std::vector> createFileProximityIterators( if (It != InvertedIndex.end()) { // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. PathProximitySignals.SymbolURI = ParentURI; - BoostingIterators.push_back(createBoost(It->second.iterator(&It->first), - PathProximitySignals.evaluate())); + BoostingIterators.push_back(Corpus.boost( + It->second.iterator(&It->first), PathProximitySignals.evaluate())); } } return BoostingIterators; @@ -101,6 +102,7 @@ std::vector> createFileProximityIterators( } // namespace void Dex::buildIndex() { + this->Corpus = dex::Corpus(Symbols.size()); std::vector> ScoredSymbols(Symbols.size()); for (size_t I = 0; I < Symbols.size(); ++I) { @@ -159,7 +161,7 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req, TrigramIterators.push_back(It->second.iterator(&It->first)); } if (!TrigramIterators.empty()) - TopLevelChildren.push_back(createAnd(move(TrigramIterators))); + TopLevelChildren.push_back(Corpus.intersect(move(TrigramIterators))); // Generate scope tokens for search query. std::vector> ScopeIterators; @@ -170,22 +172,22 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req, ScopeIterators.push_back(It->second.iterator(&It->first)); } if (Req.AnyScope) - ScopeIterators.push_back(createBoost(createTrue(Symbols.size()), - ScopeIterators.empty() ? 1.0 : 0.2)); + ScopeIterators.push_back( + Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2)); // Add OR iterator for scopes if there are any Scope Iterators. if (!ScopeIterators.empty()) - TopLevelChildren.push_back(createOr(move(ScopeIterators))); + TopLevelChildren.push_back(Corpus.unionOf(move(ScopeIterators))); // Add proximity paths boosting. auto BoostingIterators = createFileProximityIterators( - Req.ProximityPaths, URISchemes, InvertedIndex); + Req.ProximityPaths, URISchemes, InvertedIndex, Corpus); // Boosting iterators do not actually filter symbols. In order to preserve // the validity of resulting query, TRUE iterator should be added along // BOOSTs. if (!BoostingIterators.empty()) { - BoostingIterators.push_back(createTrue(Symbols.size())); - TopLevelChildren.push_back(createOr(move(BoostingIterators))); + BoostingIterators.push_back(Corpus.all()); + TopLevelChildren.push_back(Corpus.unionOf(move(BoostingIterators))); } if (Req.RestrictForCodeCompletion) @@ -196,14 +198,14 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req, // Use TRUE iterator if both trigrams and scopes from the query are not // present in the symbol index. auto QueryIterator = TopLevelChildren.empty() - ? createTrue(Symbols.size()) - : createAnd(move(TopLevelChildren)); + ? Corpus.all() + : Corpus.intersect(move(TopLevelChildren)); // Retrieve more items than it was requested: some of the items with high // final score might not be retrieved otherwise. // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as // using 100x of the requested number might not be good in practice, e.g. // when the requested number of items is small. - auto Root = Req.Limit ? createLimit(move(QueryIterator), *Req.Limit * 100) + auto Root = Req.Limit ? Corpus.limit(move(QueryIterator), *Req.Limit * 100) : move(QueryIterator); SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root)); vlog("Dex query tree: {0}", *Root); diff --git a/clang-tools-extra/clangd/index/dex/Dex.h b/clang-tools-extra/clangd/index/dex/Dex.h index 4b2d856588d7..baa215310fe9 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.h +++ b/clang-tools-extra/clangd/index/dex/Dex.h @@ -44,7 +44,7 @@ public: // All symbols must outlive this index. template Dex(Range &&Symbols, llvm::ArrayRef Schemes) - : URISchemes(Schemes) { + : Corpus(0), URISchemes(Schemes) { // If Schemes don't contain any items, fall back to SymbolCollector's // default URI schemes. if (URISchemes.empty()) { @@ -101,6 +101,7 @@ private: /// std. Inverted index is used to retrieve posting lists which are processed /// during the fuzzyFind process. llvm::DenseMap InvertedIndex; + dex::Corpus Corpus; std::shared_ptr KeepAlive; // poor man's move-only std::any // Size of memory retained by KeepAlive. size_t BackingDataSize = 0; diff --git a/clang-tools-extra/clangd/index/dex/Iterator.cpp b/clang-tools-extra/clangd/index/dex/Iterator.cpp index de6232974777..e7794d4aa239 100644 --- a/clang-tools-extra/clangd/index/dex/Iterator.cpp +++ b/clang-tools-extra/clangd/index/dex/Iterator.cpp @@ -64,7 +64,7 @@ public: float consume() override { assert(!reachedEnd() && "AND iterator can't consume() at the end."); - float Boost = DEFAULT_BOOST_SCORE; + float Boost = 1; for (const auto &Child : Children) Boost *= Child->consume(); return Boost; @@ -175,12 +175,12 @@ public: return Result; } - // Returns the maximum boosting score among all Children when iterator is not - // exhausted and points to the given ID, DEFAULT_BOOST_SCORE otherwise. + // Returns the maximum boosting score among all Children when iterator + // points to the current ID. float consume() override { assert(!reachedEnd() && "OR iterator can't consume() at the end."); const DocID ID = peek(); - float Boost = DEFAULT_BOOST_SCORE; + float Boost = 1; for (const auto &Child : Children) if (!Child->reachedEnd() && Child->peek() == ID) Boost = std::max(Boost, Child->consume()); @@ -236,7 +236,7 @@ public: float consume() override { assert(!reachedEnd() && "TRUE iterator can't consume() at the end."); - return DEFAULT_BOOST_SCORE; + return 1; } size_t estimateSize() const override { return Size; } @@ -330,30 +330,30 @@ std::vector> consume(Iterator &It) { } std::unique_ptr -createAnd(std::vector> Children) { +Corpus::intersect(std::vector> Children) const { // If there is exactly one child, pull it one level up: AND(Child) -> Child. return Children.size() == 1 ? std::move(Children.front()) : llvm::make_unique(move(Children)); } std::unique_ptr -createOr(std::vector> Children) { +Corpus::unionOf(std::vector> Children) const { // If there is exactly one child, pull it one level up: OR(Child) -> Child. return Children.size() == 1 ? std::move(Children.front()) : llvm::make_unique(move(Children)); } -std::unique_ptr createTrue(DocID Size) { +std::unique_ptr Corpus::all() const { return llvm::make_unique(Size); } -std::unique_ptr createBoost(std::unique_ptr Child, - float Factor) { +std::unique_ptr Corpus::boost(std::unique_ptr Child, + float Factor) const { return llvm::make_unique(move(Child), Factor); } -std::unique_ptr createLimit(std::unique_ptr Child, - size_t Limit) { +std::unique_ptr Corpus::limit(std::unique_ptr Child, + size_t Limit) const { return llvm::make_unique(move(Child), Limit); } diff --git a/clang-tools-extra/clangd/index/dex/Iterator.h b/clang-tools-extra/clangd/index/dex/Iterator.h index 04b344123807..2b3bd4481a1e 100644 --- a/clang-tools-extra/clangd/index/dex/Iterator.h +++ b/clang-tools-extra/clangd/index/dex/Iterator.h @@ -98,8 +98,6 @@ public: return Iterator.dump(OS); } - constexpr static float DEFAULT_BOOST_SCORE = 1; - private: virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0; }; @@ -114,69 +112,74 @@ private: /// to acquire preliminary scores of requested items. std::vector> consume(Iterator &It); -/// Returns AND Iterator which performs the intersection of the PostingLists of -/// its children. -/// -/// consume(): AND Iterator returns the product of Childrens' boosting scores -/// when not exhausted and DEFAULT_BOOST_SCORE otherwise. -std::unique_ptr -createAnd(std::vector> Children); - -/// Returns OR Iterator which performs the union of the PostingLists of its -/// children. -/// -/// consume(): OR Iterator returns the highest boost value among children -/// pointing to requested item when not exhausted and DEFAULT_BOOST_SCORE -/// otherwise. -std::unique_ptr -createOr(std::vector> Children); - -/// Returns TRUE Iterator which iterates over "virtual" PostingList containing -/// all items in range [0, Size) in an efficient manner. -/// -/// TRUE returns DEFAULT_BOOST_SCORE for each processed item. -std::unique_ptr createTrue(DocID Size); - -/// Returns BOOST iterator which multiplies the score of each item by given -/// factor. Boosting can be used as a computationally inexpensive filtering. -/// Users can return significantly more items using consumeAndBoost() and then -/// trim Top K using retrieval score. -std::unique_ptr createBoost(std::unique_ptr Child, - float Factor); - -/// Returns LIMIT iterator, which yields up to N elements of its child iterator. -/// Elements only count towards the limit if they are part of the final result -/// set. Therefore the following iterator (AND (2) (LIMIT (1 2) 1)) yields (2), -/// not (). -std::unique_ptr createLimit(std::unique_ptr Child, - size_t Limit); - -/// This allows createAnd(create(...), create(...)) syntax. -template std::unique_ptr createAnd(Args... args) { - std::vector> Children; - populateChildren(Children, args...); - return createAnd(move(Children)); -} - -/// This allows createOr(create(...), create(...)) syntax. -template std::unique_ptr createOr(Args... args) { - std::vector> Children; - populateChildren(Children, args...); - return createOr(move(Children)); -} - -template +namespace detail { +// Variadic template machinery. +inline void populateChildren(std::vector> &) {} +template void populateChildren(std::vector> &Children, - HeadT &Head, TailT &... Tail) { + std::unique_ptr Head, TailT... Tail) { Children.push_back(move(Head)); - populateChildren(Children, Tail...); + populateChildren(Children, move(Tail)...); } +} // namespace detail -template -void populateChildren(std::vector> &Children, - HeadT &Head) { - Children.push_back(move(Head)); -} +// A corpus is a set of documents, and a factory for iterators over them. +class Corpus { + DocID Size; + +public: + explicit Corpus(DocID Size) : Size(Size) {} + + /// Returns AND Iterator which performs the intersection of the PostingLists + /// of its children. + /// + /// consume(): AND Iterator returns the product of Childrens' boosting + /// scores. + std::unique_ptr + intersect(std::vector> Children) const; + + /// Returns OR Iterator which performs the union of the PostingLists of its + /// children. + /// + /// consume(): OR Iterator returns the highest boost value among children + /// containing the requested item. + std::unique_ptr + unionOf(std::vector> Children) const; + + /// Returns TRUE Iterator which iterates over "virtual" PostingList + /// containing all items in range [0, Size) in an efficient manner. + std::unique_ptr all() const; + + /// Returns BOOST iterator which multiplies the score of each item by given + /// factor. Boosting can be used as a computationally inexpensive filtering. + /// Users can return significantly more items using consumeAndBoost() and + /// then trim Top K using retrieval score. + std::unique_ptr boost(std::unique_ptr Child, + float Factor) const; + + /// Returns LIMIT iterator, which yields up to N elements of its child + /// iterator. Elements only count towards the limit if they are part of the + /// final result set. Therefore the following iterator (AND (2) (LIMIT (1 2) + /// 1)) yields (2), not (). + std::unique_ptr limit(std::unique_ptr Child, + size_t Limit) const; + + /// This allows intersect(create(...), create(...)) syntax. + template + std::unique_ptr intersect(Args... args) const { + std::vector> Children; + detail::populateChildren(Children, std::forward(args)...); + return intersect(move(Children)); + } + + /// This allows unionOf(create(...), create(...)) syntax. + template + std::unique_ptr unionOf(Args... args) const { + std::vector> Children; + detail::populateChildren(Children, std::forward(args)...); + return unionOf(move(Children)); + } +}; } // namespace dex } // namespace clangd diff --git a/clang-tools-extra/clangd/index/dex/PostingList.cpp b/clang-tools-extra/clangd/index/dex/PostingList.cpp index cab869f6fc4c..4cb6395aab1c 100644 --- a/clang-tools-extra/clangd/index/dex/PostingList.cpp +++ b/clang-tools-extra/clangd/index/dex/PostingList.cpp @@ -63,7 +63,7 @@ public: float consume() override { assert(!reachedEnd() && "Posting List iterator can't consume() at the end."); - return DEFAULT_BOOST_SCORE; + return 1; } size_t estimateSize() const override { diff --git a/clang-tools-extra/unittests/clangd/DexTests.cpp b/clang-tools-extra/unittests/clangd/DexTests.cpp index 6d761f513f91..28ec9c5c8344 100644 --- a/clang-tools-extra/unittests/clangd/DexTests.cpp +++ b/clang-tools-extra/unittests/clangd/DexTests.cpp @@ -70,15 +70,16 @@ TEST(DexIterators, DocumentIterator) { } TEST(DexIterators, AndTwoLists) { + Corpus C{10000}; const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); - auto And = createAnd(L1.iterator(), L0.iterator()); + auto And = C.intersect(L1.iterator(), L0.iterator()); EXPECT_FALSE(And->reachedEnd()); EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U)); - And = createAnd(L0.iterator(), L1.iterator()); + And = C.intersect(L0.iterator(), L1.iterator()); And->advanceTo(0); EXPECT_EQ(And->peek(), 0U); @@ -94,11 +95,12 @@ TEST(DexIterators, AndTwoLists) { } TEST(DexIterators, AndThreeLists) { + Corpus C{10000}; const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); const PostingList L2({1, 4, 7, 11, 30, 60, 320, 9000}); - auto And = createAnd(L0.iterator(), L1.iterator(), L2.iterator()); + auto And = C.intersect(L0.iterator(), L1.iterator(), L2.iterator()); EXPECT_EQ(And->peek(), 7U); And->advanceTo(300); EXPECT_EQ(And->peek(), 320U); @@ -108,10 +110,11 @@ TEST(DexIterators, AndThreeLists) { } TEST(DexIterators, OrTwoLists) { + Corpus C{10000}; const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); - auto Or = createOr(L0.iterator(), L1.iterator()); + auto Or = C.unionOf(L0.iterator(), L1.iterator()); EXPECT_FALSE(Or->reachedEnd()); EXPECT_EQ(Or->peek(), 0U); @@ -134,18 +137,19 @@ TEST(DexIterators, OrTwoLists) { Or->advanceTo(9001); EXPECT_TRUE(Or->reachedEnd()); - Or = createOr(L0.iterator(), L1.iterator()); + Or = C.unionOf(L0.iterator(), L1.iterator()); EXPECT_THAT(consumeIDs(*Or), ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U)); } TEST(DexIterators, OrThreeLists) { + Corpus C{10000}; const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); const PostingList L2({1, 4, 7, 11, 30, 60, 320, 9000}); - auto Or = createOr(L0.iterator(), L1.iterator(), L2.iterator()); + auto Or = C.unionOf(L0.iterator(), L1.iterator(), L2.iterator()); EXPECT_FALSE(Or->reachedEnd()); EXPECT_EQ(Or->peek(), 0U); @@ -194,17 +198,18 @@ TEST(DexIterators, QueryTree) { // |1, 5, 7, 9| |1, 5| |0, 3, 5| // +----------+ +----+ +-------+ // + Corpus C{10}; const PostingList L0({1, 3, 5, 8, 9}); const PostingList L1({1, 5, 7, 9}); const PostingList L2({1, 5}); const PostingList L3({0, 3, 5}); // Root of the query tree: [1, 5] - auto Root = createAnd( + auto Root = C.intersect( // Lower And Iterator: [1, 5, 9] - createAnd(L0.iterator(), createBoost(L1.iterator(), 2U)), + C.intersect(L0.iterator(), C.boost(L1.iterator(), 2U)), // Lower Or Iterator: [0, 1, 5] - createOr(createBoost(L2.iterator(), 3U), createBoost(L3.iterator(), 4U))); + C.unionOf(C.boost(L2.iterator(), 3U), C.boost(L3.iterator(), 4U))); EXPECT_FALSE(Root->reachedEnd()); EXPECT_EQ(Root->peek(), 1U); @@ -226,6 +231,7 @@ TEST(DexIterators, QueryTree) { } TEST(DexIterators, StringRepresentation) { + Corpus C{10}; const PostingList L1({1, 3, 5}); const PostingList L2({1, 7, 9}); @@ -238,56 +244,60 @@ TEST(DexIterators, StringRepresentation) { auto I2 = L1.iterator(&Tok); EXPECT_EQ(llvm::to_string(*I2), "T=L2"); - auto Tree = createLimit(createAnd(move(I1), move(I2)), 10); + auto Tree = C.limit(C.intersect(move(I1), move(I2)), 10); EXPECT_EQ(llvm::to_string(*Tree), "(LIMIT 10 (& [1 3 5] T=L2))"); } TEST(DexIterators, Limit) { + Corpus C{10000}; const PostingList L0({3, 6, 7, 20, 42, 100}); const PostingList L1({1, 3, 5, 6, 7, 30, 100}); const PostingList L2({0, 3, 5, 7, 8, 100}); - auto DocIterator = createLimit(L0.iterator(), 42); + auto DocIterator = C.limit(L0.iterator(), 42); EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7, 20, 42, 100)); - DocIterator = createLimit(L0.iterator(), 3); + DocIterator = C.limit(L0.iterator(), 3); EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7)); - DocIterator = createLimit(L0.iterator(), 0); + DocIterator = C.limit(L0.iterator(), 0); EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre()); - auto AndIterator = createAnd( - createLimit(createTrue(9000), 343), createLimit(L0.iterator(), 2), - createLimit(L1.iterator(), 3), createLimit(L2.iterator(), 42)); + auto AndIterator = + C.intersect(C.limit(C.all(), 343), C.limit(L0.iterator(), 2), + C.limit(L1.iterator(), 3), C.limit(L2.iterator(), 42)); EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(3, 7)); } TEST(DexIterators, True) { - auto TrueIterator = createTrue(0U); + Corpus C{0}; + auto TrueIterator = C.all(); EXPECT_TRUE(TrueIterator->reachedEnd()); EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre()); + C = Corpus{7}; const PostingList L0({1, 2, 5, 7}); - TrueIterator = createTrue(7U); + TrueIterator = C.all(); EXPECT_THAT(TrueIterator->peek(), 0); - auto AndIterator = createAnd(L0.iterator(), move(TrueIterator)); + auto AndIterator = C.intersect(L0.iterator(), move(TrueIterator)); EXPECT_FALSE(AndIterator->reachedEnd()); EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5)); } TEST(DexIterators, Boost) { - auto BoostIterator = createBoost(createTrue(5U), 42U); + Corpus C{5}; + auto BoostIterator = C.boost(C.all(), 42U); EXPECT_FALSE(BoostIterator->reachedEnd()); auto ElementBoost = BoostIterator->consume(); EXPECT_THAT(ElementBoost, 42U); const PostingList L0({2, 4}); const PostingList L1({1, 4}); - auto Root = createOr(createTrue(5U), createBoost(L0.iterator(), 2U), - createBoost(L1.iterator(), 3U)); + auto Root = C.unionOf(C.all(), C.boost(L0.iterator(), 2U), + C.boost(L1.iterator(), 3U)); ElementBoost = Root->consume(); - EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE); + EXPECT_THAT(ElementBoost, 1); Root->advance(); EXPECT_THAT(Root->peek(), 1U); ElementBoost = Root->consume();