forked from OSchip/llvm-project
[clangd] Simplify Dex query tree logic and fix missing-posting-list bug
Summary: The bug being fixed: when a posting list doesn't exist in the index, it was previously just dropped from the query rather than being treated as empty. Now that we have the FALSE iterator, we can use it instead. The query tree logic previously had a bunch of special cases to detect whether subtrees are empty. Now we just naively build the whole tree, and rely on the query optimizations to drop the trivial parts. Finally, there was a bug in trigram generation: the empty query would generate a single trigram "$$$" instead of no trigrams. This had no effect (there was no posting list, so the other bug cancelled it out). But we now have to fix this bug too. Reviewers: ilya-biryukov Subscribers: ioeric, MaskRay, jkorous, arphaman, kadircet, cfe-commits Differential Revision: https://reviews.llvm.org/D52796 llvm-svn: 343802
This commit is contained in:
parent
aa728f1afa
commit
50b89f0a9b
|
@ -62,7 +62,7 @@ std::vector<Token> generateSearchTokens(const Symbol &Sym) {
|
|||
}
|
||||
|
||||
// Constructs BOOST iterators for Path Proximities.
|
||||
std::vector<std::unique_ptr<Iterator>> createFileProximityIterators(
|
||||
std::unique_ptr<Iterator> createFileProximityIterator(
|
||||
llvm::ArrayRef<std::string> ProximityPaths,
|
||||
llvm::ArrayRef<std::string> URISchemes,
|
||||
const llvm::DenseMap<Token, PostingList> &InvertedIndex,
|
||||
|
@ -105,7 +105,8 @@ std::vector<std::unique_ptr<Iterator>> createFileProximityIterators(
|
|||
It->second.iterator(&It->first), PathProximitySignals.evaluate()));
|
||||
}
|
||||
}
|
||||
return BoostingIterators;
|
||||
BoostingIterators.push_back(Corpus.all());
|
||||
return Corpus.unionOf(std::move(BoostingIterators));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
@ -147,6 +148,12 @@ void Dex::buildIndex() {
|
|||
{TokenToPostingList.first, PostingList(TokenToPostingList.second)});
|
||||
}
|
||||
|
||||
std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
|
||||
auto It = InvertedIndex.find(Tok);
|
||||
return It == InvertedIndex.end() ? Corpus.none()
|
||||
: It->second.iterator(&It->first);
|
||||
}
|
||||
|
||||
/// Constructs iterators over tokens extracted from the query and exhausts it
|
||||
/// while applying Callback to each symbol in the order of decreasing quality
|
||||
/// of the matched symbols.
|
||||
|
@ -160,64 +167,40 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
|
|||
// Prevent clients from postfiltering them for longer queries.
|
||||
bool More = !Req.Query.empty() && Req.Query.size() < 3;
|
||||
|
||||
std::vector<std::unique_ptr<Iterator>> TopLevelChildren;
|
||||
std::vector<std::unique_ptr<Iterator>> Criteria;
|
||||
const auto TrigramTokens = generateQueryTrigrams(Req.Query);
|
||||
|
||||
// Generate query trigrams and construct AND iterator over all query
|
||||
// trigrams.
|
||||
std::vector<std::unique_ptr<Iterator>> TrigramIterators;
|
||||
for (const auto &Trigram : TrigramTokens) {
|
||||
const auto It = InvertedIndex.find(Trigram);
|
||||
if (It != InvertedIndex.end())
|
||||
TrigramIterators.push_back(It->second.iterator(&It->first));
|
||||
}
|
||||
if (!TrigramIterators.empty())
|
||||
TopLevelChildren.push_back(Corpus.intersect(move(TrigramIterators)));
|
||||
for (const auto &Trigram : TrigramTokens)
|
||||
TrigramIterators.push_back(iterator(Trigram));
|
||||
Criteria.push_back(Corpus.intersect(move(TrigramIterators)));
|
||||
|
||||
// Generate scope tokens for search query.
|
||||
std::vector<std::unique_ptr<Iterator>> ScopeIterators;
|
||||
for (const auto &Scope : Req.Scopes) {
|
||||
Token Tok(Token::Kind::Scope, Scope);
|
||||
const auto It = InvertedIndex.find(Tok);
|
||||
if (It != InvertedIndex.end())
|
||||
ScopeIterators.push_back(It->second.iterator(&It->first));
|
||||
}
|
||||
if (Req.AnyScope)
|
||||
for (const auto &Scope : Req.Scopes)
|
||||
ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
|
||||
if (Req.AnyScope || /*legacy*/ Req.Scopes.empty())
|
||||
ScopeIterators.push_back(
|
||||
Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
|
||||
Criteria.push_back(Corpus.unionOf(move(ScopeIterators)));
|
||||
|
||||
// Add OR iterator for scopes if there are any Scope Iterators.
|
||||
if (!ScopeIterators.empty())
|
||||
TopLevelChildren.push_back(Corpus.unionOf(move(ScopeIterators)));
|
||||
|
||||
// Add proximity paths boosting.
|
||||
auto BoostingIterators = createFileProximityIterators(
|
||||
Req.ProximityPaths, URISchemes, InvertedIndex, Corpus);
|
||||
// Boosting iterators do not actually filter symbols. In order to preserve
|
||||
// the validity of resulting query, TRUE iterator should be added along
|
||||
// BOOSTs.
|
||||
if (!BoostingIterators.empty()) {
|
||||
BoostingIterators.push_back(Corpus.all());
|
||||
TopLevelChildren.push_back(Corpus.unionOf(move(BoostingIterators)));
|
||||
}
|
||||
// Add proximity paths boosting (all symbols, some boosted).
|
||||
Criteria.push_back(createFileProximityIterator(Req.ProximityPaths, URISchemes,
|
||||
InvertedIndex, Corpus));
|
||||
|
||||
if (Req.RestrictForCodeCompletion)
|
||||
TopLevelChildren.push_back(
|
||||
InvertedIndex.find(RestrictedForCodeCompletion)
|
||||
->second.iterator(&RestrictedForCodeCompletion));
|
||||
Criteria.push_back(iterator(RestrictedForCodeCompletion));
|
||||
|
||||
// Use TRUE iterator if both trigrams and scopes from the query are not
|
||||
// present in the symbol index.
|
||||
auto QueryIterator = TopLevelChildren.empty()
|
||||
? Corpus.all()
|
||||
: Corpus.intersect(move(TopLevelChildren));
|
||||
auto Root = Corpus.intersect(move(Criteria));
|
||||
// Retrieve more items than it was requested: some of the items with high
|
||||
// final score might not be retrieved otherwise.
|
||||
// FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as
|
||||
// using 100x of the requested number might not be good in practice, e.g.
|
||||
// when the requested number of items is small.
|
||||
auto Root = Req.Limit ? Corpus.limit(move(QueryIterator), *Req.Limit * 100)
|
||||
: move(QueryIterator);
|
||||
// FIXME(kbobyrev): Tune this ratio.
|
||||
if (Req.Limit)
|
||||
Root = Corpus.limit(move(Root), *Req.Limit * 100);
|
||||
SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
|
||||
vlog("Dex query tree: {0}", *Root);
|
||||
|
||||
|
|
|
@ -87,6 +87,7 @@ public:
|
|||
|
||||
private:
|
||||
void buildIndex();
|
||||
std::unique_ptr<Iterator> iterator(const Token &Tok) const;
|
||||
|
||||
/// Stores symbols sorted in the descending order of symbol quality..
|
||||
std::vector<const Symbol *> Symbols;
|
||||
|
|
|
@ -106,7 +106,6 @@ protected:
|
|||
Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {}
|
||||
|
||||
private:
|
||||
Kind MyKind;
|
||||
virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
|
||||
Kind MyKind;
|
||||
};
|
||||
|
|
|
@ -87,6 +87,8 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
|
|||
}
|
||||
|
||||
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
|
||||
if (Query.empty())
|
||||
return {};
|
||||
std::string LowercaseQuery = Query.lower();
|
||||
if (Query.size() < 3) // short-query trigrams only
|
||||
return {Token(Token::Kind::Trigram, LowercaseQuery)};
|
||||
|
|
|
@ -408,6 +408,7 @@ TEST(DexTrigrams, QueryTrigrams) {
|
|||
EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"}));
|
||||
|
||||
EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({}));
|
||||
EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({}));
|
||||
|
@ -526,8 +527,6 @@ TEST(DexTest, FuzzyMatch) {
|
|||
UnorderedElementsAre("LaughingOutLoud", "LittleOldLady"));
|
||||
}
|
||||
|
||||
// TODO(sammccall): enable after D52796 bugfix.
|
||||
#if 0
|
||||
TEST(DexTest, ShortQuery) {
|
||||
auto I =
|
||||
Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(), URISchemes);
|
||||
|
@ -549,7 +548,6 @@ TEST(DexTest, ShortQuery) {
|
|||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
|
||||
EXPECT_FALSE(Incomplete) << "3-char string is not a short query";
|
||||
}
|
||||
#endif
|
||||
|
||||
TEST(DexTest, MatchQualifiedNamesWithoutSpecificScope) {
|
||||
auto I = Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), RefSlab(),
|
||||
|
@ -614,6 +612,15 @@ TEST(DexTest, IgnoreCases) {
|
|||
EXPECT_THAT(match(*I, Req), UnorderedElementsAre("ns::ABC", "ns::abc"));
|
||||
}
|
||||
|
||||
TEST(DexTest, UnknownPostingList) {
|
||||
// Regression test: we used to ignore unknown scopes and accept any symbol.
|
||||
auto I = Dex::build(generateSymbols({"ns::ABC", "ns::abc"}), RefSlab(),
|
||||
URISchemes);
|
||||
FuzzyFindRequest Req;
|
||||
Req.Scopes = {"ns2::"};
|
||||
EXPECT_THAT(match(*I, Req), UnorderedElementsAre());
|
||||
}
|
||||
|
||||
TEST(DexTest, Lookup) {
|
||||
auto I = Dex::build(generateSymbols({"ns::abc", "ns::xyz"}), RefSlab(), URISchemes);
|
||||
EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));
|
||||
|
|
Loading…
Reference in New Issue