[clangd] Simplify Dex query tree logic and fix missing-posting-list bug

Summary:
The bug being fixed: when a posting list doesn't exist in the index, it
was previously just dropped from the query rather than being treated as
empty. Now that we have the FALSE iterator, we can use it instead.

The query tree logic previously had a bunch of special cases to detect whether
subtrees are empty. Now we just naively build the whole tree, and rely
on the query optimizations to drop the trivial parts.

Finally, there was a bug in trigram generation: the empty query would
generate a single trigram "$$$" instead of no trigrams.
This had no effect (there was no posting list, so the other bug
cancelled it out). But we now have to fix this bug too.

Reviewers: ilya-biryukov

Subscribers: ioeric, MaskRay, jkorous, arphaman, kadircet, cfe-commits

Differential Revision: https://reviews.llvm.org/D52796

llvm-svn: 343802
This commit is contained in:
Sam McCall 2018-10-04 17:18:55 +00:00
parent aa728f1afa
commit 50b89f0a9b
5 changed files with 38 additions and 46 deletions

View File

@ -62,7 +62,7 @@ std::vector<Token> generateSearchTokens(const Symbol &Sym) {
}
// Constructs BOOST iterators for Path Proximities.
std::vector<std::unique_ptr<Iterator>> createFileProximityIterators(
std::unique_ptr<Iterator> createFileProximityIterator(
llvm::ArrayRef<std::string> ProximityPaths,
llvm::ArrayRef<std::string> URISchemes,
const llvm::DenseMap<Token, PostingList> &InvertedIndex,
@ -105,7 +105,8 @@ std::vector<std::unique_ptr<Iterator>> createFileProximityIterators(
It->second.iterator(&It->first), PathProximitySignals.evaluate()));
}
}
return BoostingIterators;
BoostingIterators.push_back(Corpus.all());
return Corpus.unionOf(std::move(BoostingIterators));
}
} // namespace
@ -147,6 +148,12 @@ void Dex::buildIndex() {
{TokenToPostingList.first, PostingList(TokenToPostingList.second)});
}
std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
auto It = InvertedIndex.find(Tok);
return It == InvertedIndex.end() ? Corpus.none()
: It->second.iterator(&It->first);
}
/// Constructs iterators over tokens extracted from the query and exhausts it
/// while applying Callback to each symbol in the order of decreasing quality
/// of the matched symbols.
@ -160,64 +167,40 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
// Prevent clients from postfiltering them for longer queries.
bool More = !Req.Query.empty() && Req.Query.size() < 3;
std::vector<std::unique_ptr<Iterator>> TopLevelChildren;
std::vector<std::unique_ptr<Iterator>> Criteria;
const auto TrigramTokens = generateQueryTrigrams(Req.Query);
// Generate query trigrams and construct AND iterator over all query
// trigrams.
std::vector<std::unique_ptr<Iterator>> TrigramIterators;
for (const auto &Trigram : TrigramTokens) {
const auto It = InvertedIndex.find(Trigram);
if (It != InvertedIndex.end())
TrigramIterators.push_back(It->second.iterator(&It->first));
}
if (!TrigramIterators.empty())
TopLevelChildren.push_back(Corpus.intersect(move(TrigramIterators)));
for (const auto &Trigram : TrigramTokens)
TrigramIterators.push_back(iterator(Trigram));
Criteria.push_back(Corpus.intersect(move(TrigramIterators)));
// Generate scope tokens for search query.
std::vector<std::unique_ptr<Iterator>> ScopeIterators;
for (const auto &Scope : Req.Scopes) {
Token Tok(Token::Kind::Scope, Scope);
const auto It = InvertedIndex.find(Tok);
if (It != InvertedIndex.end())
ScopeIterators.push_back(It->second.iterator(&It->first));
}
if (Req.AnyScope)
for (const auto &Scope : Req.Scopes)
ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
if (Req.AnyScope || /*legacy*/ Req.Scopes.empty())
ScopeIterators.push_back(
Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
Criteria.push_back(Corpus.unionOf(move(ScopeIterators)));
// Add OR iterator for scopes if there are any Scope Iterators.
if (!ScopeIterators.empty())
TopLevelChildren.push_back(Corpus.unionOf(move(ScopeIterators)));
// Add proximity paths boosting.
auto BoostingIterators = createFileProximityIterators(
Req.ProximityPaths, URISchemes, InvertedIndex, Corpus);
// Boosting iterators do not actually filter symbols. In order to preserve
// the validity of resulting query, TRUE iterator should be added along
// BOOSTs.
if (!BoostingIterators.empty()) {
BoostingIterators.push_back(Corpus.all());
TopLevelChildren.push_back(Corpus.unionOf(move(BoostingIterators)));
}
// Add proximity paths boosting (all symbols, some boosted).
Criteria.push_back(createFileProximityIterator(Req.ProximityPaths, URISchemes,
InvertedIndex, Corpus));
if (Req.RestrictForCodeCompletion)
TopLevelChildren.push_back(
InvertedIndex.find(RestrictedForCodeCompletion)
->second.iterator(&RestrictedForCodeCompletion));
Criteria.push_back(iterator(RestrictedForCodeCompletion));
// Use TRUE iterator if both trigrams and scopes from the query are not
// present in the symbol index.
auto QueryIterator = TopLevelChildren.empty()
? Corpus.all()
: Corpus.intersect(move(TopLevelChildren));
auto Root = Corpus.intersect(move(Criteria));
// Retrieve more items than it was requested: some of the items with high
// final score might not be retrieved otherwise.
// FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as
// using 100x of the requested number might not be good in practice, e.g.
// when the requested number of items is small.
auto Root = Req.Limit ? Corpus.limit(move(QueryIterator), *Req.Limit * 100)
: move(QueryIterator);
// FIXME(kbobyrev): Tune this ratio.
if (Req.Limit)
Root = Corpus.limit(move(Root), *Req.Limit * 100);
SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
vlog("Dex query tree: {0}", *Root);

View File

@ -87,6 +87,7 @@ public:
private:
void buildIndex();
std::unique_ptr<Iterator> iterator(const Token &Tok) const;
/// Stores symbols sorted in the descending order of symbol quality..
std::vector<const Symbol *> Symbols;

View File

@ -106,7 +106,6 @@ protected:
Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {}
private:
Kind MyKind;
virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
Kind MyKind;
};

View File

@ -87,6 +87,8 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
}
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
if (Query.empty())
return {};
std::string LowercaseQuery = Query.lower();
if (Query.size() < 3) // short-query trigrams only
return {Token(Token::Kind::Trigram, LowercaseQuery)};

View File

@ -408,6 +408,7 @@ TEST(DexTrigrams, QueryTrigrams) {
EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl"}));
EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"}));
EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({}));
EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"}));
EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"}));
EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({}));
@ -526,8 +527,6 @@ TEST(DexTest, FuzzyMatch) {
UnorderedElementsAre("LaughingOutLoud", "LittleOldLady"));
}
// TODO(sammccall): enable after D52796 bugfix.
#if 0
TEST(DexTest, ShortQuery) {
auto I =
Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(), URISchemes);
@ -549,7 +548,6 @@ TEST(DexTest, ShortQuery) {
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
EXPECT_FALSE(Incomplete) << "3-char string is not a short query";
}
#endif
TEST(DexTest, MatchQualifiedNamesWithoutSpecificScope) {
auto I = Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), RefSlab(),
@ -614,6 +612,15 @@ TEST(DexTest, IgnoreCases) {
EXPECT_THAT(match(*I, Req), UnorderedElementsAre("ns::ABC", "ns::abc"));
}
TEST(DexTest, UnknownPostingList) {
// Regression test: we used to ignore unknown scopes and accept any symbol.
auto I = Dex::build(generateSymbols({"ns::ABC", "ns::abc"}), RefSlab(),
URISchemes);
FuzzyFindRequest Req;
Req.Scopes = {"ns2::"};
EXPECT_THAT(match(*I, Req), UnorderedElementsAre());
}
TEST(DexTest, Lookup) {
auto I = Dex::build(generateSymbols({"ns::abc", "ns::xyz"}), RefSlab(), URISchemes);
EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc"));