From 976a74d7d2dbd19670614f603caf490cca892fdc Mon Sep 17 00:00:00 2001 From: Kirill Bobyrev Date: Tue, 7 Dec 2021 15:46:12 +0100 Subject: [PATCH] [clangd] Dex Trigrams: Improve query trigram generation These are the trigrams for queries right now: - "va" -> {Trigram("va")} - "va_" -> {} (empty) This is suboptimal since the resulting query will discard the query information and return all symbols, some of which will be later be scored expensively (fuzzy matching score). This is related to https://github.com/clangd/clangd/issues/39 but does not fix it. Accidentally, because of that incorrect behavior, when user types "tok::va" there are no results (the issue is that `tok::kw___builtin_va_arg` does not have "va" token) but when "tok::va_" is typed, expected result (`tok::kw___builtin_va_arg`) shows up by accident. This is because the dex query transformer will only lookup symbols within the `tok::` namespace. There won't be many, so the returned results will contain symbol we need; this symbol will be filtered out by the expensive checks and that will be displayed in the editor. Reviewed By: sammccall Differential Revision: https://reviews.llvm.org/D113995 --- .../clangd/index/dex/Trigram.cpp | 69 ++++++++++----- .../clangd/unittests/DexTests.cpp | 87 +++++++++++++------ 2 files changed, 109 insertions(+), 47 deletions(-) diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp index fddca8a95320..e6f42f67c50e 100644 --- a/clang-tools-extra/clangd/index/dex/Trigram.cpp +++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -12,10 +12,14 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include +#include #include #include +#include namespace clang { namespace clangd { @@ -25,21 +29,22 @@ namespace dex { template static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { // Apply fuzzy matching text segmentation. - std::vector Roles(Identifier.size()); + llvm::SmallVector Roles(Identifier.size()); calculateRoles(Identifier, llvm::makeMutableArrayRef(Roles.data(), Identifier.size())); std::string LowercaseIdentifier = Identifier.lower(); // For each character, store indices of the characters to which fuzzy matching - // algorithm can jump. There are 3 possible variants: + // algorithm can jump. There are 2 possible variants: // // * Next Tail - next character from the same segment // * Next Head - front character of the next segment // // Next stores tuples of three indices in the presented order, if a variant is // not available then 0 is stored. - std::vector> Next(LowercaseIdentifier.size()); + llvm::SmallVector, 12> Next( + LowercaseIdentifier.size()); unsigned NextTail = 0, NextHead = 0; for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { Next[I] = {{NextTail, NextHead}}; @@ -51,14 +56,14 @@ static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { // Iterate through valid sequences of three characters Fuzzy Matcher can // process. - for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { + for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) { // Skip delimiters. if (Roles[I] != Head && Roles[I] != Tail) continue; - for (const unsigned J : Next[I]) { + for (unsigned J : Next[I]) { if (J == 0) continue; - for (const unsigned K : Next[J]) { + for (unsigned K : Next[J]) { if (K == 0) continue; Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J], @@ -66,16 +71,29 @@ static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { } } } - // Emit short-query trigrams: FooBar -> f, fo, fb. - if (!LowercaseIdentifier.empty()) - Out(Trigram(LowercaseIdentifier[0])); - if (LowercaseIdentifier.size() >= 2) - Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1])); - for (size_t I = 1; I < LowercaseIdentifier.size(); ++I) - if (Roles[I] == Head) { - Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I])); + // Short queries semantics are different. When the user dosn't type enough + // symbols to form trigrams, we still want to serve meaningful results. To + // achieve that, we form incomplete trigrams (bi- and unigrams) for the + // identifiers and also generate short trigrams on the query side from what + // is available. We allow a small number of short trigram types in order to + // prevent excessive memory usage and increase the quality of the search. + // Only the first few symbols are allowed to be used in incomplete trigrams. + // + // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams: + // "_", "_a", "a", "ab", "ad", "d", "de", "dg" + for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) { + // The first symbol might be a separator, so the loop condition should be + // stopping as soon as there is no next head or we have seen two heads. + if (Roles[Position] == Head) + ++HeadsSeen; + Out(Trigram(LowercaseIdentifier[Position])); + for (unsigned I : Next[Position]) + if (I != 0) + Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I])); + Position = Next[Position][1]; + if (Position == 0) break; - } + } } void generateIdentifierTrigrams(llvm::StringRef Identifier, @@ -101,17 +119,16 @@ void generateIdentifierTrigrams(llvm::StringRef Identifier, std::vector generateQueryTrigrams(llvm::StringRef Query) { if (Query.empty()) return {}; - std::string LowercaseQuery = Query.lower(); - if (Query.size() < 3) // short-query trigrams only - return {Token(Token::Kind::Trigram, LowercaseQuery)}; // Apply fuzzy matching text segmentation. - std::vector Roles(Query.size()); + llvm::SmallVector Roles(Query.size()); calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + std::string LowercaseQuery = Query.lower(); + llvm::DenseSet UniqueTrigrams; std::string Chars; - for (unsigned I = 0; I < Query.size(); ++I) { + for (unsigned I = 0; I < LowercaseQuery.size(); ++I) { if (Roles[I] != Head && Roles[I] != Tail) continue; // Skip delimiters. Chars.push_back(LowercaseQuery[I]); @@ -121,6 +138,18 @@ std::vector generateQueryTrigrams(llvm::StringRef Query) { UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); } + // For queries with very few letters, generateIdentifierTrigrams emulates + // outputs of this function to match the semantics. + if (UniqueTrigrams.empty()) { + // If bigram can't be formed out of letters/numbers, we prepend separator. + std::string Result(1, LowercaseQuery.front()); + for (unsigned I = 1; I < LowercaseQuery.size(); ++I) + if (Roles[I] == Head || Roles[I] == Tail) + Result += LowercaseQuery[I]; + UniqueTrigrams.insert( + Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2))); + } + return {UniqueTrigrams.begin(), UniqueTrigrams.end()}; } diff --git a/clang-tools-extra/clangd/unittests/DexTests.cpp b/clang-tools-extra/clangd/unittests/DexTests.cpp index 60d0db081dbb..249cc101a74c 100644 --- a/clang-tools-extra/clangd/unittests/DexTests.cpp +++ b/clang-tools-extra/clangd/unittests/DexTests.cpp @@ -386,30 +386,35 @@ TEST(DexTrigrams, IdentifierTrigrams) { trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"})); EXPECT_THAT(identifierTrigramTokens("abc_def"), - trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde", - "cde", "def"})); + trigramsAre({"a", "d", "ab", "ad", "de", "abc", "abd", "ade", + "bcd", "bde", "cde", "def"})); EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"), - trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"})); + trigramsAre({"a", "b", "ab", "bc", "abc", "bcd", "cde"})); EXPECT_THAT(identifierTrigramTokens("unique_ptr"), - trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip", - "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt", - "uep", "ept", "ptr"})); + trigramsAre({"u", "p", "un", "up", "pt", "uni", "unp", + "upt", "niq", "nip", "npt", "iqu", "iqp", "ipt", + "que", "qup", "qpt", "uep", "ept", "ptr"})); - EXPECT_THAT( - identifierTrigramTokens("TUDecl"), - trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"})); + EXPECT_THAT(identifierTrigramTokens("TUDecl"), + trigramsAre({"t", "d", "tu", "td", "de", "tud", "tde", "ude", + "dec", "ecl"})); EXPECT_THAT(identifierTrigramTokens("IsOK"), - trigramsAre({"i", "is", "io", "iso", "iok", "sok"})); + trigramsAre({"i", "o", "is", "ok", "io", "iso", "iok", "sok"})); - EXPECT_THAT( - identifierTrigramTokens("abc_defGhij__klm"), - trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd", - "bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk", - "efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl", - "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm"})); + EXPECT_THAT(identifierTrigramTokens("_pb"), + trigramsAre({"_", "_p", "p", "pb"})); + EXPECT_THAT(identifierTrigramTokens("__pb"), + trigramsAre({"_", "_p", "p", "pb"})); + + EXPECT_THAT(identifierTrigramTokens("abc_defGhij__klm"), + trigramsAre({"a", "d", "ab", "ad", "dg", "de", "abc", + "abd", "ade", "adg", "bcd", "bde", "bdg", "cde", + "cdg", "def", "deg", "dgh", "dgk", "efg", "egh", + "egk", "fgh", "fgk", "ghi", "ghk", "gkl", "hij", + "hik", "hkl", "ijk", "ikl", "jkl", "klm"})); } TEST(DexTrigrams, QueryTrigrams) { @@ -419,8 +424,16 @@ TEST(DexTrigrams, QueryTrigrams) { EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({})); EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"})); - EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"})); - EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({})); + EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"_"})); + EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"_"})); + + EXPECT_THAT(generateQueryTrigrams("m_"), trigramsAre({"m"})); + + EXPECT_THAT(generateQueryTrigrams("p_b"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("pb_"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("_p"), trigramsAre({"_p"})); + EXPECT_THAT(generateQueryTrigrams("_pb_"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("__pb"), trigramsAre({"pb"})); EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); @@ -525,25 +538,45 @@ TEST(DexTest, FuzzyMatch) { } TEST(DexTest, ShortQuery) { - auto I = Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(), + auto I = Dex::build(generateSymbols({"_OneTwoFourSix"}), RefSlab(), RelationSlab()); FuzzyFindRequest Req; Req.AnyScope = true; bool Incomplete; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour")); + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); EXPECT_FALSE(Incomplete) << "Empty string is not a short query"; - Req.Query = "t"; + Req.Query = "o"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first head as unigram"; + + Req.Query = "_o"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using delimiter and first head as bigram"; + + Req.Query = "on"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first head and tail as bigram"; + + Req.Query = "ot"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first two heads as bigram"; + + Req.Query = "tw"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using second head and tail as bigram"; + + Req.Query = "tf"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using second and third heads as bigram"; + + Req.Query = "fo"; EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre()); EXPECT_TRUE(Incomplete) << "Short queries have different semantics"; - Req.Query = "tt"; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre()); - EXPECT_TRUE(Incomplete) << "Short queries have different semantics"; - - Req.Query = "ttf"; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour")); + Req.Query = "tfs"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); EXPECT_FALSE(Incomplete) << "3-char string is not a short query"; }