forked from OSchip/llvm-project
[clangd] Dex Trigrams: Improve query trigram generation
These are the trigrams for queries right now: - "va" -> {Trigram("va")} - "va_" -> {} (empty) This is suboptimal since the resulting query will discard the query information and return all symbols, some of which will be later be scored expensively (fuzzy matching score). This is related to https://github.com/clangd/clangd/issues/39 but does not fix it. Accidentally, because of that incorrect behavior, when user types "tok::va" there are no results (the issue is that `tok::kw___builtin_va_arg` does not have "va" token) but when "tok::va_" is typed, expected result (`tok::kw___builtin_va_arg`) shows up by accident. This is because the dex query transformer will only lookup symbols within the `tok::` namespace. There won't be many, so the returned results will contain symbol we need; this symbol will be filtered out by the expensive checks and that will be displayed in the editor. Reviewed By: sammccall Differential Revision: https://reviews.llvm.org/D113995
This commit is contained in:
parent
9094a2285b
commit
976a74d7d2
|
@ -12,10 +12,14 @@
|
|||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/DenseSet.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/StringExtras.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include <cctype>
|
||||
#include <limits>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
namespace clang {
|
||||
namespace clangd {
|
||||
|
@ -25,21 +29,22 @@ namespace dex {
|
|||
template <typename Func>
|
||||
static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
|
||||
// Apply fuzzy matching text segmentation.
|
||||
std::vector<CharRole> Roles(Identifier.size());
|
||||
llvm::SmallVector<CharRole> Roles(Identifier.size());
|
||||
calculateRoles(Identifier,
|
||||
llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
|
||||
|
||||
std::string LowercaseIdentifier = Identifier.lower();
|
||||
|
||||
// For each character, store indices of the characters to which fuzzy matching
|
||||
// algorithm can jump. There are 3 possible variants:
|
||||
// algorithm can jump. There are 2 possible variants:
|
||||
//
|
||||
// * Next Tail - next character from the same segment
|
||||
// * Next Head - front character of the next segment
|
||||
//
|
||||
// Next stores tuples of three indices in the presented order, if a variant is
|
||||
// not available then 0 is stored.
|
||||
std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
|
||||
llvm::SmallVector<std::array<unsigned, 2>, 12> Next(
|
||||
LowercaseIdentifier.size());
|
||||
unsigned NextTail = 0, NextHead = 0;
|
||||
for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
|
||||
Next[I] = {{NextTail, NextHead}};
|
||||
|
@ -51,14 +56,14 @@ static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
|
|||
|
||||
// Iterate through valid sequences of three characters Fuzzy Matcher can
|
||||
// process.
|
||||
for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
|
||||
for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) {
|
||||
// Skip delimiters.
|
||||
if (Roles[I] != Head && Roles[I] != Tail)
|
||||
continue;
|
||||
for (const unsigned J : Next[I]) {
|
||||
for (unsigned J : Next[I]) {
|
||||
if (J == 0)
|
||||
continue;
|
||||
for (const unsigned K : Next[J]) {
|
||||
for (unsigned K : Next[J]) {
|
||||
if (K == 0)
|
||||
continue;
|
||||
Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
|
||||
|
@ -66,16 +71,29 @@ static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
|
|||
}
|
||||
}
|
||||
}
|
||||
// Emit short-query trigrams: FooBar -> f, fo, fb.
|
||||
if (!LowercaseIdentifier.empty())
|
||||
Out(Trigram(LowercaseIdentifier[0]));
|
||||
if (LowercaseIdentifier.size() >= 2)
|
||||
Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1]));
|
||||
for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
|
||||
if (Roles[I] == Head) {
|
||||
Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I]));
|
||||
// Short queries semantics are different. When the user dosn't type enough
|
||||
// symbols to form trigrams, we still want to serve meaningful results. To
|
||||
// achieve that, we form incomplete trigrams (bi- and unigrams) for the
|
||||
// identifiers and also generate short trigrams on the query side from what
|
||||
// is available. We allow a small number of short trigram types in order to
|
||||
// prevent excessive memory usage and increase the quality of the search.
|
||||
// Only the first few symbols are allowed to be used in incomplete trigrams.
|
||||
//
|
||||
// Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams:
|
||||
// "_", "_a", "a", "ab", "ad", "d", "de", "dg"
|
||||
for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) {
|
||||
// The first symbol might be a separator, so the loop condition should be
|
||||
// stopping as soon as there is no next head or we have seen two heads.
|
||||
if (Roles[Position] == Head)
|
||||
++HeadsSeen;
|
||||
Out(Trigram(LowercaseIdentifier[Position]));
|
||||
for (unsigned I : Next[Position])
|
||||
if (I != 0)
|
||||
Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I]));
|
||||
Position = Next[Position][1];
|
||||
if (Position == 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void generateIdentifierTrigrams(llvm::StringRef Identifier,
|
||||
|
@ -101,17 +119,16 @@ void generateIdentifierTrigrams(llvm::StringRef Identifier,
|
|||
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
|
||||
if (Query.empty())
|
||||
return {};
|
||||
std::string LowercaseQuery = Query.lower();
|
||||
if (Query.size() < 3) // short-query trigrams only
|
||||
return {Token(Token::Kind::Trigram, LowercaseQuery)};
|
||||
|
||||
// Apply fuzzy matching text segmentation.
|
||||
std::vector<CharRole> Roles(Query.size());
|
||||
llvm::SmallVector<CharRole> Roles(Query.size());
|
||||
calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
|
||||
|
||||
std::string LowercaseQuery = Query.lower();
|
||||
|
||||
llvm::DenseSet<Token> UniqueTrigrams;
|
||||
std::string Chars;
|
||||
for (unsigned I = 0; I < Query.size(); ++I) {
|
||||
for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
|
||||
if (Roles[I] != Head && Roles[I] != Tail)
|
||||
continue; // Skip delimiters.
|
||||
Chars.push_back(LowercaseQuery[I]);
|
||||
|
@ -121,6 +138,18 @@ std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
|
|||
UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
|
||||
}
|
||||
|
||||
// For queries with very few letters, generateIdentifierTrigrams emulates
|
||||
// outputs of this function to match the semantics.
|
||||
if (UniqueTrigrams.empty()) {
|
||||
// If bigram can't be formed out of letters/numbers, we prepend separator.
|
||||
std::string Result(1, LowercaseQuery.front());
|
||||
for (unsigned I = 1; I < LowercaseQuery.size(); ++I)
|
||||
if (Roles[I] == Head || Roles[I] == Tail)
|
||||
Result += LowercaseQuery[I];
|
||||
UniqueTrigrams.insert(
|
||||
Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2)));
|
||||
}
|
||||
|
||||
return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
|
||||
}
|
||||
|
||||
|
|
|
@ -386,30 +386,35 @@ TEST(DexTrigrams, IdentifierTrigrams) {
|
|||
trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"}));
|
||||
|
||||
EXPECT_THAT(identifierTrigramTokens("abc_def"),
|
||||
trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde",
|
||||
"cde", "def"}));
|
||||
trigramsAre({"a", "d", "ab", "ad", "de", "abc", "abd", "ade",
|
||||
"bcd", "bde", "cde", "def"}));
|
||||
|
||||
EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"),
|
||||
trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"}));
|
||||
trigramsAre({"a", "b", "ab", "bc", "abc", "bcd", "cde"}));
|
||||
|
||||
EXPECT_THAT(identifierTrigramTokens("unique_ptr"),
|
||||
trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip",
|
||||
"npt", "iqu", "iqp", "ipt", "que", "qup", "qpt",
|
||||
"uep", "ept", "ptr"}));
|
||||
trigramsAre({"u", "p", "un", "up", "pt", "uni", "unp",
|
||||
"upt", "niq", "nip", "npt", "iqu", "iqp", "ipt",
|
||||
"que", "qup", "qpt", "uep", "ept", "ptr"}));
|
||||
|
||||
EXPECT_THAT(
|
||||
identifierTrigramTokens("TUDecl"),
|
||||
trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"}));
|
||||
EXPECT_THAT(identifierTrigramTokens("TUDecl"),
|
||||
trigramsAre({"t", "d", "tu", "td", "de", "tud", "tde", "ude",
|
||||
"dec", "ecl"}));
|
||||
|
||||
EXPECT_THAT(identifierTrigramTokens("IsOK"),
|
||||
trigramsAre({"i", "is", "io", "iso", "iok", "sok"}));
|
||||
trigramsAre({"i", "o", "is", "ok", "io", "iso", "iok", "sok"}));
|
||||
|
||||
EXPECT_THAT(
|
||||
identifierTrigramTokens("abc_defGhij__klm"),
|
||||
trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd",
|
||||
"bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk",
|
||||
"efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl",
|
||||
"hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm"}));
|
||||
EXPECT_THAT(identifierTrigramTokens("_pb"),
|
||||
trigramsAre({"_", "_p", "p", "pb"}));
|
||||
EXPECT_THAT(identifierTrigramTokens("__pb"),
|
||||
trigramsAre({"_", "_p", "p", "pb"}));
|
||||
|
||||
EXPECT_THAT(identifierTrigramTokens("abc_defGhij__klm"),
|
||||
trigramsAre({"a", "d", "ab", "ad", "dg", "de", "abc",
|
||||
"abd", "ade", "adg", "bcd", "bde", "bdg", "cde",
|
||||
"cdg", "def", "deg", "dgh", "dgk", "efg", "egh",
|
||||
"egk", "fgh", "fgk", "ghi", "ghk", "gkl", "hij",
|
||||
"hik", "hkl", "ijk", "ikl", "jkl", "klm"}));
|
||||
}
|
||||
|
||||
TEST(DexTrigrams, QueryTrigrams) {
|
||||
|
@ -419,8 +424,16 @@ TEST(DexTrigrams, QueryTrigrams) {
|
|||
|
||||
EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({}));
|
||||
EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({}));
|
||||
EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"_"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"_"}));
|
||||
|
||||
EXPECT_THAT(generateQueryTrigrams("m_"), trigramsAre({"m"}));
|
||||
|
||||
EXPECT_THAT(generateQueryTrigrams("p_b"), trigramsAre({"pb"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("pb_"), trigramsAre({"pb"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("_p"), trigramsAre({"_p"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("_pb_"), trigramsAre({"pb"}));
|
||||
EXPECT_THAT(generateQueryTrigrams("__pb"), trigramsAre({"pb"}));
|
||||
|
||||
EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
|
||||
|
||||
|
@ -525,25 +538,45 @@ TEST(DexTest, FuzzyMatch) {
|
|||
}
|
||||
|
||||
TEST(DexTest, ShortQuery) {
|
||||
auto I = Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(),
|
||||
auto I = Dex::build(generateSymbols({"_OneTwoFourSix"}), RefSlab(),
|
||||
RelationSlab());
|
||||
FuzzyFindRequest Req;
|
||||
Req.AnyScope = true;
|
||||
bool Incomplete;
|
||||
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
|
||||
EXPECT_FALSE(Incomplete) << "Empty string is not a short query";
|
||||
|
||||
Req.Query = "t";
|
||||
Req.Query = "o";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
|
||||
EXPECT_TRUE(Incomplete) << "Using first head as unigram";
|
||||
|
||||
Req.Query = "_o";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
|
||||
EXPECT_TRUE(Incomplete) << "Using delimiter and first head as bigram";
|
||||
|
||||
Req.Query = "on";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
|
||||
EXPECT_TRUE(Incomplete) << "Using first head and tail as bigram";
|
||||
|
||||
Req.Query = "ot";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
|
||||
EXPECT_TRUE(Incomplete) << "Using first two heads as bigram";
|
||||
|
||||
Req.Query = "tw";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
|
||||
EXPECT_TRUE(Incomplete) << "Using second head and tail as bigram";
|
||||
|
||||
Req.Query = "tf";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
|
||||
EXPECT_TRUE(Incomplete) << "Using second and third heads as bigram";
|
||||
|
||||
Req.Query = "fo";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre());
|
||||
EXPECT_TRUE(Incomplete) << "Short queries have different semantics";
|
||||
|
||||
Req.Query = "tt";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre());
|
||||
EXPECT_TRUE(Incomplete) << "Short queries have different semantics";
|
||||
|
||||
Req.Query = "ttf";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
|
||||
Req.Query = "tfs";
|
||||
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
|
||||
EXPECT_FALSE(Incomplete) << "3-char string is not a short query";
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue