forked from OSchip/llvm-project
[include-fixer] Add fuzzy SymbolIndex, where identifier needn't match exactly.
Summary: Add fuzzy SymbolIndex, where identifier needn't match exactly. The purpose for this is global autocomplete in clangd. The query will be a partial identifier up to the cursor, and the results will be suggestions. It's in include-fixer because: - it handles SymbolInfos, actually SymbolIndex is exactly the right interface - it's a good harness for lit testing the fuzzy YAML index - (Laziness: we can't unit test clangd until reorganizing with a tool/ dir) Other questionable choices: - FuzzySymbolIndex, which just refines the contract of SymbolIndex. This is an interface to allow extension to large monorepos (*cough*) - an always-true safety check that Identifier == Name is removed from SymbolIndexManager, as it's not true for fuzzy matching - exposing -db=fuzzyYaml from include-fixer is not a very useful feature, and a non-orthogonal ui (fuzziness vs data source). -db=fixed is similar though. Reviewers: bkramer Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D30720 llvm-svn: 297630
This commit is contained in:
parent
6023a2501c
commit
9c5ebf7039
|
@ -6,6 +6,7 @@ add_clang_library(clangIncludeFixer
|
|||
IncludeFixer.cpp
|
||||
IncludeFixerContext.cpp
|
||||
InMemorySymbolIndex.cpp
|
||||
FuzzySymbolIndex.cpp
|
||||
SymbolIndexManager.cpp
|
||||
YamlSymbolIndex.cpp
|
||||
|
||||
|
|
|
@ -0,0 +1,143 @@
|
|||
//===--- FuzzySymbolIndex.cpp - Lookup symbols for autocomplete -*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "FuzzySymbolIndex.h"
|
||||
#include "llvm/Support/Regex.h"
|
||||
|
||||
using clang::find_all_symbols::SymbolAndSignals;
|
||||
using llvm::StringRef;
|
||||
|
||||
namespace clang {
|
||||
namespace include_fixer {
|
||||
namespace {
|
||||
|
||||
class MemSymbolIndex : public FuzzySymbolIndex {
|
||||
public:
|
||||
MemSymbolIndex(std::vector<SymbolAndSignals> Symbols) {
|
||||
for (auto &Symbol : Symbols) {
|
||||
auto Tokens = tokenize(Symbol.Symbol.getName());
|
||||
this->Symbols.emplace_back(
|
||||
StringRef(llvm::join(Tokens.begin(), Tokens.end(), " ")),
|
||||
std::move(Symbol));
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<SymbolAndSignals> search(StringRef Query) override {
|
||||
auto Tokens = tokenize(Query);
|
||||
llvm::Regex Pattern("^" + queryRegexp(Tokens));
|
||||
std::vector<SymbolAndSignals> Results;
|
||||
for (const Entry &E : Symbols)
|
||||
if (Pattern.match(E.first))
|
||||
Results.push_back(E.second);
|
||||
return Results;
|
||||
}
|
||||
|
||||
private:
|
||||
using Entry = std::pair<llvm::SmallString<32>, SymbolAndSignals>;
|
||||
std::vector<Entry> Symbols;
|
||||
};
|
||||
|
||||
// Helpers for tokenize state machine.
|
||||
enum TokenizeState {
|
||||
EMPTY, // No pending characters.
|
||||
ONE_BIG, // Read one uppercase letter, could be WORD or Word.
|
||||
BIG_WORD, // Reading an uppercase WORD.
|
||||
SMALL_WORD, // Reading a lowercase word.
|
||||
NUMBER // Reading a number.
|
||||
};
|
||||
|
||||
enum CharType { UPPER, LOWER, DIGIT, MISC };
|
||||
CharType classify(char c) {
|
||||
if (isupper(c))
|
||||
return UPPER;
|
||||
if (islower(c))
|
||||
return LOWER;
|
||||
if (isdigit(c))
|
||||
return DIGIT;
|
||||
return MISC;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::vector<std::string> FuzzySymbolIndex::tokenize(StringRef Text) {
|
||||
std::vector<std::string> Result;
|
||||
// State describes the treatment of text from Start to I.
|
||||
// Once text is Flush()ed into Result, we're done with it and advance Start.
|
||||
TokenizeState State = EMPTY;
|
||||
size_t Start = 0;
|
||||
auto Flush = [&](size_t End) {
|
||||
if (State != EMPTY) {
|
||||
Result.push_back(Text.substr(Start, End - Start).lower());
|
||||
State = EMPTY;
|
||||
}
|
||||
Start = End;
|
||||
};
|
||||
for (size_t I = 0; I < Text.size(); ++I) {
|
||||
CharType Type = classify(Text[I]);
|
||||
if (Type == MISC)
|
||||
Flush(I);
|
||||
else if (Type == LOWER)
|
||||
switch (State) {
|
||||
case BIG_WORD:
|
||||
Flush(I - 1); // FOOBar: first token is FOO, not FOOB.
|
||||
LLVM_FALLTHROUGH;
|
||||
case ONE_BIG:
|
||||
State = SMALL_WORD;
|
||||
LLVM_FALLTHROUGH;
|
||||
case SMALL_WORD:
|
||||
break;
|
||||
default:
|
||||
Flush(I);
|
||||
State = SMALL_WORD;
|
||||
}
|
||||
else if (Type == UPPER)
|
||||
switch (State) {
|
||||
case ONE_BIG:
|
||||
State = BIG_WORD;
|
||||
LLVM_FALLTHROUGH;
|
||||
case BIG_WORD:
|
||||
break;
|
||||
default:
|
||||
Flush(I);
|
||||
State = ONE_BIG;
|
||||
}
|
||||
else if (Type == DIGIT && State != NUMBER) {
|
||||
Flush(I);
|
||||
State = NUMBER;
|
||||
}
|
||||
}
|
||||
Flush(Text.size());
|
||||
return Result;
|
||||
}
|
||||
|
||||
std::string
|
||||
FuzzySymbolIndex::queryRegexp(const std::vector<std::string> &Tokens) {
|
||||
std::string Result;
|
||||
for (size_t I = 0; I < Tokens.size(); ++I) {
|
||||
if (I)
|
||||
Result.append("[[:alnum:]]* ");
|
||||
for (size_t J = 0; J < Tokens[I].size(); ++J) {
|
||||
if (J)
|
||||
Result.append("([[:alnum:]]* )?");
|
||||
Result.push_back(Tokens[I][J]);
|
||||
}
|
||||
}
|
||||
return Result;
|
||||
}
|
||||
|
||||
llvm::Expected<std::unique_ptr<FuzzySymbolIndex>>
|
||||
FuzzySymbolIndex::createFromYAML(StringRef FilePath) {
|
||||
auto Buffer = llvm::MemoryBuffer::getFile(FilePath);
|
||||
if (!Buffer)
|
||||
return llvm::errorCodeToError(Buffer.getError());
|
||||
return llvm::make_unique<MemSymbolIndex>(
|
||||
find_all_symbols::ReadSymbolInfosFromYAML(Buffer.get()->getBuffer()));
|
||||
}
|
||||
|
||||
} // namespace include_fixer
|
||||
} // namespace clang
|
|
@ -0,0 +1,55 @@
|
|||
//===--- FuzzySymbolIndex.h - Lookup symbols for autocomplete ---*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H
|
||||
#define LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H
|
||||
|
||||
#include "SymbolIndex.h"
|
||||
#include "find-all-symbols/SymbolInfo.h"
|
||||
#include "llvm/ADT/SmallString.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/Support/Error.h"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace clang {
|
||||
namespace include_fixer {
|
||||
|
||||
// A FuzzySymbolIndex retrieves top-level symbols matching a query string.
|
||||
//
|
||||
// It refines the contract of SymbolIndex::search to do fuzzy matching:
|
||||
// - symbol names are tokenized: "unique ptr", "string ref".
|
||||
// - query must match prefixes of symbol tokens: [upt]
|
||||
// - if the query has multiple tokens, splits must match: [StR], not [STr].
|
||||
// Helpers for tokenization and regex matching are provided.
|
||||
//
|
||||
// Implementations may choose to truncate results, refuse short queries, etc.
|
||||
class FuzzySymbolIndex : public SymbolIndex {
|
||||
public:
|
||||
// Loads the specified include-fixer database and returns an index serving it.
|
||||
static llvm::Expected<std::unique_ptr<FuzzySymbolIndex>>
|
||||
createFromYAML(llvm::StringRef File);
|
||||
|
||||
// Helpers for implementing indexes:
|
||||
|
||||
// Transforms a symbol name or query into a sequence of tokens.
|
||||
// - URLHandlerCallback --> [url, handler, callback]
|
||||
// - snake_case11 --> [snake, case, 11]
|
||||
// - _WTF$ --> [wtf]
|
||||
static std::vector<std::string> tokenize(llvm::StringRef Text);
|
||||
|
||||
// Transforms query tokens into an unanchored regexp to match symbol tokens.
|
||||
// - [fe f] --> /f(\w* )?e\w* f/, matches [fee fie foe].
|
||||
static std::string queryRegexp(const std::vector<std::string> &Tokens);
|
||||
};
|
||||
|
||||
} // namespace include_fixer
|
||||
} // namespace clang
|
||||
|
||||
#endif // LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H
|
|
@ -103,46 +103,44 @@ SymbolIndexManager::search(llvm::StringRef Identifier,
|
|||
for (auto &SymAndSig : Symbols) {
|
||||
const SymbolInfo &Symbol = SymAndSig.Symbol;
|
||||
// Match the identifier name without qualifier.
|
||||
if (Symbol.getName() == Names.back()) {
|
||||
bool IsMatched = true;
|
||||
auto SymbolContext = Symbol.getContexts().begin();
|
||||
auto IdentiferContext = Names.rbegin() + 1; // Skip identifier name.
|
||||
// Match the remaining context names.
|
||||
while (IdentiferContext != Names.rend() &&
|
||||
SymbolContext != Symbol.getContexts().end()) {
|
||||
if (SymbolContext->second == *IdentiferContext) {
|
||||
++IdentiferContext;
|
||||
++SymbolContext;
|
||||
} else if (SymbolContext->first ==
|
||||
find_all_symbols::SymbolInfo::ContextType::EnumDecl) {
|
||||
// Skip non-scoped enum context.
|
||||
++SymbolContext;
|
||||
} else {
|
||||
IsMatched = false;
|
||||
break;
|
||||
}
|
||||
bool IsMatched = true;
|
||||
auto SymbolContext = Symbol.getContexts().begin();
|
||||
auto IdentiferContext = Names.rbegin() + 1; // Skip identifier name.
|
||||
// Match the remaining context names.
|
||||
while (IdentiferContext != Names.rend() &&
|
||||
SymbolContext != Symbol.getContexts().end()) {
|
||||
if (SymbolContext->second == *IdentiferContext) {
|
||||
++IdentiferContext;
|
||||
++SymbolContext;
|
||||
} else if (SymbolContext->first ==
|
||||
find_all_symbols::SymbolInfo::ContextType::EnumDecl) {
|
||||
// Skip non-scoped enum context.
|
||||
++SymbolContext;
|
||||
} else {
|
||||
IsMatched = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If the name was qualified we only want to add results if we evaluated
|
||||
// all contexts.
|
||||
if (IsFullyQualified)
|
||||
IsMatched &= (SymbolContext == Symbol.getContexts().end());
|
||||
// If the name was qualified we only want to add results if we evaluated
|
||||
// all contexts.
|
||||
if (IsFullyQualified)
|
||||
IsMatched &= (SymbolContext == Symbol.getContexts().end());
|
||||
|
||||
// FIXME: Support full match. At this point, we only find symbols in
|
||||
// database which end with the same contexts with the identifier.
|
||||
if (IsMatched && IdentiferContext == Names.rend()) {
|
||||
// If we're in a situation where we took a prefix but the thing we
|
||||
// found couldn't possibly have a nested member ignore it.
|
||||
if (TookPrefix &&
|
||||
(Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Function ||
|
||||
Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Variable ||
|
||||
Symbol.getSymbolKind() ==
|
||||
SymbolInfo::SymbolKind::EnumConstantDecl ||
|
||||
Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Macro))
|
||||
continue;
|
||||
// FIXME: Support full match. At this point, we only find symbols in
|
||||
// database which end with the same contexts with the identifier.
|
||||
if (IsMatched && IdentiferContext == Names.rend()) {
|
||||
// If we're in a situation where we took a prefix but the thing we
|
||||
// found couldn't possibly have a nested member ignore it.
|
||||
if (TookPrefix &&
|
||||
(Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Function ||
|
||||
Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Variable ||
|
||||
Symbol.getSymbolKind() ==
|
||||
SymbolInfo::SymbolKind::EnumConstantDecl ||
|
||||
Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Macro))
|
||||
continue;
|
||||
|
||||
MatchedSymbols.push_back(std::move(SymAndSig));
|
||||
}
|
||||
MatchedSymbols.push_back(std::move(SymAndSig));
|
||||
}
|
||||
}
|
||||
Names.pop_back();
|
||||
|
@ -152,7 +150,7 @@ SymbolIndexManager::search(llvm::StringRef Identifier,
|
|||
rank(MatchedSymbols, FileName);
|
||||
// Strip signals, they are no longer needed.
|
||||
std::vector<SymbolInfo> Res;
|
||||
for (const auto &SymAndSig : MatchedSymbols)
|
||||
for (auto &SymAndSig : MatchedSymbols)
|
||||
Res.push_back(std::move(SymAndSig.Symbol));
|
||||
return Res;
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "FuzzySymbolIndex.h"
|
||||
#include "InMemorySymbolIndex.h"
|
||||
#include "IncludeFixer.h"
|
||||
#include "IncludeFixerContext.h"
|
||||
|
@ -83,14 +84,16 @@ namespace {
|
|||
cl::OptionCategory IncludeFixerCategory("Tool options");
|
||||
|
||||
enum DatabaseFormatTy {
|
||||
fixed, ///< Hard-coded mapping.
|
||||
yaml, ///< Yaml database created by find-all-symbols.
|
||||
fixed, ///< Hard-coded mapping.
|
||||
yaml, ///< Yaml database created by find-all-symbols.
|
||||
fuzzyYaml, ///< Yaml database with fuzzy-matched identifiers.
|
||||
};
|
||||
|
||||
cl::opt<DatabaseFormatTy> DatabaseFormat(
|
||||
"db", cl::desc("Specify input format"),
|
||||
cl::values(clEnumVal(fixed, "Hard-coded mapping"),
|
||||
clEnumVal(yaml, "Yaml database created by find-all-symbols")),
|
||||
clEnumVal(yaml, "Yaml database created by find-all-symbols"),
|
||||
clEnumVal(fuzzyYaml, "Yaml database, with fuzzy-matched names")),
|
||||
cl::init(yaml), cl::cat(IncludeFixerCategory));
|
||||
|
||||
cl::opt<std::string> Input("input",
|
||||
|
@ -215,6 +218,21 @@ createSymbolIndexManager(StringRef FilePath) {
|
|||
SymbolIndexMgr->addSymbolIndex(std::move(CreateYamlIdx));
|
||||
break;
|
||||
}
|
||||
case fuzzyYaml: {
|
||||
// This mode is not very useful, because we don't correct the identifier.
|
||||
// It's main purpose is to expose FuzzySymbolIndex to tests.
|
||||
SymbolIndexMgr->addSymbolIndex(
|
||||
[]() -> std::unique_ptr<include_fixer::SymbolIndex> {
|
||||
auto DB = include_fixer::FuzzySymbolIndex::createFromYAML(Input);
|
||||
if (!DB) {
|
||||
llvm::errs() << "Couldn't load fuzzy YAML db: "
|
||||
<< llvm::toString(DB.takeError()) << '\n';
|
||||
return nullptr;
|
||||
}
|
||||
return std::move(*DB);
|
||||
});
|
||||
break;
|
||||
}
|
||||
}
|
||||
return SymbolIndexMgr;
|
||||
}
|
||||
|
|
|
@ -10,6 +10,17 @@ Type: Class
|
|||
Seen: 1
|
||||
Used: 0
|
||||
---
|
||||
Name: foo_bar
|
||||
Contexts:
|
||||
- ContextType: Namespace
|
||||
ContextName: a
|
||||
- ContextType: Namespace
|
||||
ContextName: b
|
||||
FilePath: foobar.h
|
||||
Type: Class
|
||||
Seen: 0
|
||||
Used: 0
|
||||
---
|
||||
Name: bar
|
||||
Contexts:
|
||||
- ContextType: Namespace
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
// RUN: sed -e 's#//.*$##' %s > %t.cpp
|
||||
// RUN: clang-include-fixer -db=fuzzyYaml -input=%p/Inputs/fake_yaml_db.yaml %t.cpp --
|
||||
// RUN: FileCheck %s -input-file=%t.cpp
|
||||
|
||||
// include-fixer will add the include, but doesn't complete the symbol.
|
||||
// CHECK: #include "foobar.h"
|
||||
// CHECK: fba f;
|
||||
|
||||
b::a::fba f;
|
|
@ -13,6 +13,7 @@ include_directories(${CLANG_SOURCE_DIR})
|
|||
|
||||
add_extra_unittest(IncludeFixerTests
|
||||
IncludeFixerTest.cpp
|
||||
FuzzySymbolIndexTests.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(IncludeFixerTests
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
//===-- FuzzySymbolIndexTests.cpp - Fuzzy symbol index unit tests ---------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "FuzzySymbolIndex.h"
|
||||
#include "gmock/gmock.h"
|
||||
#include "llvm/Support/Regex.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using testing::ElementsAre;
|
||||
using testing::Not;
|
||||
|
||||
namespace clang {
|
||||
namespace include_fixer {
|
||||
namespace {
|
||||
|
||||
TEST(FuzzySymbolIndexTest, Tokenize) {
|
||||
EXPECT_THAT(FuzzySymbolIndex::tokenize("URLHandlerCallback"),
|
||||
ElementsAre("url", "handler", "callback"));
|
||||
EXPECT_THAT(FuzzySymbolIndex::tokenize("snake_case11"),
|
||||
ElementsAre("snake", "case", "11"));
|
||||
EXPECT_THAT(FuzzySymbolIndex::tokenize("__$42!!BOB\nbob"),
|
||||
ElementsAre("42", "bob", "bob"));
|
||||
}
|
||||
|
||||
MATCHER_P(MatchesSymbol, Identifier, "") {
|
||||
llvm::Regex Pattern("^" + arg);
|
||||
std::string err;
|
||||
if (!Pattern.isValid(err)) {
|
||||
*result_listener << "invalid regex: " << err;
|
||||
return false;
|
||||
}
|
||||
auto Tokens = FuzzySymbolIndex::tokenize(Identifier);
|
||||
std::string Target = llvm::join(Tokens.begin(), Tokens.end(), " ");
|
||||
*result_listener << "matching against '" << Target << "'";
|
||||
return llvm::Regex("^" + arg).match(Target);
|
||||
}
|
||||
|
||||
TEST(FuzzySymbolIndexTest, QueryRegexp) {
|
||||
auto QueryRegexp = [](const std::string &query) {
|
||||
return FuzzySymbolIndex::queryRegexp(FuzzySymbolIndex::tokenize(query));
|
||||
};
|
||||
EXPECT_THAT(QueryRegexp("uhc"), MatchesSymbol("URLHandlerCallback"));
|
||||
EXPECT_THAT(QueryRegexp("urhaca"), MatchesSymbol("URLHandlerCallback"));
|
||||
EXPECT_THAT(QueryRegexp("uhcb"), Not(MatchesSymbol("URLHandlerCallback")))
|
||||
<< "Non-prefix";
|
||||
EXPECT_THAT(QueryRegexp("uc"), Not(MatchesSymbol("URLHandlerCallback")))
|
||||
<< "Skip token";
|
||||
|
||||
EXPECT_THAT(QueryRegexp("uptr"), MatchesSymbol("unique_ptr"));
|
||||
EXPECT_THAT(QueryRegexp("UniP"), MatchesSymbol("unique_ptr"));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace include_fixer
|
||||
} // namespace clang
|
Loading…
Reference in New Issue