[include-fixer] Add fuzzy SymbolIndex, where identifier needn't match exactly.

Summary:
Add fuzzy SymbolIndex, where identifier needn't match exactly.

The purpose for this is global autocomplete in clangd. The query will be a
partial identifier up to the cursor, and the results will be suggestions.

It's in include-fixer because:

  - it handles SymbolInfos; in fact, SymbolIndex is exactly the right interface
  - it's a good harness for lit testing the fuzzy YAML index
  - (Laziness: we can't unit test clangd until reorganizing with a tool/ dir)

Other questionable choices:

  - FuzzySymbolIndex, which just refines the contract of SymbolIndex. This is
    an interface to allow extension to large monorepos (*cough*)
  - an always-true safety check that Identifier == Name is removed from
    SymbolIndexManager, as it's not true for fuzzy matching
  - exposing -db=fuzzyYaml from include-fixer is not a very useful feature, and
    makes for a non-orthogonal UI (fuzziness vs. data source), though -db=fixed
    is similar.

Reviewers: bkramer

Subscribers: cfe-commits, mgorny

Differential Revision: https://reviews.llvm.org/D30720

llvm-svn: 297630
This commit is contained in:
Sam McCall 2017-03-13 15:55:59 +00:00
parent 6023a2501c
commit 9c5ebf7039
9 changed files with 337 additions and 40 deletions

View File

@ -6,6 +6,7 @@ add_clang_library(clangIncludeFixer
IncludeFixer.cpp
IncludeFixerContext.cpp
InMemorySymbolIndex.cpp
FuzzySymbolIndex.cpp
SymbolIndexManager.cpp
YamlSymbolIndex.cpp

View File

@ -0,0 +1,143 @@
//===--- FuzzySymbolIndex.cpp - Lookup symbols for autocomplete -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "FuzzySymbolIndex.h"
#include "llvm/Support/Regex.h"
using clang::find_all_symbols::SymbolAndSignals;
using llvm::StringRef;
namespace clang {
namespace include_fixer {
namespace {
// In-memory FuzzySymbolIndex: every symbol is stored next to its tokenized
// name (tokens joined by single spaces), and a query is answered by scanning
// all entries with the regexp derived from the query.
class MemSymbolIndex : public FuzzySymbolIndex {
public:
  // Takes ownership of the symbols and precomputes the searchable key for
  // each of them.
  MemSymbolIndex(std::vector<SymbolAndSignals> Symbols) {
    for (auto &Sym : Symbols) {
      auto Parts = tokenize(Sym.Symbol.getName());
      std::string Key = llvm::join(Parts.begin(), Parts.end(), " ");
      this->Symbols.emplace_back(StringRef(Key), std::move(Sym));
    }
  }

  // Returns copies of all symbols whose tokenized name matches the query
  // regexp, anchored at the start of the name.
  std::vector<SymbolAndSignals> search(StringRef Query) override {
    llvm::Regex Matcher("^" + queryRegexp(tokenize(Query)));
    std::vector<SymbolAndSignals> Matches;
    for (const Entry &Candidate : Symbols) {
      if (!Matcher.match(Candidate.first))
        continue;
      Matches.push_back(Candidate.second);
    }
    return Matches;
  }

private:
  // first: space-joined tokens of the symbol name; second: the symbol itself.
  using Entry = std::pair<llvm::SmallString<32>, SymbolAndSignals>;
  std::vector<Entry> Symbols;
};
// Helpers for tokenize state machine.
//
// TokenizeState records which kind of token, if any, is being accumulated
// while tokenize() scans its input. Any state other than EMPTY means there are
// pending characters that become a token on the next flush.
enum TokenizeState {
EMPTY, // No pending characters.
// Read one uppercase letter, could be WORD or Word: another uppercase letter
// moves to BIG_WORD, a lowercase letter moves to SMALL_WORD.
ONE_BIG, // Read one uppercase letter, could be WORD or Word.
// Reading an uppercase WORD. A following lowercase letter ends it one
// character early, so the last capital starts the next word (FOOBar: FOO, Bar).
BIG_WORD, // Reading an uppercase WORD.
// Reading a lowercase word, possibly introduced by a single capital letter.
SMALL_WORD, // Reading a lowercase word.
// Reading a run of digits.
NUMBER // Reading a number.
};
// Coarse character classes that drive the tokenizer's state machine.
enum CharType { UPPER, LOWER, DIGIT, MISC };

// Classifies a character as an uppercase letter, a lowercase letter, a digit,
// or anything else (MISC).
//
// The <cctype> predicates have undefined behavior for negative arguments other
// than EOF, and plain `char` may be signed, so bytes >= 0x80 (e.g. from UTF-8
// text) must be converted to unsigned char before being classified.
CharType classify(char c) {
  const auto UC = static_cast<unsigned char>(c);
  if (isupper(UC))
    return UPPER;
  if (islower(UC))
    return LOWER;
  if (isdigit(UC))
    return DIGIT;
  return MISC;
}
} // namespace
// Splits Text into lowercased tokens. Token boundaries fall at case changes,
// at transitions between letters and digits, and at any character that is
// neither a letter nor a digit (such characters are dropped).
std::vector<std::string> FuzzySymbolIndex::tokenize(StringRef Text) {
  std::vector<std::string> Tokens;
  // Mode describes the pending text in [Begin, Pos). Emitting a token
  // consumes that text and moves Begin forward.
  TokenizeState Mode = EMPTY;
  size_t Begin = 0;
  auto Emit = [&](size_t Stop) {
    if (Mode != EMPTY)
      Tokens.push_back(Text.substr(Begin, Stop - Begin).lower());
    Mode = EMPTY;
    Begin = Stop;
  };

  const size_t Len = Text.size();
  for (size_t Pos = 0; Pos != Len; ++Pos) {
    switch (classify(Text[Pos])) {
    case MISC:
      // Separators end whatever was pending and are never part of a token.
      Emit(Pos);
      break;
    case LOWER:
      if (Mode == SMALL_WORD)
        break; // Still inside the same lowercase word.
      if (Mode == BIG_WORD)
        Emit(Pos - 1); // FOOBar: first token is FOO, not FOOB.
      else if (Mode != ONE_BIG)
        Emit(Pos); // Coming from EMPTY or NUMBER: start a fresh word here.
      // A single pending capital (ONE_BIG) simply becomes the word's head.
      Mode = SMALL_WORD;
      break;
    case UPPER:
      if (Mode == BIG_WORD)
        break; // Still inside the same uppercase word.
      if (Mode == ONE_BIG) {
        Mode = BIG_WORD; // Second capital in a row: this is a WORD.
        break;
      }
      Emit(Pos);
      Mode = ONE_BIG;
      break;
    case DIGIT:
      if (Mode != NUMBER) {
        Emit(Pos);
        Mode = NUMBER;
      }
      break;
    }
  }
  Emit(Len);
  return Tokens;
}
// Builds the (unanchored) regexp that a tokenized query must match against a
// symbol's space-joined tokens. Token characters are copied verbatim.
std::string
FuzzySymbolIndex::queryRegexp(const std::vector<std::string> &Tokens) {
  std::string Pattern;
  bool FirstToken = true;
  for (const std::string &Token : Tokens) {
    // Between query tokens: finish the current symbol token and require the
    // next query token to begin exactly at the next symbol token.
    if (!FirstToken)
      Pattern += "[[:alnum:]]* ";
    FirstToken = false;

    bool FirstChar = true;
    for (char C : Token) {
      // Within a query token: the next character either follows directly or
      // starts the next symbol token.
      if (!FirstChar)
        Pattern += "([[:alnum:]]* )?";
      FirstChar = false;
      Pattern += C;
    }
  }
  return Pattern;
}
// Reads the YAML symbol database at FilePath and wraps its symbols in an
// in-memory fuzzy index. Returns an error if the file cannot be read.
llvm::Expected<std::unique_ptr<FuzzySymbolIndex>>
FuzzySymbolIndex::createFromYAML(StringRef FilePath) {
  auto FileOrErr = llvm::MemoryBuffer::getFile(FilePath);
  if (!FileOrErr)
    return llvm::errorCodeToError(FileOrErr.getError());

  StringRef Contents = FileOrErr.get()->getBuffer();
  return llvm::make_unique<MemSymbolIndex>(
      find_all_symbols::ReadSymbolInfosFromYAML(Contents));
}
} // namespace include_fixer
} // namespace clang

View File

@ -0,0 +1,55 @@
//===--- FuzzySymbolIndex.h - Lookup symbols for autocomplete ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H
#define LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H
#include "SymbolIndex.h"
#include "find-all-symbols/SymbolInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include <string>
#include <vector>
namespace clang {
namespace include_fixer {
// A FuzzySymbolIndex retrieves top-level symbols matching a query string.
//
// It refines the contract of SymbolIndex::search to do fuzzy matching:
// - symbol names are tokenized: unique_ptr --> "unique ptr",
//   StringRef --> "string ref".
// - query must match prefixes of symbol tokens: [upt] matches "unique ptr".
// - if the query has multiple tokens, splits must match: [StR], not [STr].
// Helpers for tokenization and regex matching are provided.
//
// Implementations may choose to truncate results, refuse short queries, etc.
class FuzzySymbolIndex : public SymbolIndex {
public:
// Loads the specified include-fixer database and returns an index serving it.
// Returns an error if the file cannot be read.
static llvm::Expected<std::unique_ptr<FuzzySymbolIndex>>
createFromYAML(llvm::StringRef File);
// Helpers for implementing indexes:
// Transforms a symbol name or query into a sequence of tokens.
// Tokens are lowercased. Boundaries fall at case changes, between letters and
// digits, and at any other character; those other characters are dropped.
// - URLHandlerCallback --> [url, handler, callback]
// - snake_case11 --> [snake, case, 11]
// - _WTF$ --> [wtf]
static std::vector<std::string> tokenize(llvm::StringRef Text);
// Transforms query tokens into an unanchored regexp to match symbol tokens.
// The regexp is meant to be matched against symbol tokens joined by single
// spaces; callers that want the match to start at the first symbol token must
// prepend "^" themselves. Token characters are inserted verbatim, without
// escaping.
// - [fe f] --> /f(\w* )?e\w* f/, matches [fee fie foe].
//   (\w is shorthand here; the generated pattern spells it [[:alnum:]].)
static std::string queryRegexp(const std::vector<std::string> &Tokens);
};
} // namespace include_fixer
} // namespace clang
#endif // LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H

View File

@ -103,46 +103,44 @@ SymbolIndexManager::search(llvm::StringRef Identifier,
for (auto &SymAndSig : Symbols) {
const SymbolInfo &Symbol = SymAndSig.Symbol;
// Match the identifier name without qualifier.
if (Symbol.getName() == Names.back()) {
bool IsMatched = true;
auto SymbolContext = Symbol.getContexts().begin();
auto IdentiferContext = Names.rbegin() + 1; // Skip identifier name.
// Match the remaining context names.
while (IdentiferContext != Names.rend() &&
SymbolContext != Symbol.getContexts().end()) {
if (SymbolContext->second == *IdentiferContext) {
++IdentiferContext;
++SymbolContext;
} else if (SymbolContext->first ==
find_all_symbols::SymbolInfo::ContextType::EnumDecl) {
// Skip non-scoped enum context.
++SymbolContext;
} else {
IsMatched = false;
break;
}
bool IsMatched = true;
auto SymbolContext = Symbol.getContexts().begin();
auto IdentiferContext = Names.rbegin() + 1; // Skip identifier name.
// Match the remaining context names.
while (IdentiferContext != Names.rend() &&
SymbolContext != Symbol.getContexts().end()) {
if (SymbolContext->second == *IdentiferContext) {
++IdentiferContext;
++SymbolContext;
} else if (SymbolContext->first ==
find_all_symbols::SymbolInfo::ContextType::EnumDecl) {
// Skip non-scoped enum context.
++SymbolContext;
} else {
IsMatched = false;
break;
}
}
// If the name was qualified we only want to add results if we evaluated
// all contexts.
if (IsFullyQualified)
IsMatched &= (SymbolContext == Symbol.getContexts().end());
// If the name was qualified we only want to add results if we evaluated
// all contexts.
if (IsFullyQualified)
IsMatched &= (SymbolContext == Symbol.getContexts().end());
// FIXME: Support full match. At this point, we only find symbols in
// database which end with the same contexts with the identifier.
if (IsMatched && IdentiferContext == Names.rend()) {
// If we're in a situation where we took a prefix but the thing we
// found couldn't possibly have a nested member ignore it.
if (TookPrefix &&
(Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Function ||
Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Variable ||
Symbol.getSymbolKind() ==
SymbolInfo::SymbolKind::EnumConstantDecl ||
Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Macro))
continue;
// FIXME: Support full match. At this point, we only find symbols in
// database which end with the same contexts with the identifier.
if (IsMatched && IdentiferContext == Names.rend()) {
// If we're in a situation where we took a prefix but the thing we
// found couldn't possibly have a nested member ignore it.
if (TookPrefix &&
(Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Function ||
Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Variable ||
Symbol.getSymbolKind() ==
SymbolInfo::SymbolKind::EnumConstantDecl ||
Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Macro))
continue;
MatchedSymbols.push_back(std::move(SymAndSig));
}
MatchedSymbols.push_back(std::move(SymAndSig));
}
}
Names.pop_back();
@ -152,7 +150,7 @@ SymbolIndexManager::search(llvm::StringRef Identifier,
rank(MatchedSymbols, FileName);
// Strip signals, they are no longer needed.
std::vector<SymbolInfo> Res;
for (const auto &SymAndSig : MatchedSymbols)
for (auto &SymAndSig : MatchedSymbols)
Res.push_back(std::move(SymAndSig.Symbol));
return Res;
}

View File

@ -7,6 +7,7 @@
//
//===----------------------------------------------------------------------===//
#include "FuzzySymbolIndex.h"
#include "InMemorySymbolIndex.h"
#include "IncludeFixer.h"
#include "IncludeFixerContext.h"
@ -83,14 +84,16 @@ namespace {
cl::OptionCategory IncludeFixerCategory("Tool options");
enum DatabaseFormatTy {
fixed, ///< Hard-coded mapping.
yaml, ///< Yaml database created by find-all-symbols.
fixed, ///< Hard-coded mapping.
yaml, ///< Yaml database created by find-all-symbols.
fuzzyYaml, ///< Yaml database with fuzzy-matched identifiers.
};
cl::opt<DatabaseFormatTy> DatabaseFormat(
"db", cl::desc("Specify input format"),
cl::values(clEnumVal(fixed, "Hard-coded mapping"),
clEnumVal(yaml, "Yaml database created by find-all-symbols")),
clEnumVal(yaml, "Yaml database created by find-all-symbols"),
clEnumVal(fuzzyYaml, "Yaml database, with fuzzy-matched names")),
cl::init(yaml), cl::cat(IncludeFixerCategory));
cl::opt<std::string> Input("input",
@ -215,6 +218,21 @@ createSymbolIndexManager(StringRef FilePath) {
SymbolIndexMgr->addSymbolIndex(std::move(CreateYamlIdx));
break;
}
case fuzzyYaml: {
// This mode is not very useful, because we don't correct the identifier.
// Its main purpose is to expose FuzzySymbolIndex to tests.
SymbolIndexMgr->addSymbolIndex(
[]() -> std::unique_ptr<include_fixer::SymbolIndex> {
auto DB = include_fixer::FuzzySymbolIndex::createFromYAML(Input);
if (!DB) {
llvm::errs() << "Couldn't load fuzzy YAML db: "
<< llvm::toString(DB.takeError()) << '\n';
return nullptr;
}
return std::move(*DB);
});
break;
}
}
return SymbolIndexMgr;
}

View File

@ -10,6 +10,17 @@ Type: Class
Seen: 1
Used: 0
---
Name: foo_bar
Contexts:
- ContextType: Namespace
ContextName: a
- ContextType: Namespace
ContextName: b
FilePath: foobar.h
Type: Class
Seen: 0
Used: 0
---
Name: bar
Contexts:
- ContextType: Namespace

View File

@ -0,0 +1,9 @@
// RUN: sed -e 's#//.*$##' %s > %t.cpp
// RUN: clang-include-fixer -db=fuzzyYaml -input=%p/Inputs/fake_yaml_db.yaml %t.cpp --
// RUN: FileCheck %s -input-file=%t.cpp
// include-fixer will add the include, but doesn't complete the symbol.
// CHECK: #include "foobar.h"
// CHECK: fba f;
b::a::fba f;

View File

@ -13,6 +13,7 @@ include_directories(${CLANG_SOURCE_DIR})
add_extra_unittest(IncludeFixerTests
IncludeFixerTest.cpp
FuzzySymbolIndexTests.cpp
)
target_link_libraries(IncludeFixerTests

View File

@ -0,0 +1,61 @@
//===-- FuzzySymbolIndexTests.cpp - Fuzzy symbol index unit tests ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "FuzzySymbolIndex.h"
#include "gmock/gmock.h"
#include "llvm/Support/Regex.h"
#include "gtest/gtest.h"
using testing::ElementsAre;
using testing::Not;
namespace clang {
namespace include_fixer {
namespace {
// Verifies how tokenize() splits and lowercases identifiers.
TEST(FuzzySymbolIndexTest, Tokenize) {
// An acronym followed by capitalized words: the acronym's trailing capital
// belongs to the next word (URL + Handler, not URLH + andler).
EXPECT_THAT(FuzzySymbolIndex::tokenize("URLHandlerCallback"),
ElementsAre("url", "handler", "callback"));
// Underscores separate tokens, and a run of digits is a token of its own.
EXPECT_THAT(FuzzySymbolIndex::tokenize("snake_case11"),
ElementsAre("snake", "case", "11"));
// Punctuation and whitespace only act as separators and are dropped.
EXPECT_THAT(FuzzySymbolIndex::tokenize("__$42!!BOB\nbob"),
ElementsAre("42", "bob", "bob"));
}
// Matches a query regexp (the value under test, as produced by
// FuzzySymbolIndex::queryRegexp) against the symbol name `Identifier`.
//
// The regexp is anchored at the start and applied to the identifier's tokens
// joined by single spaces. An invalid regexp is reported through the result
// listener and counts as a mismatch.
MATCHER_P(MatchesSymbol, Identifier, "") {
  llvm::Regex Pattern("^" + arg);
  std::string Err;
  if (!Pattern.isValid(Err)) {
    *result_listener << "invalid regex: " << Err;
    return false;
  }
  auto Tokens = FuzzySymbolIndex::tokenize(Identifier);
  std::string Target = llvm::join(Tokens.begin(), Tokens.end(), " ");
  *result_listener << "matching against '" << Target << "'";
  // Match with the pattern that was just validated instead of compiling the
  // same regexp a second time.
  return Pattern.match(Target);
}
// Verifies which queries the generated regexp accepts for a given symbol.
TEST(FuzzySymbolIndexTest, QueryRegexp) {
// Tokenizes the raw query and builds its regexp, as an index would.
auto QueryRegexp = [](const std::string &query) {
return FuzzySymbolIndex::queryRegexp(FuzzySymbolIndex::tokenize(query));
};
// One letter per symbol token.
EXPECT_THAT(QueryRegexp("uhc"), MatchesSymbol("URLHandlerCallback"));
// A two-letter prefix of each symbol token.
EXPECT_THAT(QueryRegexp("urhaca"), MatchesSymbol("URLHandlerCallback"));
// "cb" is not a prefix of "callback", and no token after it starts with 'b'.
EXPECT_THAT(QueryRegexp("uhcb"), Not(MatchesSymbol("URLHandlerCallback")))
<< "Non-prefix";
// After 'u' the next character may only continue "url" or start the very next
// token ("handler"); it cannot jump over a token to reach "callback".
EXPECT_THAT(QueryRegexp("uc"), Not(MatchesSymbol("URLHandlerCallback")))
<< "Skip token";
// A single lowercase query token may span several symbol tokens.
EXPECT_THAT(QueryRegexp("uptr"), MatchesSymbol("unique_ptr"));
// The query is tokenized too ([uni, p]); its split lines up with the symbol's.
EXPECT_THAT(QueryRegexp("UniP"), MatchesSymbol("unique_ptr"));
}
} // namespace
} // namespace include_fixer
} // namespace clang