diff --git a/clang-tools-extra/include-fixer/CMakeLists.txt b/clang-tools-extra/include-fixer/CMakeLists.txt index 9313030106a7..3981fcd39162 100644 --- a/clang-tools-extra/include-fixer/CMakeLists.txt +++ b/clang-tools-extra/include-fixer/CMakeLists.txt @@ -6,6 +6,7 @@ add_clang_library(clangIncludeFixer IncludeFixer.cpp IncludeFixerContext.cpp InMemorySymbolIndex.cpp + FuzzySymbolIndex.cpp SymbolIndexManager.cpp YamlSymbolIndex.cpp diff --git a/clang-tools-extra/include-fixer/FuzzySymbolIndex.cpp b/clang-tools-extra/include-fixer/FuzzySymbolIndex.cpp new file mode 100644 index 000000000000..d91f4552ca74 --- /dev/null +++ b/clang-tools-extra/include-fixer/FuzzySymbolIndex.cpp @@ -0,0 +1,143 @@ +//===--- FuzzySymbolIndex.cpp - Lookup symbols for autocomplete -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#include "FuzzySymbolIndex.h" +#include "llvm/Support/Regex.h" + +using clang::find_all_symbols::SymbolAndSignals; +using llvm::StringRef; + +namespace clang { +namespace include_fixer { +namespace { + +class MemSymbolIndex : public FuzzySymbolIndex { +public: + MemSymbolIndex(std::vector Symbols) { + for (auto &Symbol : Symbols) { + auto Tokens = tokenize(Symbol.Symbol.getName()); + this->Symbols.emplace_back( + StringRef(llvm::join(Tokens.begin(), Tokens.end(), " ")), + std::move(Symbol)); + } + } + + std::vector search(StringRef Query) override { + auto Tokens = tokenize(Query); + llvm::Regex Pattern("^" + queryRegexp(Tokens)); + std::vector Results; + for (const Entry &E : Symbols) + if (Pattern.match(E.first)) + Results.push_back(E.second); + return Results; + } + +private: + using Entry = std::pair, SymbolAndSignals>; + std::vector Symbols; +}; + +// Helpers for tokenize state machine. +enum TokenizeState { + EMPTY, // No pending characters. + ONE_BIG, // Read one uppercase letter, could be WORD or Word. + BIG_WORD, // Reading an uppercase WORD. + SMALL_WORD, // Reading a lowercase word. + NUMBER // Reading a number. +}; + +enum CharType { UPPER, LOWER, DIGIT, MISC }; +CharType classify(char c) { + if (isupper(c)) + return UPPER; + if (islower(c)) + return LOWER; + if (isdigit(c)) + return DIGIT; + return MISC; +} + +} // namespace + +std::vector FuzzySymbolIndex::tokenize(StringRef Text) { + std::vector Result; + // State describes the treatment of text from Start to I. + // Once text is Flush()ed into Result, we're done with it and advance Start. + TokenizeState State = EMPTY; + size_t Start = 0; + auto Flush = [&](size_t End) { + if (State != EMPTY) { + Result.push_back(Text.substr(Start, End - Start).lower()); + State = EMPTY; + } + Start = End; + }; + for (size_t I = 0; I < Text.size(); ++I) { + CharType Type = classify(Text[I]); + if (Type == MISC) + Flush(I); + else if (Type == LOWER) + switch (State) { + case BIG_WORD: + Flush(I - 1); // FOOBar: first token is FOO, not FOOB. + LLVM_FALLTHROUGH; + case ONE_BIG: + State = SMALL_WORD; + LLVM_FALLTHROUGH; + case SMALL_WORD: + break; + default: + Flush(I); + State = SMALL_WORD; + } + else if (Type == UPPER) + switch (State) { + case ONE_BIG: + State = BIG_WORD; + LLVM_FALLTHROUGH; + case BIG_WORD: + break; + default: + Flush(I); + State = ONE_BIG; + } + else if (Type == DIGIT && State != NUMBER) { + Flush(I); + State = NUMBER; + } + } + Flush(Text.size()); + return Result; +} + +std::string +FuzzySymbolIndex::queryRegexp(const std::vector &Tokens) { + std::string Result; + for (size_t I = 0; I < Tokens.size(); ++I) { + if (I) + Result.append("[[:alnum:]]* "); + for (size_t J = 0; J < Tokens[I].size(); ++J) { + if (J) + Result.append("([[:alnum:]]* )?"); + Result.push_back(Tokens[I][J]); + } + } + return Result; +} + +llvm::Expected> +FuzzySymbolIndex::createFromYAML(StringRef FilePath) { + auto Buffer = llvm::MemoryBuffer::getFile(FilePath); + if (!Buffer) + return llvm::errorCodeToError(Buffer.getError()); + return llvm::make_unique( + find_all_symbols::ReadSymbolInfosFromYAML(Buffer.get()->getBuffer())); +} + +} // namespace include_fixer +} // namespace clang diff --git a/clang-tools-extra/include-fixer/FuzzySymbolIndex.h b/clang-tools-extra/include-fixer/FuzzySymbolIndex.h new file mode 100644 index 000000000000..245191eb5fd0 --- /dev/null +++ b/clang-tools-extra/include-fixer/FuzzySymbolIndex.h @@ -0,0 +1,55 @@ +//===--- FuzzySymbolIndex.h - Lookup symbols for autocomplete ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H +#define LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H + +#include "SymbolIndex.h" +#include "find-all-symbols/SymbolInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include +#include + +namespace clang { +namespace include_fixer { + +// A FuzzySymbolIndex retrieves top-level symbols matching a query string. +// +// It refines the contract of SymbolIndex::search to do fuzzy matching: +// - symbol names are tokenized: "unique ptr", "string ref". +// - query must match prefixes of symbol tokens: [upt] +// - if the query has multiple tokens, splits must match: [StR], not [STr]. +// Helpers for tokenization and regex matching are provided. +// +// Implementations may choose to truncate results, refuse short queries, etc. +class FuzzySymbolIndex : public SymbolIndex { +public: + // Loads the specified include-fixer database and returns an index serving it. + static llvm::Expected> + createFromYAML(llvm::StringRef File); + + // Helpers for implementing indexes: + + // Transforms a symbol name or query into a sequence of tokens. + // - URLHandlerCallback --> [url, handler, callback] + // - snake_case11 --> [snake, case, 11] + // - _WTF$ --> [wtf] + static std::vector tokenize(llvm::StringRef Text); + + // Transforms query tokens into an unanchored regexp to match symbol tokens. + // - [fe f] --> /f(\w* )?e\w* f/, matches [fee fie foe]. + static std::string queryRegexp(const std::vector &Tokens); +}; + +} // namespace include_fixer +} // namespace clang + +#endif // LLVM_CLANG_TOOLS_EXTRA_INCLUDE_FIXER_FUZZY_SYMBOL_INDEX_H diff --git a/clang-tools-extra/include-fixer/SymbolIndexManager.cpp b/clang-tools-extra/include-fixer/SymbolIndexManager.cpp index 1cb309761ea9..31ecc613d1ca 100644 --- a/clang-tools-extra/include-fixer/SymbolIndexManager.cpp +++ b/clang-tools-extra/include-fixer/SymbolIndexManager.cpp @@ -103,46 +103,44 @@ SymbolIndexManager::search(llvm::StringRef Identifier, for (auto &SymAndSig : Symbols) { const SymbolInfo &Symbol = SymAndSig.Symbol; // Match the identifier name without qualifier. - if (Symbol.getName() == Names.back()) { - bool IsMatched = true; - auto SymbolContext = Symbol.getContexts().begin(); - auto IdentiferContext = Names.rbegin() + 1; // Skip identifier name. - // Match the remaining context names. - while (IdentiferContext != Names.rend() && - SymbolContext != Symbol.getContexts().end()) { - if (SymbolContext->second == *IdentiferContext) { - ++IdentiferContext; - ++SymbolContext; - } else if (SymbolContext->first == - find_all_symbols::SymbolInfo::ContextType::EnumDecl) { - // Skip non-scoped enum context. - ++SymbolContext; - } else { - IsMatched = false; - break; - } + bool IsMatched = true; + auto SymbolContext = Symbol.getContexts().begin(); + auto IdentiferContext = Names.rbegin() + 1; // Skip identifier name. + // Match the remaining context names. + while (IdentiferContext != Names.rend() && + SymbolContext != Symbol.getContexts().end()) { + if (SymbolContext->second == *IdentiferContext) { + ++IdentiferContext; + ++SymbolContext; + } else if (SymbolContext->first == + find_all_symbols::SymbolInfo::ContextType::EnumDecl) { + // Skip non-scoped enum context. + ++SymbolContext; + } else { + IsMatched = false; + break; } + } - // If the name was qualified we only want to add results if we evaluated - // all contexts. - if (IsFullyQualified) - IsMatched &= (SymbolContext == Symbol.getContexts().end()); + // If the name was qualified we only want to add results if we evaluated + // all contexts. + if (IsFullyQualified) + IsMatched &= (SymbolContext == Symbol.getContexts().end()); - // FIXME: Support full match. At this point, we only find symbols in - // database which end with the same contexts with the identifier. - if (IsMatched && IdentiferContext == Names.rend()) { - // If we're in a situation where we took a prefix but the thing we - // found couldn't possibly have a nested member ignore it. - if (TookPrefix && - (Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Function || - Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Variable || - Symbol.getSymbolKind() == - SymbolInfo::SymbolKind::EnumConstantDecl || - Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Macro)) - continue; + // FIXME: Support full match. At this point, we only find symbols in + // database which end with the same contexts with the identifier. + if (IsMatched && IdentiferContext == Names.rend()) { + // If we're in a situation where we took a prefix but the thing we + // found couldn't possibly have a nested member ignore it. + if (TookPrefix && + (Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Function || + Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Variable || + Symbol.getSymbolKind() == + SymbolInfo::SymbolKind::EnumConstantDecl || + Symbol.getSymbolKind() == SymbolInfo::SymbolKind::Macro)) + continue; - MatchedSymbols.push_back(std::move(SymAndSig)); - } + MatchedSymbols.push_back(std::move(SymAndSig)); } } Names.pop_back(); @@ -152,7 +150,7 @@ SymbolIndexManager::search(llvm::StringRef Identifier, rank(MatchedSymbols, FileName); // Strip signals, they are no longer needed. std::vector Res; - for (const auto &SymAndSig : MatchedSymbols) + for (auto &SymAndSig : MatchedSymbols) Res.push_back(std::move(SymAndSig.Symbol)); return Res; } diff --git a/clang-tools-extra/include-fixer/tool/ClangIncludeFixer.cpp b/clang-tools-extra/include-fixer/tool/ClangIncludeFixer.cpp index 2740c4d23a24..9db129cd7e03 100644 --- a/clang-tools-extra/include-fixer/tool/ClangIncludeFixer.cpp +++ b/clang-tools-extra/include-fixer/tool/ClangIncludeFixer.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "FuzzySymbolIndex.h" #include "InMemorySymbolIndex.h" #include "IncludeFixer.h" #include "IncludeFixerContext.h" @@ -83,14 +84,16 @@ namespace { cl::OptionCategory IncludeFixerCategory("Tool options"); enum DatabaseFormatTy { - fixed, ///< Hard-coded mapping. - yaml, ///< Yaml database created by find-all-symbols. + fixed, ///< Hard-coded mapping. + yaml, ///< Yaml database created by find-all-symbols. + fuzzyYaml, ///< Yaml database with fuzzy-matched identifiers. }; cl::opt DatabaseFormat( "db", cl::desc("Specify input format"), cl::values(clEnumVal(fixed, "Hard-coded mapping"), - clEnumVal(yaml, "Yaml database created by find-all-symbols")), + clEnumVal(yaml, "Yaml database created by find-all-symbols"), + clEnumVal(fuzzyYaml, "Yaml database, with fuzzy-matched names")), cl::init(yaml), cl::cat(IncludeFixerCategory)); cl::opt Input("input", @@ -215,6 +218,21 @@ createSymbolIndexManager(StringRef FilePath) { SymbolIndexMgr->addSymbolIndex(std::move(CreateYamlIdx)); break; } + case fuzzyYaml: { + // This mode is not very useful, because we don't correct the identifier. + // It's main purpose is to expose FuzzySymbolIndex to tests. + SymbolIndexMgr->addSymbolIndex( + []() -> std::unique_ptr { + auto DB = include_fixer::FuzzySymbolIndex::createFromYAML(Input); + if (!DB) { + llvm::errs() << "Couldn't load fuzzy YAML db: " + << llvm::toString(DB.takeError()) << '\n'; + return nullptr; + } + return std::move(*DB); + }); + break; + } } return SymbolIndexMgr; } diff --git a/clang-tools-extra/test/include-fixer/Inputs/fake_yaml_db.yaml b/clang-tools-extra/test/include-fixer/Inputs/fake_yaml_db.yaml index 654384d5c6fd..b599005e2c3f 100644 --- a/clang-tools-extra/test/include-fixer/Inputs/fake_yaml_db.yaml +++ b/clang-tools-extra/test/include-fixer/Inputs/fake_yaml_db.yaml @@ -10,6 +10,17 @@ Type: Class Seen: 1 Used: 0 --- +Name: foo_bar +Contexts: + - ContextType: Namespace + ContextName: a + - ContextType: Namespace + ContextName: b +FilePath: foobar.h +Type: Class +Seen: 0 +Used: 0 +--- Name: bar Contexts: - ContextType: Namespace diff --git a/clang-tools-extra/test/include-fixer/yaml_fuzzy.cpp b/clang-tools-extra/test/include-fixer/yaml_fuzzy.cpp new file mode 100644 index 000000000000..705542dd0d10 --- /dev/null +++ b/clang-tools-extra/test/include-fixer/yaml_fuzzy.cpp @@ -0,0 +1,9 @@ +// RUN: sed -e 's#//.*$##' %s > %t.cpp +// RUN: clang-include-fixer -db=fuzzyYaml -input=%p/Inputs/fake_yaml_db.yaml %t.cpp -- +// RUN: FileCheck %s -input-file=%t.cpp + +// include-fixer will add the include, but doesn't complete the symbol. +// CHECK: #include "foobar.h" +// CHECK: fba f; + +b::a::fba f; diff --git a/clang-tools-extra/unittests/include-fixer/CMakeLists.txt b/clang-tools-extra/unittests/include-fixer/CMakeLists.txt index f82a1876bd18..caab1cb3974d 100644 --- a/clang-tools-extra/unittests/include-fixer/CMakeLists.txt +++ b/clang-tools-extra/unittests/include-fixer/CMakeLists.txt @@ -13,6 +13,7 @@ include_directories(${CLANG_SOURCE_DIR}) add_extra_unittest(IncludeFixerTests IncludeFixerTest.cpp + FuzzySymbolIndexTests.cpp ) target_link_libraries(IncludeFixerTests diff --git a/clang-tools-extra/unittests/include-fixer/FuzzySymbolIndexTests.cpp b/clang-tools-extra/unittests/include-fixer/FuzzySymbolIndexTests.cpp new file mode 100644 index 000000000000..17679c5d379c --- /dev/null +++ b/clang-tools-extra/unittests/include-fixer/FuzzySymbolIndexTests.cpp @@ -0,0 +1,61 @@ +//===-- FuzzySymbolIndexTests.cpp - Fuzzy symbol index unit tests ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "FuzzySymbolIndex.h" +#include "gmock/gmock.h" +#include "llvm/Support/Regex.h" +#include "gtest/gtest.h" + +using testing::ElementsAre; +using testing::Not; + +namespace clang { +namespace include_fixer { +namespace { + +TEST(FuzzySymbolIndexTest, Tokenize) { + EXPECT_THAT(FuzzySymbolIndex::tokenize("URLHandlerCallback"), + ElementsAre("url", "handler", "callback")); + EXPECT_THAT(FuzzySymbolIndex::tokenize("snake_case11"), + ElementsAre("snake", "case", "11")); + EXPECT_THAT(FuzzySymbolIndex::tokenize("__$42!!BOB\nbob"), + ElementsAre("42", "bob", "bob")); +} + +MATCHER_P(MatchesSymbol, Identifier, "") { + llvm::Regex Pattern("^" + arg); + std::string err; + if (!Pattern.isValid(err)) { + *result_listener << "invalid regex: " << err; + return false; + } + auto Tokens = FuzzySymbolIndex::tokenize(Identifier); + std::string Target = llvm::join(Tokens.begin(), Tokens.end(), " "); + *result_listener << "matching against '" << Target << "'"; + return llvm::Regex("^" + arg).match(Target); +} + +TEST(FuzzySymbolIndexTest, QueryRegexp) { + auto QueryRegexp = [](const std::string &query) { + return FuzzySymbolIndex::queryRegexp(FuzzySymbolIndex::tokenize(query)); + }; + EXPECT_THAT(QueryRegexp("uhc"), MatchesSymbol("URLHandlerCallback")); + EXPECT_THAT(QueryRegexp("urhaca"), MatchesSymbol("URLHandlerCallback")); + EXPECT_THAT(QueryRegexp("uhcb"), Not(MatchesSymbol("URLHandlerCallback"))) + << "Non-prefix"; + EXPECT_THAT(QueryRegexp("uc"), Not(MatchesSymbol("URLHandlerCallback"))) + << "Skip token"; + + EXPECT_THAT(QueryRegexp("uptr"), MatchesSymbol("unique_ptr")); + EXPECT_THAT(QueryRegexp("UniP"), MatchesSymbol("unique_ptr")); +} + +} // namespace +} // namespace include_fixer +} // namespace clang