[clangd] Boost code completion results that were named in the last few lines.

Summary:
The hope is this will catch a few patterns with repetition:

  SomeClass* S = ^SomeClass::Create()

  int getFrobnicator() { return ^frobnicator_; }

  // discard the factory, it's no longer valid.
  ^MyFactory.reset();

Without triggering antipatterns too often:

  return Point(x.first, x.^second);

I'm going to gather some data on whether this turns out to be a win overall.

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, jfb, kadircet, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D61537

llvm-svn: 360030
2019-05-06 10:25:10 +00:00 · 2019-05-06 10:25:10 +00:00 · 9fb22b2c86
parent 4c3d579096
commit 9fb22b2c86
10 changed files with 142 additions and 6 deletions
--- a/clang-tools-extra/clangd/CodeComplete.cpp
+++ b/clang-tools-extra/clangd/CodeComplete.cpp
@ -54,7 +54,9 @@
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadic.h"
@ -1215,6 +1217,7 @@ class CodeCompleteFlow {
  llvm::Optional<OpaqueType> PreferredType; // Initialized once Sema runs.
  // Whether to query symbols from any scope. Initialized once Sema runs.
  bool AllScopes = false;
+  llvm::StringSet<> ContextWords;
  // Include-insertion and proximity scoring rely on the include structure.
  // This is available after Sema has run.
  llvm::Optional<IncludeInserter> Inserter;  // Available during runWithSema.
@ -1237,6 +1240,7 @@ public:
    trace::Span Tracer("CodeCompleteFlow");
    HeuristicPrefix =
        guessCompletionPrefix(SemaCCInput.Contents, SemaCCInput.Offset);
+    populateContextWords(SemaCCInput.Contents);
    if (Opts.Index && SpecFuzzyFind && SpecFuzzyFind->CachedReq.hasValue()) {
      assert(!SpecFuzzyFind->Result.valid());
      SpecReq = speculativeFuzzyFindRequestForCompletion(
@ -1323,6 +1327,7 @@ public:
    trace::Span Tracer("CodeCompleteWithoutSema");
    // Fill in fields normally set by runWithSema()
    HeuristicPrefix = guessCompletionPrefix(Content, Offset);
+    populateContextWords(Content);
    CCContextKind = CodeCompletionContext::CCC_Recovery;
    Filter = FuzzyMatcher(HeuristicPrefix.Name);
    auto Pos = offsetToPosition(Content, Offset);
@ -1380,6 +1385,24 @@ public:
  }

 private:
+  void populateContextWords(llvm::StringRef Content) {
+    // Take (at most) the last 3 lines before the completion point.
+    unsigned RangeEnd = HeuristicPrefix.Qualifier.begin() - Content.data(),
+             RangeBegin = RangeEnd; // Range ends where the qualifier starts.
+    for (size_t I = 0; I < 3 && RangeBegin > 0; ++I) {
+      auto PrevNL = Content.rfind('\n', RangeBegin - 1);
+      if (PrevNL == StringRef::npos) { // No earlier newline: use all of it.
+        RangeBegin = 0;
+        break;
+      }
+      RangeBegin = PrevNL + 1; // Just past the newline that was found.
+    }
+
+    ContextWords = collectWords(Content.slice(RangeBegin, RangeEnd));
+    dlog("Completion context words: {0}",
+         llvm::join(ContextWords.keys(), ", "));
+  }
+
  // This is called by run() once Sema code completion is done, but before the
  // Sema data structures are torn down. It does all the real work.
  CodeCompleteResult runWithSema() {
@ -1563,12 +1586,14 @@ private:
    SymbolQualitySignals Quality;
    SymbolRelevanceSignals Relevance;
    Relevance.Context = CCContextKind;
+    Relevance.Name = Bundle.front().Name;
    Relevance.Query = SymbolRelevanceSignals::CodeComplete;
    Relevance.FileProximityMatch = FileProximity.getPointer();
    if (ScopeProximity)
      Relevance.ScopeProximityMatch = ScopeProximity.getPointer();
    if (PreferredType)
      Relevance.HadContextType = true;
+    Relevance.ContextWords = &ContextWords;

    auto &First = Bundle.front();
    if (auto FuzzyScore = fuzzyScore(First))
--- a/clang-tools-extra/clangd/FindSymbols.cpp
+++ b/clang-tools-extra/clangd/FindSymbols.cpp
@ -100,6 +100,7 @@ getWorkspaceSymbols(llvm::StringRef Query, int Limit,
    SymbolQualitySignals Quality;
    Quality.merge(Sym);
    SymbolRelevanceSignals Relevance;
+    Relevance.Name = Sym.Name;
    Relevance.Query = SymbolRelevanceSignals::Generic;
    if (auto NameMatch = Filter.match(Sym.Name))
      Relevance.NameMatch = *NameMatch;
--- a/clang-tools-extra/clangd/Quality.cpp
+++ b/clang-tools-extra/clangd/Quality.cpp
@ -336,6 +336,15 @@ static float scopeBoost(ScopeDistance &Distance,
  return std::max(0.65, 2.0 * std::pow(0.6, D / 2.0));
 }

+static llvm::Optional<llvm::StringRef> // A context word found in Name, if any.
+wordMatching(llvm::StringRef Name, const llvm::StringSet<> *ContextWords) {
+  if (ContextWords) // Null means no context words were supplied.
+    for (const auto& Word : ContextWords->keys())
+      if (Name.contains_lower(Word)) // Case-insensitive substring test.
+        return Word; // First hit in the set's iteration order.
+  return llvm::None;
+}
+
 float SymbolRelevanceSignals::evaluate() const {
  float Score = 1;

@ -357,6 +366,9 @@ float SymbolRelevanceSignals::evaluate() const {
    Score *=
        SemaSaysInScope ? 2.0 : scopeBoost(*ScopeProximityMatch, SymbolScope);

+  if (wordMatching(Name, ContextWords))
+    Score *= 1.5;
+
  // Symbols like local variables may only be referenced within their scope.
  // Conversely if we're in that scope, it's likely we'll reference them.
  if (Query == CodeComplete) {
@ -413,7 +425,12 @@ float SymbolRelevanceSignals::evaluate() const {
 llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
                              const SymbolRelevanceSignals &S) {
  OS << llvm::formatv("=== Symbol relevance: {0}\n", S.evaluate());
+  OS << llvm::formatv("\tName: {0}\n", S.Name);
  OS << llvm::formatv("\tName match: {0}\n", S.NameMatch);
+  if (S.ContextWords)
+    OS << llvm::formatv(
+        "\tMatching context word: {0}\n",
+        wordMatching(S.Name, S.ContextWords).getValueOr("<none>"));
  OS << llvm::formatv("\tForbidden: {0}\n", S.Forbidden);
  OS << llvm::formatv("\tNeedsFixIts: {0}\n", S.NeedsFixIts);
  OS << llvm::formatv("\tIsInstanceMember: {0}\n", S.IsInstanceMember);
--- a/clang-tools-extra/clangd/Quality.h
+++ b/clang-tools-extra/clangd/Quality.h
@ -32,13 +32,14 @@
 #include "clang/Sema/CodeCompleteConsumer.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
 #include <algorithm>
 #include <functional>
 #include <vector>

 namespace llvm {
 class raw_ostream;
-}
+} // namespace llvm

 namespace clang {
 class CodeCompletionResult;
@ -84,8 +85,12 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &,

 /// Attributes of a symbol-query pair that affect how much we like it.
 struct SymbolRelevanceSignals {
+  /// The name of the symbol (for ContextWords). Must be explicitly assigned.
+  llvm::StringRef Name;
  /// 0-1+ fuzzy-match score for unqualified name. Must be explicitly assigned.
  float NameMatch = 1;
+  /// Lowercase words relevant to the context (e.g. near the completion point).
+  llvm::StringSet<>* ContextWords = nullptr;
  bool Forbidden = false; // Unavailable (e.g const) or inaccessible (private).
  /// Whether fixits needs to be applied for that completion or not.
  bool NeedsFixIts = false;
--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@ -8,6 +8,7 @@
 #include "SourceCode.h"

 #include "Context.h"
+#include "FuzzyMatch.h"
 #include "Logger.h"
 #include "Protocol.h"
 #include "clang/AST/ASTContext.h"
@ -18,6 +19,7 @@
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
@ -602,5 +604,43 @@ std::vector<std::string> visibleNamespaces(llvm::StringRef Code,
  return Found;
 }

+llvm::StringSet<> collectWords(llvm::StringRef Content) {
+  // We assume short words are not significant.
+  // We may want to consider other stopwords, e.g. language keywords.
+  // (A very naive implementation showed no benefit, but lexing might do better)
+  static constexpr int MinWordLength = 4; // Shorter words are dropped.
+
+  std::vector<CharRole> Roles(Content.size()); // One role per character.
+  calculateRoles(Content, Roles);
+
+  llvm::StringSet<> Result;
+  llvm::SmallString<256> Word;
+  auto Flush = [&] { // Ends the current word; keeps it only if long enough.
+    if (Word.size() >= MinWordLength) {
+      for (char &C : Word)
+        C = llvm::toLower(C); // Stored words are always lowercase.
+      Result.insert(Word);
+    }
+    Word.clear();
+  };
+  for (unsigned I = 0; I < Content.size(); ++I) {
+    switch (Roles[I]) {
+    case Head: // Starts a new word: finish the previous one first.
+      Flush();
+      LLVM_FALLTHROUGH;
+    case Tail: // Extends the word currently being built.
+      Word.push_back(Content[I]);
+      break;
+    case Unknown:
+    case Separator: // Not part of any word: ends the current one.
+      Flush();
+      break;
+    }
+  }
+  Flush(); // Don't lose a word that runs up to the end of Content.
+
+  return Result;
+}
+
 } // namespace clangd
 } // namespace clang
--- a/clang-tools-extra/clangd/SourceCode.h
+++ b/clang-tools-extra/clangd/SourceCode.h
@ -20,6 +20,7 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Format/Format.h"
 #include "clang/Tooling/Core/Replacement.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/SHA1.h"

@ -165,6 +166,13 @@ cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces,
 llvm::StringMap<unsigned> collectIdentifiers(llvm::StringRef Content,
                                             const format::FormatStyle &Style);

+/// Collects the set of distinct, lowercased words from the source code.
+/// Unlike collectIdentifiers:
+/// - also finds text in comments
+/// - splits text into words
+/// - drops short words (fewer than 4 characters) like "get" and "for"
+llvm::StringSet<> collectWords(llvm::StringRef Content);
+
 /// Heuristically determine namespaces visible at a point, without parsing Code.
 /// This considers using-directives and enclosing namespace-declarations that
 /// are visible (and not obfuscated) in the file itself (not headers).
--- a/clang-tools-extra/clangd/test/completion-auto-trigger.test
+++ b/clang-tools-extra/clangd/test/completion-auto-trigger.test
@ -23,7 +23,7 @@
 # CHECK-NEXT:        "insertTextFormat": 1,
 # CHECK-NEXT:        "kind": 5,
 # CHECK-NEXT:        "label": " size",
-# CHECK-NEXT:        "sortText": "3eacccccsize",
+# CHECK-NEXT:        "sortText": "{{.*}}size",
 # CHECK-NEXT:        "textEdit": {
 # CHECK-NEXT:          "newText": "size",
 # CHECK-NEXT:          "range": {
@ -45,7 +45,7 @@
 # CHECK-NEXT:         "insertTextFormat": 1,
 # CHECK-NEXT:         "kind": 10,
 # CHECK-NEXT:         "label": " default_capacity",
-# CHECK-NEXT:         "sortText": "3fd70a3ddefault_capacity",
+# CHECK-NEXT:         "sortText": "{{.*}}default_capacity",
 # CHECK-NEXT:         "textEdit": {
 # CHECK-NEXT:           "newText": "default_capacity",
 # CHECK-NEXT:           "range": {
@ -84,7 +84,7 @@
 # CHECK-NEXT:        "insertTextFormat": 1,
 # CHECK-NEXT:        "kind": 6,
 # CHECK-NEXT:        "label": " ns_member",
-# CHECK-NEXT:        "sortText": "3f2cccccns_member",
+# CHECK-NEXT:        "sortText": "{{.*}}ns_member",
 # CHECK-NEXT:        "textEdit": {
 # CHECK-NEXT:          "newText": "ns_member",
 # CHECK-NEXT:          "range": {
--- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
@ -174,6 +174,7 @@ struct ClassWithMembers {
  int BBB();
  int CCC();
 };
+
 int main() { ClassWithMembers().^ }
      )cpp",
                             /*IndexSymbols=*/{}, Opts);
@ -324,7 +325,7 @@ TEST(CompletionTest, CompletionOptions) {
  }
 }

-TEST(CompletionTest, Priorities) {
+TEST(CompletionTest, Accessible) {
  auto Internal = completions(R"cpp(
      class Foo {
        public: void pub();
@ -334,7 +335,7 @@ TEST(CompletionTest, Priorities) {
      void Foo::pub() { this->^ }
  )cpp");
  EXPECT_THAT(Internal.Completions,
-              HasSubsequence(Named("priv"), Named("prot"), Named("pub")));
+              AllOf(Has("priv"), Has("prot"), Has("pub")));

  auto External = completions(R"cpp(
      class Foo {
@ -502,6 +503,21 @@ TEST(CompletionTest, ReferencesAffectRanking) {
              HasSubsequence(Named("absl"), Named("absb")));
 }

+TEST(CompletionTest, ContextWords) {
+  auto Results = completions(R"cpp(
+  enum class Color { RED, YELLOW, BLUE };
+
+  // (blank lines so the definition above isn't "context")
+
+  // "It was a yellow car," he said. "Big yellow car, new."
+  auto Finish = Color::^
+  )cpp");
+  // Yellow would normally sort last (alphabetic).
+  // But the recent mention should bump it up.
+  ASSERT_THAT(Results.Completions,
+              HasSubsequence(Named("YELLOW"), Named("BLUE")));
+}
+
 TEST(CompletionTest, GlobalQualified) {
  auto Results = completions(
      R"cpp(
--- a/clang-tools-extra/clangd/unittests/QualityTests.cpp
+++ b/clang-tools-extra/clangd/unittests/QualityTests.cpp
@ -292,6 +292,16 @@ TEST(QualityTests, SymbolRelevanceSignalsSanity) {
  SymbolRelevanceSignals InBaseClass;
  InBaseClass.InBaseClass = true;
  EXPECT_LT(InBaseClass.evaluate(), Default.evaluate());
+
+  llvm::StringSet<> Words = {"one", "two", "three"};
+  SymbolRelevanceSignals WithoutMatchingWord;
+  WithoutMatchingWord.ContextWords = &Words;
+  WithoutMatchingWord.Name = "four";
+  EXPECT_EQ(WithoutMatchingWord.evaluate(), Default.evaluate());
+  SymbolRelevanceSignals WithMatchingWord;
+  WithMatchingWord.ContextWords = &Words;
+  WithMatchingWord.Name = "TheTwoTowers";
+  EXPECT_GT(WithMatchingWord.evaluate(), Default.evaluate());
 }

 TEST(QualityTests, ScopeProximity) {
--- a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp
@ -22,6 +22,7 @@ namespace {

 using llvm::Failed;
 using llvm::HasValue;
+using ::testing::UnorderedElementsAreArray;

 MATCHER_P2(Pos, Line, Col, "") {
  return arg.line == int(Line) && arg.character == int(Col);
@ -322,6 +323,19 @@ TEST(SourceCodeTests, CollectIdentifiers) {
  EXPECT_EQ(IDs["foo"], 2u);
 }

+TEST(SourceCodeTests, CollectWords) {
+  auto Words = collectWords(R"cpp(
+  #define FIZZ_BUZZ
+  // this is a comment
+  std::string getSomeText() { return "magic word"; }
+  )cpp");
+  std::set<std::string> ActualWords(Words.keys().begin(), Words.keys().end());
+  std::set<std::string> ExpectedWords = {"define",  "fizz",    "buzz",  "this",
+                                         "comment", "string", "some", "text",
+                                         "return",  "magic",  "word"};
+  EXPECT_EQ(ActualWords, ExpectedWords); // "is", "a", "std", "get": too short.
+}
+
 TEST(SourceCodeTests, VisibleNamespaces) {
  std::vector<std::pair<const char *, std::vector<std::string>>> Cases = {
      {