[clangd] Tune the fuzzy-matching algorithm

Summary: To reduce the gap between prefix and initialism matches. The motivation is producing better scoring in one particular example, but the change does not seem to cause large regressions in other cases. The example is matching 'up' against 'unique_ptr' and 'upper_bound'. Before the change, we had: - "[u]nique_[p]tr" with a score of 0.3, - "[up]per_bound" with a score of 1.0. A 3x difference meant that symbol quality signals were almost always ignored and 'upper_bound' was always ranked higher. However, intuitively, the match scores should be very close for the two. After the change, we have the following scores: - "[u]nique_[p]tr" with a score of 0.75, - "[up]per_bound" with a score of 1.0. Reviewers: ioeric Reviewed By: ioeric Subscribers: MaskRay, jkorous, arphaman, kadircet, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D59300 llvm-svn: 356261
2019-03-15 14:00:49 +00:00 · 2019-03-15 14:00:49 +00:00 · 373bee85c2
parent 339daae806
commit 373bee85c2
2 changed files with 28 additions and 12 deletions
--- a/clang-tools-extra/clangd/FuzzyMatch.cpp
+++ b/clang-tools-extra/clangd/FuzzyMatch.cpp
@ -71,7 +71,7 @@ static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
 // Score field is 15 bits wide, min value is -2^14, we use half of that.
 static constexpr int AwfulScore = -(1 << 13);
 static bool isAwful(int S) { return S < AwfulScore / 2; }
-static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score.
+static constexpr int PerfectBonus = 4; // Perfect per-pattern-char score.
 FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
    : PatN(std::min<int>(MaxPat, Pattern.size())),
@ -267,24 +267,31 @@ bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
 }
 int FuzzyMatcher::skipPenalty(int W, Action Last) const {
-  int S = 0;
+  if (W == 0) // Skipping the first character.
    return 3;
  if (WordRole[W] == Head) // Skipping a segment.
-    S += 1;
+    return 1; // We want to keep this lower than a consecutive match bonus.
-  if (Last == Match) // Non-consecutive match.
+  // Instead of penalizing non-consecutive matches, we give a bonus to a
-    S += 2;          // We'd rather skip a segment than split our match.
+  // consecutive match in matchBonus. This produces a better score distribution
-  return S;
+  // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
  return 0;
 }
 int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
  assert(LowPat[P] == LowWord[W]);
  int S = 1;
-  // Bonus: pattern so far is a (case-insensitive) prefix of the word.
+  bool IsPatSingleCase =
-  if (P == W) // We can't skip pattern characters, so we must have matched all.
+      (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper);
    ++S;
  // Bonus: case matches, or a Head in the pattern aligns with one in the word.
-  if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) ||
+  // Single-case patterns lack segmentation signals and we assume any character
-      (PatRole[P] == Head && WordRole[W] == Head))
+  // can be a head of a segment.
  if (Pat[P] == Word[W] ||
      (WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head)))
    ++S;
  // Bonus: a consecutive match. First character match also gets a bonus to
  // ensure prefix final match score normalizes to 1.0.
  if (W == 0 || Last == Match)
    S += 2;
  // Penalty: matching inside a segment (and previous char wasn't matched).
  if (WordRole[W] == Tail && P && Last == Miss)
    S -= 3;
--- a/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp
+++ b/clang-tools-extra/unittests/clangd/FuzzyMatchTests.cpp
@ -9,6 +9,7 @@
 #include "FuzzyMatch.h"
 #include "llvm/ADT/StringExtras.h"
 #include "gmock/gmock-matchers.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@ -247,6 +248,8 @@ TEST(FuzzyMatch, Ranking) {
  EXPECT_THAT("foo", ranks("[foo]", "[Foo]"));
  EXPECT_THAT("onMes",
              ranks("[onMes]sage", "[onmes]sage", "[on]This[M]ega[Es]capes"));
  EXPECT_THAT("onmes",
              ranks("[onmes]sage", "[onMes]sage", "[on]This[M]ega[Es]capes"));
  EXPECT_THAT("CC", ranks("[C]amel[C]ase", "[c]amel[C]ase"));
  EXPECT_THAT("cC", ranks("[c]amel[C]ase", "[C]amel[C]ase"));
  EXPECT_THAT("p", ranks("[p]", "[p]arse", "[p]osix", "[p]afdsa", "[p]ath"));
@ -270,12 +273,18 @@ TEST(FuzzyMatch, Ranking) {
 // Verify some bounds so we know scores fall in the right range.
 // Testing exact scores is fragile, so we prefer Ranking tests.
 TEST(FuzzyMatch, Scoring) {
-  EXPECT_THAT("abs", matches("[a]w[B]xYz[S]", 0.f));
+  EXPECT_THAT("abs", matches("[a]w[B]xYz[S]", 7.f / 12.f));
  EXPECT_THAT("abs", matches("[abs]l", 1.f));
  EXPECT_THAT("abs", matches("[abs]", 2.f));
  EXPECT_THAT("Abs", matches("[abs]", 2.f));
 }
 TEST(FuzzyMatch, InitialismAndPrefix) {
  // We want these scores to be roughly the same.
  EXPECT_THAT("up", matches("[u]nique_[p]tr", 3.f / 4.f));
  EXPECT_THAT("up", matches("[up]per_bound", 1.f));
 }
 // Returns pretty-printed segmentation of Text.
 // e.g. std::basic_string --> +--  +---- +-----
 std::string segment(llvm::StringRef Text) {