[clangd] Tune the fuzzy-matching algorithm

Summary:
This change reduces the gap between the scores of prefix and initialism matches.

The motivation is producing better scoring in one particular example,
but the change does not seem to cause large regressions in other cases.

The example is matching 'up' against 'unique_ptr' and 'upper_bound'.
Before the change, we had:
  - "[u]nique_[p]tr" with a score of 0.3,
  - "[up]per_bound" with a score of 1.0.

A 3x difference meant that symbol quality signals were almost always ignored
and 'upper_bound' was always ranked higher.

However, intuitively, the match scores should be very close for the two.

After the change, we have the following scores:
  - "[u]nique_[p]tr" with a score of 0.75,
  - "[up]per_bound" with a score of 1.0.

Reviewers: ioeric

Reviewed By: ioeric

Subscribers: MaskRay, jkorous, arphaman, kadircet, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D59300

llvm-svn: 356261
This commit is contained in:
Ilya Biryukov 2019-03-15 14:00:49 +00:00
parent 339daae806
commit 373bee85c2
2 changed files with 28 additions and 12 deletions

View File

@ -71,7 +71,7 @@ static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
// Score field is 15 bits wide, min value is -2^14, we use half of that. // Score field is 15 bits wide, min value is -2^14, we use half of that.
static constexpr int AwfulScore = -(1 << 13); static constexpr int AwfulScore = -(1 << 13);
static bool isAwful(int S) { return S < AwfulScore / 2; } static bool isAwful(int S) { return S < AwfulScore / 2; }
static constexpr int PerfectBonus = 3; // Perfect per-pattern-char score. static constexpr int PerfectBonus = 4; // Perfect per-pattern-char score.
FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern) FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
: PatN(std::min<int>(MaxPat, Pattern.size())), : PatN(std::min<int>(MaxPat, Pattern.size())),
@ -267,24 +267,31 @@ bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
} }
int FuzzyMatcher::skipPenalty(int W, Action Last) const { int FuzzyMatcher::skipPenalty(int W, Action Last) const {
int S = 0; if (W == 0) // Skipping the first character.
return 3;
if (WordRole[W] == Head) // Skipping a segment. if (WordRole[W] == Head) // Skipping a segment.
S += 1; return 1; // We want to keep this lower than a consecutive match bonus.
if (Last == Match) // Non-consecutive match. // Instead of penalizing non-consecutive matches, we give a bonus to a
S += 2; // We'd rather skip a segment than split our match. // consecutive match in matchBonus. This produces a better score distribution
return S; // than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
return 0;
} }
int FuzzyMatcher::matchBonus(int P, int W, Action Last) const { int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
assert(LowPat[P] == LowWord[W]); assert(LowPat[P] == LowWord[W]);
int S = 1; int S = 1;
// Bonus: pattern so far is a (case-insensitive) prefix of the word. bool IsPatSingleCase =
if (P == W) // We can't skip pattern characters, so we must have matched all. (PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper);
++S;
// Bonus: case matches, or a Head in the pattern aligns with one in the word. // Bonus: case matches, or a Head in the pattern aligns with one in the word.
if ((Pat[P] == Word[W] && ((PatTypeSet & 1 << Upper) || P == W)) || // Single-case patterns lack segmentation signals and we assume any character
(PatRole[P] == Head && WordRole[W] == Head)) // can be a head of a segment.
if (Pat[P] == Word[W] ||
(WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head)))
++S; ++S;
// Bonus: a consecutive match. First character match also gets a bonus to
// ensure prefix final match score normalizes to 1.0.
if (W == 0 || Last == Match)
S += 2;
// Penalty: matching inside a segment (and previous char wasn't matched). // Penalty: matching inside a segment (and previous char wasn't matched).
if (WordRole[W] == Tail && P && Last == Miss) if (WordRole[W] == Tail && P && Last == Miss)
S -= 3; S -= 3;

View File

@ -9,6 +9,7 @@
#include "FuzzyMatch.h" #include "FuzzyMatch.h"
#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringExtras.h"
#include "gmock/gmock-matchers.h"
#include "gmock/gmock.h" #include "gmock/gmock.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
@ -247,6 +248,8 @@ TEST(FuzzyMatch, Ranking) {
EXPECT_THAT("foo", ranks("[foo]", "[Foo]")); EXPECT_THAT("foo", ranks("[foo]", "[Foo]"));
EXPECT_THAT("onMes", EXPECT_THAT("onMes",
ranks("[onMes]sage", "[onmes]sage", "[on]This[M]ega[Es]capes")); ranks("[onMes]sage", "[onmes]sage", "[on]This[M]ega[Es]capes"));
EXPECT_THAT("onmes",
ranks("[onmes]sage", "[onMes]sage", "[on]This[M]ega[Es]capes"));
EXPECT_THAT("CC", ranks("[C]amel[C]ase", "[c]amel[C]ase")); EXPECT_THAT("CC", ranks("[C]amel[C]ase", "[c]amel[C]ase"));
EXPECT_THAT("cC", ranks("[c]amel[C]ase", "[C]amel[C]ase")); EXPECT_THAT("cC", ranks("[c]amel[C]ase", "[C]amel[C]ase"));
EXPECT_THAT("p", ranks("[p]", "[p]arse", "[p]osix", "[p]afdsa", "[p]ath")); EXPECT_THAT("p", ranks("[p]", "[p]arse", "[p]osix", "[p]afdsa", "[p]ath"));
@ -270,12 +273,18 @@ TEST(FuzzyMatch, Ranking) {
// Verify some bounds so we know scores fall in the right range. // Verify some bounds so we know scores fall in the right range.
// Testing exact scores is fragile, so we prefer Ranking tests. // Testing exact scores is fragile, so we prefer Ranking tests.
TEST(FuzzyMatch, Scoring) { TEST(FuzzyMatch, Scoring) {
EXPECT_THAT("abs", matches("[a]w[B]xYz[S]", 0.f)); EXPECT_THAT("abs", matches("[a]w[B]xYz[S]", 7.f / 12.f));
EXPECT_THAT("abs", matches("[abs]l", 1.f)); EXPECT_THAT("abs", matches("[abs]l", 1.f));
EXPECT_THAT("abs", matches("[abs]", 2.f)); EXPECT_THAT("abs", matches("[abs]", 2.f));
EXPECT_THAT("Abs", matches("[abs]", 2.f)); EXPECT_THAT("Abs", matches("[abs]", 2.f));
} }
TEST(FuzzyMatch, InitialismAndPrefix) {
// We want these scores to be roughly the same.
EXPECT_THAT("up", matches("[u]nique_[p]tr", 3.f / 4.f));
EXPECT_THAT("up", matches("[up]per_bound", 1.f));
}
// Returns pretty-printed segmentation of Text. // Returns pretty-printed segmentation of Text.
// e.g. std::basic_string --> +-- +---- +----- // e.g. std::basic_string --> +-- +---- +-----
std::string segment(llvm::StringRef Text) { std::string segment(llvm::StringRef Text) {