forked from OSchip/llvm-project
402 lines
15 KiB
C++
402 lines
15 KiB
C++
//===--- FuzzyMatch.cpp - Approximate identifier matching -------*- C++-*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
|
|
// we consider the possible partial match states:
|
|
//
|
|
// u n i q u e _ p t r
|
|
// +---------------------
|
|
// |A . . . . . . . . . .
|
|
// u|
|
|
// |. . . . . . . . . . .
|
|
// _|
|
|
// |. . . . . . . O . . .
|
|
// p|
|
|
// |. . . . . . . . . . B
|
|
//
|
|
// Each dot represents some prefix of the pattern being matched against some
|
|
// prefix of the word.
|
|
// - A is the initial state: '' matched against ''
|
|
// - O is an intermediate state: 'u_' matched against 'unique_'
|
|
// - B is the target state: 'u_p' matched against 'unique_ptr'
|
|
//
|
|
// We aim to find the best path from A->B.
|
|
// - Moving right (consuming a word character)
|
|
// Always legal: not all word characters must match.
|
|
// - Moving diagonally (consuming both a word and pattern character)
|
|
// Legal if the characters match.
|
|
// - Moving down (consuming a pattern character)
//     Never legal: all pattern characters must match something.
|
|
// Characters are matched case-insensitively.
|
|
// The first pattern character may only match the start of a word segment.
|
|
//
|
|
// The scoring is based on heuristics:
|
|
// - when matching a character, apply a bonus or penalty depending on the
|
|
// match quality (does case match, do word segments align, etc)
|
|
// - when skipping a character, apply a penalty if it hurts the match
|
|
// (it starts a word segment, or splits the matched region, etc)
|
|
//
|
|
// These heuristics require the ability to "look backward" one character, to
|
|
// see whether it was matched or not. Therefore the dynamic-programming matrix
|
|
// has an extra dimension (last character matched).
|
|
// Each entry also has an additional flag indicating whether the last-but-one
|
|
// character matched, which is needed to trace back through the scoring table
|
|
// and reconstruct the match.
|
|
//
|
|
// We treat strings as byte-sequences, so only ASCII has first-class support.
|
|
//
|
|
// This algorithm was inspired by VS code's client-side filtering, and aims
|
|
// to be mostly-compatible.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "FuzzyMatch.h"
|
|
#include "llvm/ADT/Optional.h"
|
|
#include "llvm/Support/Format.h"
|
|
|
|
namespace clang {
|
|
namespace clangd {
|
|
|
|
// Out-of-class definitions of the size limits (presumably declared as static
// constexpr members in FuzzyMatch.h). This file passes them to std::min<int>,
// which takes its arguments by const reference and therefore odr-uses them;
// before C++17 that requires a namespace-scope definition like these.
constexpr int FuzzyMatcher::MaxPat;
constexpr int FuzzyMatcher::MaxWord;
|
|
|
|
// ASCII-only lowercasing: 'A'..'Z' map to 'a'..'z', every other byte is
// returned unchanged (no locale involved).
static char lower(char C) {
  if (C < 'A' || C > 'Z')
    return C;
  return C + ('a' - 'A');
}
|
|
// Sentinel score standing in for "negative infinity": it marks unreachable
// states and forbidden solutions.
// The Score field is 15 bits wide, so its minimum is -2^14; using only half of
// that leaves headroom so that adding penalties to the sentinel cannot
// overflow the field.
static constexpr int AwfulScore = -(1 << 13);

// True if S is low enough that it must have been derived from AwfulScore
// rather than from a genuine match.
static bool isAwful(int S) {
  constexpr int Threshold = AwfulScore / 2;
  return S < Threshold;
}

// Highest score a single pattern character can contribute.
static constexpr int PerfectBonus = 4;
|
|
|
|
// Prepares a matcher for Pattern. Only the first MaxPat characters of the
// pattern are kept. Everything that depends on the pattern alone is computed
// here once: the lowercased copy, the score normalisation factor, the score
// cells that do not depend on the word, and the pattern's character roles.
FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
    : PatN(std::min<int>(MaxPat, Pattern.size())),
      ScoreScale(PatN ? float{1} / (PerfectBonus * PatN) : 0), WordN(0) {
  for (int I = 0; I < PatN; ++I) {
    Pat[I] = Pattern[I];
    LowPat[I] = lower(Pat[I]);
  }

  // A pattern prefix cannot be matched against a shorter word prefix, so the
  // cells with W < P are unreachable whatever the word is.
  for (int P = 1; P <= PatN; ++P) {
    for (int W = 0; W < P; ++W) {
      Scores[P][W][Miss] = {AwfulScore, Miss};
      Scores[P][W][Match] = {AwfulScore, Miss};
    }
  }
  // The initial state ('' against '') scores zero, and by definition was not
  // reached by matching a character.
  Scores[0][0][Miss] = {0, Miss};
  Scores[0][0][Match] = {AwfulScore, Miss};

  PatTypeSet = calculateRoles(llvm::StringRef(Pat, PatN),
                              llvm::makeMutableArrayRef(PatRole, PatN));
}
|
|
|
|
// Scores Word against the pattern.
// Returns None when the cheap pre-check in init() fails or when every path
// through the score table is forbidden. An empty pattern always scores 1.
// Otherwise the best raw score is clamped to [0, PerfectBonus * PatN] and
// scaled into [0, 1]; a pattern as long as the word has that result doubled.
llvm::Optional<float> FuzzyMatcher::match(llvm::StringRef Word) {
  WordContainsPattern = init(Word);
  if (!WordContainsPattern)
    return llvm::None;
  if (PatN == 0)
    return 1;

  buildGraph();
  auto Best = std::max(Scores[PatN][WordN][Miss].Score,
                       Scores[PatN][WordN][Match].Score);
  if (isAwful(Best))
    return llvm::None;

  const int Ceiling = PerfectBonus * PatN;
  const int Clamped = std::min(Ceiling, std::max<int>(0, Best));
  float Score = ScoreScale * Clamped;
  // Every pattern character must match something, so equal lengths mean the
  // whole word was matched. The result may still fall short of a perfect 2 if
  // case differs in a significant way.
  if (WordN == PatN)
    Score *= 2;
  return Score;
}
|
|
|
|
// Lookup table giving the CharType of every byte value.
// Each type takes 2 bits, so one table byte describes 4 consecutive chars:
// the top 6 bits of the char select the table byte, the bottom 2 bits select
// the 2-bit slot inside it (slot 0 is the least significant pair).
// e.g. 'q' = 011100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
constexpr static uint8_t CharTypes[] = {
    // 0x00-0x1f: control characters.
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0x20-0x2f: punctuation.
    0xff, 0xff, 0xff, 0xff,
    // 0x30-0x3f: digits count as Lower, followed by more punctuation.
    0x55, 0x55, 0xf5, 0xff,
    // 0x40-0x4f: '@', then 'A'-'O'.
    0xab, 0xaa, 0xaa, 0xaa,
    // 0x50-0x5f: 'P'-'Z', then more punctuation.
    0xaa, 0xaa, 0xea, 0xff,
    // 0x60-0x6f: '`', then 'a'-'o'.
    0x57, 0x55, 0x55, 0x55,
    // 0x70-0x7f: 'p'-'z', punctuation, DEL.
    0x55, 0x55, 0xd5, 0x3f,
    // 0x80-0xff: bytes over 127 (probably UTF-8) are all treated as Lower.
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
    0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
};
|
|
|
|
// A character's Role follows from its own Type and the Types of its two
// neighbours:
//
//   Example  | Chars | Type | Role
//   ---------+-------+------+-----
//   F(o)oBar | Foo   | Ull  | Tail
//   Foo(B)ar | oBa   | lUl  | Head
//   (f)oo    | ^fo   | Ell  | Head
//   H(T)TP   | HTT   | UUU  | Tail
//
// The table is keyed by the 6-bit value (Prev, Curr, Next) and stores a 2-bit
// Role per key, 4 Roles to a byte: (Prev, Curr) picks the byte and Next picks
// the 2-bit slot inside it.
// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
constexpr static uint8_t CharRoles[] = {
    // clang-format off
    // One row per Prev type; columns are Curr = Empty, Lower, Upper, Separ.
    0x00, 0xaa, 0xaa, 0xff, // Prev=Empty: at start, Lower|Upper->Head
    0x00, 0x55, 0xaa, 0xff, // Prev=Lower: in word, Upper->Head;Lower->Tail
    0x00, 0x55, 0x59, 0xff, // Prev=Upper: ditto, but U(U)U->Tail
    0x00, 0xaa, 0xaa, 0xff, // Prev=Separ: after separator, like at start
    // clang-format on
};
|
|
|
|
// Reads entry I from a table of 2-bit values packed four to a byte, with
// entry 0 in the least significant bits of each byte, and casts it to T.
template <typename T> static T packedLookup(const uint8_t *Data, int I) {
  const int ByteIndex = I >> 2;
  const int BitOffset = (I & 3) * 2;
  return static_cast<T>((Data[ByteIndex] >> BitOffset) & 3);
}
|
|
// Classifies every character of Text and writes its role to Roles.
//
// Text and Roles must have the same length (asserted). Roles[I] is looked up
// in CharRoles from the types of Text[I-1], Text[I] and Text[I+1], using Empty
// for the missing neighbour at either end of the string.
// Returns a bitmask with bit (1 << Type) set for every CharType that occurs in
// Text, or 0 if Text is empty.
CharTypeSet calculateRoles(llvm::StringRef Text,
                           llvm::MutableArrayRef<CharRole> Roles) {
  assert(Text.size() == Roles.size());
  if (Text.size() == 0)
    return 0;
  // CharTypes is indexed by byte value (0-255). Plain `char` may be signed, in
  // which case bytes >= 0x80 (e.g. UTF-8) would turn into negative indices and
  // read in front of the table, so convert through `unsigned char` first.
  auto TypeAt = [&](unsigned I) {
    return packedLookup<CharType>(CharTypes,
                                  static_cast<unsigned char>(Text[I]));
  };
  CharType Type = TypeAt(0);
  CharTypeSet TypeSet = 1 << Type;
  // Types holds a sliding window of (Prev, Curr, Next) types.
  // Initial value is (Empty, Empty, type of Text[0]).
  int Types = Type;
  // Rotate slides in the type of the next character, dropping the oldest one.
  auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
  // Text is non-empty here, so size() - 1 cannot wrap around.
  for (unsigned I = 0; I < Text.size() - 1; ++I) {
    // For each character, rotate in the next, and look up the role.
    Type = TypeAt(I + 1);
    TypeSet |= 1 << Type;
    Rotate(Type);
    Roles[I] = packedLookup<CharRole>(CharRoles, Types);
  }
  // For the last character, the "next character" is Empty.
  Rotate(Empty);
  Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types);
  return TypeSet;
}
|
|
|
|
// Sets up the data structures matching Word.
|
|
// Returns false if we can cheaply determine that no match is possible.
|
|
bool FuzzyMatcher::init(llvm::StringRef NewWord) {
|
|
WordN = std::min<int>(MaxWord, NewWord.size());
|
|
if (PatN > WordN)
|
|
return false;
|
|
std::copy(NewWord.begin(), NewWord.begin() + WordN, Word);
|
|
if (PatN == 0)
|
|
return true;
|
|
for (int I = 0; I < WordN; ++I)
|
|
LowWord[I] = lower(Word[I]);
|
|
|
|
// Cheap subsequence check.
|
|
for (int W = 0, P = 0; P != PatN; ++W) {
|
|
if (W == WordN)
|
|
return false;
|
|
if (LowWord[W] == LowPat[P])
|
|
++P;
|
|
}
|
|
|
|
// FIXME: some words are hard to tokenize algorithmically.
|
|
// e.g. vsprintf is V S Print F, and should match [pri] but not [int].
|
|
// We could add a tokenization dictionary for common stdlib names.
|
|
WordTypeSet = calculateRoles(llvm::StringRef(Word, WordN),
|
|
llvm::makeMutableArrayRef(WordRole, WordN));
|
|
return true;
|
|
}
|
|
|
|
// The forwards pass finds the mappings of Pattern onto Word.
|
|
// Score = best score achieved matching Word[..W] against Pat[..P].
|
|
// Unlike other tables, indices range from 0 to N *inclusive*
|
|
// Matched = whether we chose to match Word[W] with Pat[P] or not.
|
|
//
|
|
// Points are mostly assigned to matched characters, with 1 being a good score
|
|
// and 3 being a great one. So we treat the score range as [0, 3 * PatN].
|
|
// This range is not strict: we can apply larger bonuses/penalties, or penalize
|
|
// non-matched characters.
|
|
void FuzzyMatcher::buildGraph() {
|
|
for (int W = 0; W < WordN; ++W) {
|
|
Scores[0][W + 1][Miss] = {Scores[0][W][Miss].Score - skipPenalty(W, Miss),
|
|
Miss};
|
|
Scores[0][W + 1][Match] = {AwfulScore, Miss};
|
|
}
|
|
for (int P = 0; P < PatN; ++P) {
|
|
for (int W = P; W < WordN; ++W) {
|
|
auto &Score = Scores[P + 1][W + 1], &PreMiss = Scores[P + 1][W];
|
|
|
|
auto MatchMissScore = PreMiss[Match].Score;
|
|
auto MissMissScore = PreMiss[Miss].Score;
|
|
if (P < PatN - 1) { // Skipping trailing characters is always free.
|
|
MatchMissScore -= skipPenalty(W, Match);
|
|
MissMissScore -= skipPenalty(W, Miss);
|
|
}
|
|
Score[Miss] = (MatchMissScore > MissMissScore)
|
|
? ScoreInfo{MatchMissScore, Match}
|
|
: ScoreInfo{MissMissScore, Miss};
|
|
|
|
auto &PreMatch = Scores[P][W];
|
|
auto MatchMatchScore =
|
|
allowMatch(P, W, Match)
|
|
? PreMatch[Match].Score + matchBonus(P, W, Match)
|
|
: AwfulScore;
|
|
auto MissMatchScore = allowMatch(P, W, Miss)
|
|
? PreMatch[Miss].Score + matchBonus(P, W, Miss)
|
|
: AwfulScore;
|
|
Score[Match] = (MatchMatchScore > MissMatchScore)
|
|
? ScoreInfo{MatchMatchScore, Match}
|
|
: ScoreInfo{MissMatchScore, Miss};
|
|
}
|
|
}
|
|
}
|
|
|
|
bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const {
|
|
if (LowPat[P] != LowWord[W])
|
|
return false;
|
|
// We require a "strong" match:
|
|
// - for the first pattern character. [foo] !~ "barefoot"
|
|
// - after a gap. [pat] !~ "patnther"
|
|
if (Last == Miss) {
|
|
// We're banning matches outright, so conservatively accept some other cases
|
|
// where our segmentation might be wrong:
|
|
// - allow matching B in ABCDef (but not in NDEBUG)
|
|
// - we'd like to accept print in sprintf, but too many false positives
|
|
if (WordRole[W] == Tail &&
|
|
(Word[W] == LowWord[W] || !(WordTypeSet & 1 << Lower)))
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
int FuzzyMatcher::skipPenalty(int W, Action Last) const {
|
|
if (W == 0) // Skipping the first character.
|
|
return 3;
|
|
if (WordRole[W] == Head) // Skipping a segment.
|
|
return 1; // We want to keep this lower than a consecutive match bonus.
|
|
// Instead of penalizing non-consecutive matches, we give a bonus to a
|
|
// consecutive match in matchBonus. This produces a better score distribution
|
|
// than penalties in case of small patterns, e.g. 'up' for 'unique_ptr'.
|
|
return 0;
|
|
}
|
|
|
|
int FuzzyMatcher::matchBonus(int P, int W, Action Last) const {
|
|
assert(LowPat[P] == LowWord[W]);
|
|
int S = 1;
|
|
bool IsPatSingleCase =
|
|
(PatTypeSet == 1 << Lower) || (PatTypeSet == 1 << Upper);
|
|
// Bonus: case matches, or a Head in the pattern aligns with one in the word.
|
|
// Single-case patterns lack segmentation signals and we assume any character
|
|
// can be a head of a segment.
|
|
if (Pat[P] == Word[W] ||
|
|
(WordRole[W] == Head && (IsPatSingleCase || PatRole[P] == Head)))
|
|
++S;
|
|
// Bonus: a consecutive match. First character match also gets a bonus to
|
|
// ensure prefix final match score normalizes to 1.0.
|
|
if (W == 0 || Last == Match)
|
|
S += 2;
|
|
// Penalty: matching inside a segment (and previous char wasn't matched).
|
|
if (WordRole[W] == Tail && P && Last == Miss)
|
|
S -= 3;
|
|
// Penalty: a Head in the pattern matches in the middle of a word segment.
|
|
if (PatRole[P] == Head && WordRole[W] == Tail)
|
|
--S;
|
|
// Penalty: matching the first pattern character in the middle of a segment.
|
|
if (P == 0 && WordRole[W] == Tail)
|
|
S -= 4;
|
|
assert(S <= PerfectBonus);
|
|
return S;
|
|
}
|
|
|
|
// Writes a human-readable explanation of the most recent match() to OS and
// returns the word annotated with the matched ranges in brackets
// (e.g. "[u]nique[_p]tr").
//
// The dump contains the per-character alignment and score, the segmentation
// of word and pattern, and the full scoring table. For an empty pattern the
// word is returned unannotated; for an empty word or a failed pre-check the
// result is empty.
llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
  llvm::SmallString<256> Result;
  OS << "=== Match \"" << llvm::StringRef(Word, WordN) << "\" against ["
     << llvm::StringRef(Pat, PatN) << "] ===\n";
  if (PatN == 0) {
    OS << "Pattern is empty: perfect match.\n";
    return Result = llvm::StringRef(Word, WordN);
  }
  if (WordN == 0) {
    OS << "Word is empty: no match.\n";
    return Result;
  }
  if (!WordContainsPattern) {
    OS << "Substring check failed.\n";
    return Result;
  } else if (isAwful(std::max(Scores[PatN][WordN][Match].Score,
                              Scores[PatN][WordN][Miss].Score))) {
    // Not an early exit: the tables below are still printed.
    OS << "Substring check passed, but all matches are forbidden\n";
  }
  if (!(PatTypeSet & 1 << Upper))
    OS << "Lowercase query, so scoring ignores case\n";

  // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
  // The Score table has cumulative scores, subtracting along this path gives
  // us the per-letter scores.
  Action Last =
      (Scores[PatN][WordN][Match].Score > Scores[PatN][WordN][Miss].Score)
          ? Match
          : Miss;
  int S[MaxWord];    // Score contributed by each word character.
  Action A[MaxWord]; // Whether each word character was matched or skipped.
  for (int W = WordN - 1, P = PatN - 1; W >= 0; --W) {
    A[W] = Last;
    const auto &Cell = Scores[P + 1][W + 1][Last];
    // A matched character consumed a pattern character too.
    if (Last == Match)
      --P;
    const auto &Prev = Scores[P + 1][W][Cell.Prev];
    S[W] = Cell.Score - Prev.Score;
    Last = Cell.Prev;
  }
  // Build the annotated word: open a bracket where a matched run starts and
  // close it where the run ends.
  for (int I = 0; I < WordN; ++I) {
    if (A[I] == Match && (I == 0 || A[I - 1] == Miss))
      Result.push_back('[');
    if (A[I] == Miss && I > 0 && A[I - 1] == Match)
      Result.push_back(']');
    Result.push_back(Word[I]);
  }
  if (A[WordN - 1] == Match)
    Result.push_back(']');

  // Alignment: word, matched pattern characters, per-character scores.
  // Every column is 3 characters wide.
  for (char C : llvm::StringRef(Word, WordN))
    OS << " " << C << " ";
  OS << "\n";
  for (int I = 0, J = 0; I < WordN; I++)
    OS << " " << (A[I] == Match ? Pat[J++] : ' ') << " ";
  OS << "\n";
  for (int I = 0; I < WordN; I++)
    OS << llvm::format("%2d ", S[I]);
  OS << "\n";

  OS << "\nSegmentation:";
  OS << "\n'" << llvm::StringRef(Word, WordN) << "'\n ";
  for (int I = 0; I < WordN; ++I)
    OS << "?-+ "[static_cast<int>(WordRole[I])];
  OS << "\n[" << llvm::StringRef(Pat, PatN) << "]\n ";
  for (int I = 0; I < PatN; ++I)
    OS << "?-+ "[static_cast<int>(PatRole[I])];
  OS << "\n";

  // Scoring table. Each row starts with a 2-character label and is followed
  // by WordN + 1 cells of 4 characters ("%3d%c"); the header, the rule and
  // the blank cells for unreachable states are padded to the same grid.
  OS << "\nScoring table (last-Miss, last-Match):\n";
  OS << " |    ";
  for (char C : llvm::StringRef(Word, WordN))
    OS << "  " << C << " ";
  OS << "\n";
  OS << "-+----" << std::string(WordN * 4, '-') << "\n";
  for (int I = 0; I <= PatN; ++I) {
    for (Action RowAction : {Miss, Match}) {
      // Label only the first of the two rows belonging to a pattern char.
      OS << ((I && RowAction == Miss) ? Pat[I - 1] : ' ') << "|";
      for (int J = 0; J <= WordN; ++J) {
        if (!isAwful(Scores[I][J][RowAction].Score))
          OS << llvm::format("%3d%c", Scores[I][J][RowAction].Score,
                             Scores[I][J][RowAction].Prev == Match ? '*' : ' ');
        else
          OS << "    ";
      }
      OS << "\n";
    }
  }

  return Result;
}
|
|
|
|
} // namespace clangd
|
|
} // namespace clang
|