forked from OSchip/llvm-project
[clangd] FuzzyMatch exposes an API for its word segmentation. NFC
Summary: This is intended to be used for indexing, e.g. in D49417 Reviewers: ioeric, omtcyfz Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, cfe-commits Differential Revision: https://reviews.llvm.org/D49540 llvm-svn: 337527
This commit is contained in:
parent
a55fc71614
commit
f7d1805171
|
@ -87,8 +87,8 @@ FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
|
|||
for (int W = 0; W < P; ++W)
|
||||
for (Action A : {Miss, Match})
|
||||
Scores[P][W][A] = {AwfulScore, Miss};
|
||||
if (PatN > 0)
|
||||
calculateRoles(Pat, PatRole, PatTypeSet, PatN);
|
||||
PatTypeSet =
|
||||
calculateRoles(StringRef(Pat, PatN), makeMutableArrayRef(PatRole, PatN));
|
||||
}
|
||||
|
||||
Optional<float> FuzzyMatcher::match(StringRef Word) {
|
||||
|
@ -110,25 +110,6 @@ Optional<float> FuzzyMatcher::match(StringRef Word) {
|
|||
return Score;
|
||||
}
|
||||
|
||||
// Segmentation of words and patterns.
|
||||
// A name like "fooBar_baz" consists of several parts foo, bar, baz.
|
||||
// Aligning segmentation of word and pattern improves the fuzzy-match.
|
||||
// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
|
||||
//
|
||||
// First we classify each character into types (uppercase, lowercase, etc).
|
||||
// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
|
||||
|
||||
// We only distinguish the types of characters that affect segmentation.
|
||||
// It's not obvious how to segment digits, we treat them as lowercase letters.
|
||||
// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
|
||||
// This means we require exact (case-sensitive) match.
|
||||
enum FuzzyMatcher::CharType : unsigned char {
|
||||
Empty = 0, // Before-the-start and after-the-end (and control chars).
|
||||
Lower = 1, // Lowercase letters, digits, and non-ASCII bytes.
|
||||
Upper = 2, // Uppercase letters.
|
||||
Punctuation = 3, // ASCII punctuation (including Space)
|
||||
};
|
||||
|
||||
// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
|
||||
// The top 6 bits of the char select the byte, the bottom 2 select the offset.
|
||||
// e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
|
||||
|
@ -147,17 +128,6 @@ constexpr static uint8_t CharTypes[] = {
|
|||
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
|
||||
};
|
||||
|
||||
// Each character's Role is the Head or Tail of a segment, or a Separator.
|
||||
// e.g. XMLHttpRequest_Async
|
||||
// +--+---+------ +----
|
||||
// ^Head ^Tail ^Separator
|
||||
enum FuzzyMatcher::CharRole : unsigned char {
|
||||
Unknown = 0, // Stray control characters or impossible states.
|
||||
Tail = 1, // Part of a word segment, but not the first character.
|
||||
Head = 2, // The first character of a word segment.
|
||||
Separator = 3, // Punctuation characters that separate word segments.
|
||||
};
|
||||
|
||||
// The Role can be determined from the Type of a character and its neighbors:
|
||||
//
|
||||
// Example | Chars | Type | Role
|
||||
|
@ -183,26 +153,28 @@ constexpr static uint8_t CharRoles[] = {
|
|||
template <typename T> static T packedLookup(const uint8_t *Data, int I) {
|
||||
return static_cast<T>((Data[I >> 2] >> ((I & 3) * 2)) & 3);
|
||||
}
|
||||
void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int &TypeSet,
|
||||
int N) {
|
||||
assert(N > 0);
|
||||
CharTypeSet calculateRoles(StringRef Text, MutableArrayRef<CharRole> Roles) {
|
||||
assert(Text.size() == Roles.size());
|
||||
if (Text.size() == 0)
|
||||
return 0;
|
||||
CharType Type = packedLookup<CharType>(CharTypes, Text[0]);
|
||||
TypeSet = 1 << Type;
|
||||
CharTypeSet TypeSet = 1 << Type;
|
||||
// Types holds a sliding window of (Prev, Curr, Next) types.
|
||||
// Initial value is (Empty, Empty, type of Text[0]).
|
||||
int Types = Type;
|
||||
// Rotate slides in the type of the next character.
|
||||
auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
|
||||
for (int I = 0; I < N - 1; ++I) {
|
||||
for (unsigned I = 0; I < Text.size() - 1; ++I) {
|
||||
// For each character, rotate in the next, and look up the role.
|
||||
Type = packedLookup<CharType>(CharTypes, Text[I + 1]);
|
||||
TypeSet |= 1 << Type;
|
||||
Rotate(Type);
|
||||
*Out++ = packedLookup<CharRole>(CharRoles, Types);
|
||||
Roles[I] = packedLookup<CharRole>(CharRoles, Types);
|
||||
}
|
||||
// For the last character, the "next character" is Empty.
|
||||
Rotate(Empty);
|
||||
*Out++ = packedLookup<CharRole>(CharRoles, Types);
|
||||
Roles[Text.size() - 1] = packedLookup<CharRole>(CharRoles, Types);
|
||||
return TypeSet;
|
||||
}
|
||||
|
||||
// Sets up the data structures matching Word.
|
||||
|
@ -228,7 +200,8 @@ bool FuzzyMatcher::init(StringRef NewWord) {
|
|||
// FIXME: some words are hard to tokenize algorithmically.
|
||||
// e.g. vsprintf is V S Print F, and should match [pri] but not [int].
|
||||
// We could add a tokenization dictionary for common stdlib names.
|
||||
calculateRoles(Word, WordRole, WordTypeSet, WordN);
|
||||
WordTypeSet = calculateRoles(StringRef(Word, WordN),
|
||||
makeMutableArrayRef(WordRole, WordN));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
|
||||
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
|
||||
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/SmallString.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
|
@ -24,6 +25,48 @@
|
|||
namespace clang {
|
||||
namespace clangd {
|
||||
|
||||
// Utilities for word segmentation.
|
||||
// FuzzyMatcher already incorporates this logic, so most users don't need this.
|
||||
//
|
||||
// A name like "fooBar_baz" consists of several parts foo, bar, baz.
|
||||
// Aligning segmentation of word and pattern improves the fuzzy-match.
|
||||
// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"
|
||||
//
|
||||
// First we classify each character into types (uppercase, lowercase, etc).
|
||||
// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.
|
||||
|
||||
// We distinguish the types of characters that affect segmentation.
|
||||
// It's not obvious how to segment digits, we treat them as lowercase letters.
|
||||
// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
|
||||
// This means we require exact (case-sensitive) match for those characters.
|
||||
enum CharType : unsigned char {
|
||||
Empty = 0, // Before-the-start and after-the-end (and control chars).
|
||||
Lower = 1, // Lowercase letters, digits, and non-ASCII bytes.
|
||||
Upper = 2, // Uppercase letters.
|
||||
Punctuation = 3, // ASCII punctuation (including Space)
|
||||
};
|
||||
// A CharTypeSet is a bitfield representing all the character types in a word.
|
||||
// Its bits are 1<<Empty, 1<<Lower, etc.
|
||||
using CharTypeSet = unsigned char;
|
||||
|
||||
// Each character's Role is the Head or Tail of a segment, or a Separator.
|
||||
// e.g. XMLHttpRequest_Async
|
||||
// +--+---+------ +----
|
||||
// ^Head ^Tail ^Separator
|
||||
enum CharRole : unsigned char {
|
||||
Unknown = 0, // Stray control characters or impossible states.
|
||||
Tail = 1, // Part of a word segment, but not the first character.
|
||||
Head = 2, // The first character of a word segment.
|
||||
Separator = 3, // Punctuation characters that separate word segments.
|
||||
};
|
||||
|
||||
// Compute segmentation of Text.
|
||||
// Character roles are stored in Roles (Roles.size() must equal Text.size()).
|
||||
// The set of character types encountered is returned, this may inform
|
||||
// heuristics for dealing with poorly-segmented identifiers like "strndup".
|
||||
CharTypeSet calculateRoles(llvm::StringRef Text,
|
||||
llvm::MutableArrayRef<CharRole> Roles);
|
||||
|
||||
// A matcher capable of matching and scoring strings against a single pattern.
|
||||
// It's optimized for matching against many strings - match() does not allocate.
|
||||
class FuzzyMatcher {
|
||||
|
@ -48,8 +91,6 @@ public:
|
|||
private:
|
||||
// We truncate the pattern and the word to bound the cost of matching.
|
||||
constexpr static int MaxPat = 63, MaxWord = 127;
|
||||
enum CharRole : unsigned char; // For segmentation.
|
||||
enum CharType : unsigned char; // For segmentation.
|
||||
// Action describes how a word character was matched to the pattern.
|
||||
// It should be an enum, but this causes bitfield problems:
|
||||
// - for MSVC the enum type must be explicitly unsigned for correctness
|
||||
|
@ -60,7 +101,6 @@ private:
|
|||
|
||||
bool init(llvm::StringRef Word);
|
||||
void buildGraph();
|
||||
void calculateRoles(const char *Text, CharRole *Out, int &Types, int N);
|
||||
bool allowMatch(int P, int W, Action Last) const;
|
||||
int skipPenalty(int W, Action Last) const;
|
||||
int matchBonus(int P, int W, Action Last) const;
|
||||
|
@ -70,7 +110,7 @@ private:
|
|||
int PatN; // Length
|
||||
char LowPat[MaxPat]; // Pattern in lowercase
|
||||
CharRole PatRole[MaxPat]; // Pattern segmentation info
|
||||
int PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters
|
||||
CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters
|
||||
float ScoreScale; // Normalizes scores for the pattern length.
|
||||
|
||||
// Word data is initialized on each call to match(), mostly by init().
|
||||
|
@ -78,7 +118,7 @@ private:
|
|||
int WordN; // Length
|
||||
char LowWord[MaxWord]; // Word in lowercase
|
||||
CharRole WordRole[MaxWord]; // Word segmentation info
|
||||
int WordTypeSet; // Bitmask of 1<<CharType for all Word characters
|
||||
CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters
|
||||
bool WordContainsPattern; // Simple substring check
|
||||
|
||||
// Cumulative best-match score table.
|
||||
|
|
|
@ -273,6 +273,29 @@ TEST(FuzzyMatch, Scoring) {
|
|||
EXPECT_THAT("Abs", matches("[abs]", 2.f));
|
||||
}
|
||||
|
||||
// Returns pretty-printed segmentation of Text.
|
||||
// e.g. std::basic_string --> +-- +---- +-----
|
||||
std::string segment(StringRef Text) {
|
||||
std::vector<CharRole> Roles(Text.size());
|
||||
calculateRoles(Text, Roles);
|
||||
std::string Printed;
|
||||
for (unsigned I = 0; I < Text.size(); ++I)
|
||||
Printed.push_back("?-+ "[static_cast<unsigned>(Roles[I])]);
|
||||
return Printed;
|
||||
}
|
||||
|
||||
// this is a no-op hack so clang-format will vertically align our testcases.
|
||||
StringRef returns(StringRef Text) { return Text; }
|
||||
|
||||
TEST(FuzzyMatch, Segmentation) {
|
||||
EXPECT_THAT(segment("std::basic_string"), //
|
||||
returns("+-- +---- +-----"));
|
||||
EXPECT_THAT(segment("XMLHttpRequest"), //
|
||||
returns("+--+---+------"));
|
||||
EXPECT_THAT(segment("t3h PeNgU1N oF d00m!!!!!!!!"), //
|
||||
returns("+-- +-+-+-+ ++ +--- "));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace clangd
|
||||
} // namespace clang
|
||||
|
|
Loading…
Reference in New Issue