llvm-project/clang-tools-extra/clangd/FuzzyMatch.h

//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements fuzzy-matching of strings against identifiers.
// It indicates both the existence and quality of a match:
// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"

namespace clang {
namespace clangd {

// Utilities for word segmentation.
// FuzzyMatcher already incorporates this logic, so most users don't need this.
//
// A name like "fooBar_baz" consists of several parts: foo, bar, baz.
// Aligning segmentation of word and pattern improves the fuzzy-match.
// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation".
//
// First we classify each character into types (uppercase, lowercase, etc).
// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.

// The types of characters that we distinguish because they affect
// segmentation.
// It's not obvious how to segment digits, so we treat them as lowercase
// letters.
// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.
// This means we require an exact (case-sensitive) match for those characters.
//
// The numeric values are significant: a CharTypeSet stores one bit per type,
// at position 1 << value.
enum CharType : unsigned char {
  Empty = 0,       // Before-the-start and after-the-end (and control chars).
  Lower = 1,       // Lowercase letters, digits, and non-ASCII bytes.
  Upper = 2,       // Uppercase letters.
  Punctuation = 3, // ASCII punctuation (including space).
};
// A CharTypeSet is a bitmask representing all the character types in a word.
// Its bits are 1<<Empty, 1<<Lower, 1<<Upper and 1<<Punctuation, so only the
// low four bits are ever used.
using CharTypeSet = unsigned char;

// Each character's Role within a name: it is either the Head (first
// character) of a segment, part of the segment's Tail, or a Separator between
// segments.
// e.g. XMLHttpRequest_Async
//      +--+---+------ +----
//      ^Head   ^Tail ^Separator
// ('+' marks a Head, '-' marks a Tail, and the '_' is a Separator.)
enum CharRole : unsigned char {
  Unknown = 0,   // Stray control characters or impossible states.
  Tail = 1,      // Part of a word segment, but not the first character.
  Head = 2,      // The first character of a word segment.
  Separator = 3, // Punctuation characters that separate word segments.
};

// Compute the segmentation of Text.
//
// Text:  the name to segment.
// Roles: output buffer; the role of each character of Text is stored here.
//        Roles.size() must equal Text.size().
//
// Returns the set of character types encountered. This may inform heuristics
// for dealing with poorly-segmented identifiers like "strndup".
CharTypeSet calculateRoles(llvm::StringRef Text,
                           llvm::MutableArrayRef<CharRole> Roles);

// A matcher capable of matching and scoring strings against a single pattern.
// It's optimized for matching against many strings - match() does not allocate.
//
// The pattern data is set up once by the constructor. match() is non-const:
// each call overwrites the per-word state and the score table held in the
// object.
class FuzzyMatcher {
public:
  // Builds a matcher for Pattern.
  // Characters beyond MaxPat (63) are ignored.
  // NOTE(review): this constructor is not `explicit`, so an llvm::StringRef
  // converts implicitly to a FuzzyMatcher. Left unchanged for compatibility.
  FuzzyMatcher(llvm::StringRef Pattern);

  // If Word matches the pattern, returns a score indicating the quality of the
  // match.
  // Scores usually fall in a [0,1] range, with 1 being a very good score.
  // "Super" scores in (1,2] are possible if the pattern is the full word.
  // Characters beyond MaxWord (127) are ignored.
  // NOTE(review): presumably returns an empty Optional when Word does not
  // match; confirm against the definition in the .cpp.
  llvm::Optional<float> match(llvm::StringRef Word);

  // The stored (possibly truncated) pattern: the first PatN bytes of Pat.
  llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); }
  // True if the stored pattern has length zero.
  bool empty() const { return PatN == 0; }

  // Dump internal state from the last match() to the stream, for debugging.
  // Returns an annotated string with [] around matched characters. In the
  // example it is the word, not the pattern, that is annotated:
  //   [u_p] + "unique_ptr" --> "[u]nique[_p]tr"
  llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;

private:
  // We truncate the pattern and the word to bound the cost of matching.
  // These limits also size the fixed arrays and the score table below.
  constexpr static int MaxPat = 63, MaxWord = 127;
  // Action describes how a word character was matched to the pattern.
  // It should be an enum, but this causes bitfield problems:
  //   - for MSVC the enum type must be explicitly unsigned for correctness
  //   - GCC 4.8 complains not all values fit if the type is unsigned
  using Action = bool;
  constexpr static Action Miss = false; // Word character was skipped.
  constexpr static Action Match = true; // Matched against a pattern character.

  // Helpers used by match(); all are defined out of line.
  // Fills in most of the per-word state (see "Word data" below).
  // NOTE(review): the meaning of the bool result is not visible in this
  // header; confirm against the .cpp.
  bool init(llvm::StringRef Word);
  // Repopulates the non-boundary part of Scores for the current word.
  void buildGraph();
  // Scoring predicates/weights.
  // NOTE(review): P and W are presumably indices into the pattern and the
  // word, and Last the Action taken for the preceding word character; this is
  // inferred from the names only - confirm against the .cpp.
  bool allowMatch(int P, int W, Action Last) const;
  int skipPenalty(int W, Action Last) const;
  int matchBonus(int P, int W, Action Last) const;

  // Pattern data is initialized by the constructor, then constant.
  char Pat[MaxPat];         // Pattern data (PatN bytes are meaningful)
  int PatN;                 // Length
  char LowPat[MaxPat];      // Pattern in lowercase
  CharRole PatRole[MaxPat]; // Pattern segmentation info
  CharTypeSet PatTypeSet;   // Bitmask of 1<<CharType for all Pattern characters
  float ScoreScale;         // Normalizes scores for the pattern length.

  // Word data is initialized on each call to match(), mostly by init().
  char Word[MaxWord];         // Word data
  int WordN;                  // Length
  char LowWord[MaxWord];      // Word in lowercase
  CharRole WordRole[MaxWord]; // Word segmentation info
  CharTypeSet WordTypeSet;    // Bitmask of 1<<CharType for all Word characters
  bool WordContainsPattern;   // Simple substring check

  // Cumulative best-match score table.
  // Boundary conditions are filled in by the constructor.
  // The rest is repopulated for each match(), by buildGraph().
  struct ScoreInfo {
    // 15-bit signed field: representable range is [-16384, 16383].
    signed int Score : 15;
    // NOTE(review): presumably the preceding Action on the best-scoring path
    // into this cell; confirm against buildGraph().
    Action Prev : 1;
  };
  // Dimensions: (MaxPat + 1) x (MaxWord + 1) x 2 Action values, i.e.
  // 64 * 128 * 2 = 16384 entries, stored inline in every FuzzyMatcher.
  ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2];
};

} // namespace clangd
} // namespace clang

#endif
[clangd] Fuzzy match scorer Summary: This will be used for rescoring code completion results based on partial identifiers. Short-term use: - we want to limit the number of code completion results returned to improve performance of global completion. The scorer will be used to rerank the results to return when the user has applied a filter. Long-term use case: - ranking of completion results from in-memory index - merging of completion results from multiple sources (merging usually works best when done at the component-score level, rescoring the fuzzy-match quality avoids different backends needing to have comparable scores) Reviewers: ilya-biryukov Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D40060 llvm-svn: 319557 2017-12-02 01:08:02 +08:00			`//===--- FuzzyMatch.h - Approximate identifier matching ---------- C++--===//`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===----------------------------------------------------------------------===//`
			`//`
			`// This file implements fuzzy-matching of strings against identifiers.`
			`// It indicates both the existence and quality of a match:`
			`// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H`
			`#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H`

[clangd] FuzzyMatch exposes an API for its word segmentation. NFC Summary: This is intended to be used for indexing, e.g. in D49417 Reviewers: ioeric, omtcyfz Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, cfe-commits Differential Revision: https://reviews.llvm.org/D49540 llvm-svn: 337527 2018-07-20 16:01:37 +08:00			`#include "llvm/ADT/ArrayRef.h"`
[clangd] Fuzzy match scorer Summary: This will be used for rescoring code completion results based on partial identifiers. Short-term use: - we want to limit the number of code completion results returned to improve performance of global completion. The scorer will be used to rerank the results to return when the user has applied a filter. Long-term use case: - ranking of completion results from in-memory index - merging of completion results from multiple sources (merging usually works best when done at the component-score level, rescoring the fuzzy-match quality avoids different backends needing to have comparable scores) Reviewers: ilya-biryukov Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D40060 llvm-svn: 319557 2017-12-02 01:08:02 +08:00			`#include "llvm/ADT/Optional.h"`
			`#include "llvm/ADT/SmallString.h"`
			`#include "llvm/ADT/StringRef.h"`
			`#include "llvm/Support/raw_ostream.h"`

			`namespace clang {`
			`namespace clangd {`

[clangd] FuzzyMatch exposes an API for its word segmentation. NFC Summary: This is intended to be used for indexing, e.g. in D49417 Reviewers: ioeric, omtcyfz Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, cfe-commits Differential Revision: https://reviews.llvm.org/D49540 llvm-svn: 337527 2018-07-20 16:01:37 +08:00			`// Utilities for word segmentation.`
			`// FuzzyMatcher already incorporates this logic, so most users don't need this.`
			`//`
			`// A name like "fooBar_baz" consists of several parts foo, bar, baz.`
			`// Aligning segmentation of word and pattern improves the fuzzy-match.`
			`// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation"`
			`//`
			`// First we classify each character into types (uppercase, lowercase, etc).`
			`// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.`

			`// We distinguish the types of characters that affect segmentation.`
			`// It's not obvious how to segment digits, we treat them as lowercase letters.`
			`// As we don't decode UTF-8, we treat bytes over 127 as lowercase too.`
			`// This means we require exact (case-sensitive) match for those characters.`
			`enum CharType : unsigned char {`
			`Empty = 0, // Before-the-start and after-the-end (and control chars).`
			`Lower = 1, // Lowercase letters, digits, and non-ASCII bytes.`
			`Upper = 2, // Uppercase letters.`
			`Punctuation = 3, // ASCII punctuation (including Space)`
			`};`
			`// A CharTypeSet is a bitfield representing all the character types in a word.`
			`// Its bits are 1<<Empty, 1<<Lower, etc.`
			`using CharTypeSet = unsigned char;`

			`// Each character's Role is the Head or Tail of a segment, or a Separator.`
			`// e.g. XMLHttpRequest_Async`
			`// +--+---+------ +----`
			`// ^Head ^Tail ^Separator`
			`enum CharRole : unsigned char {`
			`Unknown = 0, // Stray control characters or impossible states.`
			`Tail = 1, // Part of a word segment, but not the first character.`
			`Head = 2, // The first character of a word segment.`
			`Separator = 3, // Punctuation characters that separate word segments.`
			`};`

			`// Compute segmentation of Text.`
			`// Character roles are stored in Roles (Roles.size() must equal Text.size()).`
			`// The set of character types encountered is returned, this may inform`
			`// heuristics for dealing with poorly-segmented identifiers like "strndup".`
			`CharTypeSet calculateRoles(llvm::StringRef Text,`
			`llvm::MutableArrayRef<CharRole> Roles);`

[clangd] Fuzzy match scorer Summary: This will be used for rescoring code completion results based on partial identifiers. Short-term use: - we want to limit the number of code completion results returned to improve performance of global completion. The scorer will be used to rerank the results to return when the user has applied a filter. Long-term use case: - ranking of completion results from in-memory index - merging of completion results from multiple sources (merging usually works best when done at the component-score level, rescoring the fuzzy-match quality avoids different backends needing to have comparable scores) Reviewers: ilya-biryukov Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D40060 llvm-svn: 319557 2017-12-02 01:08:02 +08:00			`// A matcher capable of matching and scoring strings against a single pattern.`
			`// It's optimized for matching against many strings - match() does not allocate.`
			`class FuzzyMatcher {`
			`public:`
			`// Characters beyond MaxPat are ignored.`
			`FuzzyMatcher(llvm::StringRef Pattern);`

[clangd] Boost fuzzy match score by 2x (so a maximum of 2) when the query is the full identifier name. Summary: Fix a couple of bugs in tests an in Quality to keep tests passing. Reviewers: ioeric Subscribers: ilya-biryukov, MaskRay, jkorous, cfe-commits Differential Revision: https://reviews.llvm.org/D47815 llvm-svn: 334089 2018-06-06 20:38:37 +08:00			`// If Word matches the pattern, return a score indicating the quality match.`
			`// Scores usually fall in a [0,1] range, with 1 being a very good score.`
			`// "Super" scores in (1,2] are possible if the pattern is the full word.`
[clangd] Fuzzy match scorer Summary: This will be used for rescoring code completion results based on partial identifiers. Short-term use: - we want to limit the number of code completion results returned to improve performance of global completion. The scorer will be used to rerank the results to return when the user has applied a filter. Long-term use case: - ranking of completion results from in-memory index - merging of completion results from multiple sources (merging usually works best when done at the component-score level, rescoring the fuzzy-match quality avoids different backends needing to have comparable scores) Reviewers: ilya-biryukov Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D40060 llvm-svn: 319557 2017-12-02 01:08:02 +08:00			`// Characters beyond MaxWord are ignored.`
			`llvm::Optional<float> match(llvm::StringRef Word);`

[clangd] Merge index-provided completions with those from Sema. Summary: - we match on USR, and do a field-by-field merge if both have results - scoring is post-merge, with both sets of information available (for now, sema priority is used if available, static score for index results) - limit is applied to the complete result set (previously index ignored limit) - CompletionItem is only produces for the returned results - If the user doesn't type a scope, we send the global scope for completion (we can improve this after D42073) Reviewers: ioeric Subscribers: klimek, ilya-biryukov, mgrang, cfe-commits Differential Revision: https://reviews.llvm.org/D42181 llvm-svn: 322945 2018-01-19 22:34:02 +08:00			`llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); }`
			`bool empty() const { return PatN == 0; }`
[clangd] Incorporate fuzzy-match into result rankings. Summary: The scoring function is fuzzy-match-quality * existing quality score. Reviewers: ioeric Subscribers: klimek, cfe-commits, ilya-biryukov Differential Revision: https://reviews.llvm.org/D40780 llvm-svn: 322377 2018-01-13 00:16:09 +08:00
[clangd] Fuzzy match scorer Summary: This will be used for rescoring code completion results based on partial identifiers. Short-term use: - we want to limit the number of code completion results returned to improve performance of global completion. The scorer will be used to rerank the results to return when the user has applied a filter. Long-term use case: - ranking of completion results from in-memory index - merging of completion results from multiple sources (merging usually works best when done at the component-score level, rescoring the fuzzy-match quality avoids different backends needing to have comparable scores) Reviewers: ilya-biryukov Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D40060 llvm-svn: 319557 2017-12-02 01:08:02 +08:00			`// Dump internal state from the last match() to the stream, for debugging.`
			`// Returns the pattern with [] around matched characters, e.g.`
			`// [u_p] + "unique_ptr" --> "[u]nique[_p]tr"`
			`llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const;`

			`private:`
			`// We truncate the pattern and the word to bound the cost of matching.`
			`constexpr static int MaxPat = 63, MaxWord = 127;`
[clangd] FuzzyMatch: forbid tail-tail matches after a miss: [pat] !~ "panther" Summary: This is a small code change but vastly reduces noise in code completion results. The intent of allowing this was to let [sc] ~ "strncpy" and [strcpy] ~ "strncpy" however the benefits for unsegmented names aren't IMO worth the costs. Test cases should be representative of the changes here. Reviewers: ilya-biryukov Subscribers: ioeric, MaskRay, jkorous, cfe-commits Differential Revision: https://reviews.llvm.org/D47950 llvm-svn: 334712 2018-06-14 21:50:30 +08:00			`// Action describes how a word character was matched to the pattern.`
			`// It should be an enum, but this causes bitfield problems:`
[clangd] Avoid enum in bitfields, can't satisfy old GCC and new MSVC llvm-svn: 319608 2017-12-02 12:15:55 +08:00			`// - for MSVC the enum type must be explicitly unsigned for correctness`
			`// - GCC 4.8 complains not all values fit if the type is unsigned`
			`using Action = bool;`
[clangd] FuzzyMatch: forbid tail-tail matches after a miss: [pat] !~ "panther" Summary: This is a small code change but vastly reduces noise in code completion results. The intent of allowing this was to let [sc] ~ "strncpy" and [strcpy] ~ "strncpy" however the benefits for unsegmented names aren't IMO worth the costs. Test cases should be representative of the changes here. Reviewers: ilya-biryukov Subscribers: ioeric, MaskRay, jkorous, cfe-commits Differential Revision: https://reviews.llvm.org/D47950 llvm-svn: 334712 2018-06-14 21:50:30 +08:00			`constexpr static Action Miss = false; // Word character was skipped.`
			`constexpr static Action Match = true; // Matched against a pattern character.`
[clangd] Fuzzy match scorer Summary: This will be used for rescoring code completion results based on partial identifiers. Short-term use: - we want to limit the number of code completion results returned to improve performance of global completion. The scorer will be used to rerank the results to return when the user has applied a filter. Long-term use case: - ranking of completion results from in-memory index - merging of completion results from multiple sources (merging usually works best when done at the component-score level, rescoring the fuzzy-match quality avoids different backends needing to have comparable scores) Reviewers: ilya-biryukov Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D40060 llvm-svn: 319557 2017-12-02 01:08:02 +08:00
			`bool init(llvm::StringRef Word);`
			`void buildGraph();`
[clangd] FuzzyMatch: forbid tail-tail matches after a miss: [pat] !~ "panther" Summary: This is a small code change but vastly reduces noise in code completion results. The intent of allowing this was to let [sc] ~ "strncpy" and [strcpy] ~ "strncpy" however the benefits for unsegmented names aren't IMO worth the costs. Test cases should be representative of the changes here. Reviewers: ilya-biryukov Subscribers: ioeric, MaskRay, jkorous, cfe-commits Differential Revision: https://reviews.llvm.org/D47950 llvm-svn: 334712 2018-06-14 21:50:30 +08:00			`bool allowMatch(int P, int W, Action Last) const;`
[clangd] Fix unintentionally loose fuzzy matching, and the tests masking it. Summary: The intent was that [ar] doesn't match "FooBar"; the first character must match a Head character (hard requirement, not just a low score). This matches VSCode, and was "tested" but the tests were defective. The tests expected matches("FooBar") to fail for lack of a match. But instead it fails because the string should be annotated - matches("FooB[ar]"). This patch makes matches("FooBar") ignore annotations, as was intended. Fixing the code to reject weak matches for the first char causes problems: - [bre] no longer matches "HTMLBRElement". We allow matching against an uppercase char even if we don't think it's head. Only do this if there's at least one lowercase, to avoid triggering on MACROS - [print] no longer matches "sprintf". This is hard to fix without false positives (e.g. [int] vs "sprintf"]) This patch leaves this case broken. A future patch will add a dictionary providing custom segmentation to common names from the standard library. Fixed a couple of index tests that indirectly relied on broken fuzzy matching. Added const in a couple of missing places for consistency with new code. Subscribers: klimek, ilya-biryukov, jkorous-apple, ioeric, cfe-commits Differential Revision: https://reviews.llvm.org/D44003 llvm-svn: 326721 2018-03-06 01:34:33 +08:00			`int skipPenalty(int W, Action Last) const;`
			`int matchBonus(int P, int W, Action Last) const;`
[clangd] Fuzzy match scorer Summary: This will be used for rescoring code completion results based on partial identifiers. Short-term use: - we want to limit the number of code completion results returned to improve performance of global completion. The scorer will be used to rerank the results to return when the user has applied a filter. Long-term use case: - ranking of completion results from in-memory index - merging of completion results from multiple sources (merging usually works best when done at the component-score level, rescoring the fuzzy-match quality avoids different backends needing to have comparable scores) Reviewers: ilya-biryukov Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D40060 llvm-svn: 319557 2017-12-02 01:08:02 +08:00
			`// Pattern data is initialized by the constructor, then constant.`
			`char Pat[MaxPat]; // Pattern data`
			`int PatN; // Length`
			`char LowPat[MaxPat]; // Pattern in lowercase`
			`CharRole PatRole[MaxPat]; // Pattern segmentation info`
[clangd] FuzzyMatch exposes an API for its word segmentation. NFC Summary: This is intended to be used for indexing, e.g. in D49417 Reviewers: ioeric, omtcyfz Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, cfe-commits Differential Revision: https://reviews.llvm.org/D49540 llvm-svn: 337527 2018-07-20 16:01:37 +08:00			`CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters`
[clangd] Fuzzy match scorer Summary: This will be used for rescoring code completion results based on partial identifiers. Short-term use: - we want to limit the number of code completion results returned to improve performance of global completion. The scorer will be used to rerank the results to return when the user has applied a filter. Long-term use case: - ranking of completion results from in-memory index - merging of completion results from multiple sources (merging usually works best when done at the component-score level, rescoring the fuzzy-match quality avoids different backends needing to have comparable scores) Reviewers: ilya-biryukov Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D40060 llvm-svn: 319557 2017-12-02 01:08:02 +08:00			`float ScoreScale; // Normalizes scores for the pattern length.`

			`// Word data is initialized on each call to match(), mostly by init().`
			`char Word[MaxWord]; // Word data`
			`int WordN; // Length`
			`char LowWord[MaxWord]; // Word in lowercase`
			`CharRole WordRole[MaxWord]; // Word segmentation info`
[clangd] FuzzyMatch exposes an API for its word segmentation. NFC Summary: This is intended to be used for indexing, e.g. in D49417 Reviewers: ioeric, omtcyfz Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, cfe-commits Differential Revision: https://reviews.llvm.org/D49540 llvm-svn: 337527 2018-07-20 16:01:37 +08:00			`CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters`
[clangd] Fuzzy match scorer Summary: This will be used for rescoring code completion results based on partial identifiers. Short-term use: - we want to limit the number of code completion results returned to improve performance of global completion. The scorer will be used to rerank the results to return when the user has applied a filter. Long-term use case: - ranking of completion results from in-memory index - merging of completion results from multiple sources (merging usually works best when done at the component-score level, rescoring the fuzzy-match quality avoids different backends needing to have comparable scores) Reviewers: ilya-biryukov Subscribers: cfe-commits, mgorny Differential Revision: https://reviews.llvm.org/D40060 llvm-svn: 319557 2017-12-02 01:08:02 +08:00			`bool WordContainsPattern; // Simple substring check`

			`// Cumulative best-match score table.`
			`// Boundary conditions are filled in by the constructor.`
			`// The rest is repopulated for each match(), by buildGraph().`
			`struct ScoreInfo {`
			`signed int Score : 15;`
			`Action Prev : 1;`
			`};`
			`ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2];`
			`};`

			`} // namespace clangd`
			`} // namespace clang`

			`#endif`