llvm-project/clang-tools-extra/clangd/Quality.h

//===--- Quality.h - Ranking alternatives for ambiguous queries -*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
///
/// Some operations such as code completion produce a set of candidates.
/// Usually the user can choose between them, but we should put the best options
/// at the top (they're easier to select, and more likely to be seen).
///
/// This file defines building blocks for ranking candidates.
/// It's used by the features directly and also in the implementation of
/// indexes, as indexes also need to heuristically limit their results.
///
/// The facilities here are:
///   - retrieving scoring signals from e.g. indexes, AST, CodeCompletionString
///     These are structured in a way that they can be debugged, and are fairly
///     consistent regardless of the source.
///   - computing scores from scoring signals. These are suitable for sorting.
///   - sorting utilities like the TopN container.
/// These could be split up further to isolate dependencies if we care.
///
//===---------------------------------------------------------------------===//
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_QUALITY_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_QUALITY_H
#include "llvm/ADT/StringRef.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <functional>
#include <string>
#include <utility>
#include <vector>
namespace llvm {
class raw_ostream;
}
namespace clang {
class CodeCompletionResult;
namespace clangd {
struct Symbol;

// Signals structs are designed to be aggregated from 0 or more sources.
// A default instance has neutral signals, and sources are merged into it.
// They can be dumped for debugging, and evaluate()d into a score.

/// Attributes of a symbol that affect how much we like it.
/// A default-constructed instance holds the neutral values given by the member
/// initializers below; call merge() for each available source, then evaluate()
/// to obtain a score.
struct SymbolQualitySignals {
  unsigned SemaCCPriority = 0; // 1-80, 1 is best. 0 means absent.
                               // FIXME: this is actually a mix of symbol
                               //        quality and relevance. Untangle this.
  bool Deprecated = false;     // Neutral default is "not deprecated".
                               // NOTE(review): presumably set by merge(); the
                               // definitions are not visible in this header.
  unsigned References = 0;     // Neutral default is 0.
                               // NOTE(review): looks like a usage count taken
                               // from the index symbol -- confirm in merge().

  // Folds signals from a Sema code-completion result into this instance.
  void merge(const CodeCompletionResult &SemaCCResult);
  // Folds signals from an index symbol into this instance.
  void merge(const Symbol &IndexResult);

  // Condense these signals down to a single number, higher is better.
  float evaluate() const;
};
/// Writes a SymbolQualitySignals to the given stream and returns that stream's
/// type by reference. Signals structs are meant to be dumpable for debugging;
/// the output format is defined in the implementation file.
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
                              const SymbolQualitySignals &);

/// Attributes of a symbol-query pair that affect how much we like it.
/// A default-constructed instance holds the values given by the member
/// initializers below; merge() folds in a Sema result and evaluate() yields
/// the score.
struct SymbolRelevanceSignals {
  // 0-1 fuzzy-match score for unqualified name. Must be explicitly assigned:
  // the default of 1 is the top of the range, not a computed match.
  float NameMatch = 1;
  bool Forbidden = false; // Unavailable (e.g. const) or inaccessible (private).
  /// Proximity between the best declaration and the query location. [0-1] score
  /// where 1 is closest.
  float ProximityScore = 0;

  // Folds signals from a Sema code-completion result into this instance.
  void merge(const CodeCompletionResult &SemaResult);

  // Condense these signals down to a single number, higher is better.
  float evaluate() const;
};
/// Writes a SymbolRelevanceSignals to the given stream and returns that
/// stream's type by reference. Signals structs are meant to be dumpable for
/// debugging; the output format is defined in the implementation file.
llvm::raw_ostream &operator<<(llvm::raw_ostream &,
                              const SymbolRelevanceSignals &);

/// Combine symbol quality and relevance into a single score.
/// NOTE(review): \p SymbolQuality and \p SymbolRelevance are presumably the
/// results of SymbolQualitySignals::evaluate() and
/// SymbolRelevanceSignals::evaluate(); the combining formula lives in the
/// implementation file and is not visible here.
float evaluateSymbolAndRelevance(float SymbolQuality, float SymbolRelevance);

/// TopN<T> is a bounded, lossy container: it retains at most N elements and,
/// once full, keeps only the ones that rank best under Compare.
/// Compare(A, B) must return true when A is better than B.
template <typename T, typename Compare = std::greater<T>> class TopN {
public:
  using value_type = T;
  TopN(size_t N, Compare Greater = Compare())
      : N(N), Greater(std::move(Greater)) {}

  // Offers a candidate to the set.
  // Returns true when the set was already at capacity, i.e. a candidate
  // (either V itself or the previous worst element) had to be discarded.
  bool push(value_type &&V) {
    const bool AtCapacity = Heap.size() >= N;
    if (!AtCapacity) {
      // Still room: every candidate is accepted.
      Heap.push_back(std::move(V));
      std::push_heap(Heap.begin(), Heap.end(), Greater);
    } else if (N > 0 && Greater(V, Heap.front())) {
      // V beats the current worst element (the heap root). Rotate the root to
      // the back, overwrite that slot with V, and restore the heap property.
      std::pop_heap(Heap.begin(), Heap.end(), Greater);
      Heap.back() = std::move(V);
      std::push_heap(Heap.begin(), Heap.end(), Greater);
    }
    // Otherwise V is no better than the worst retained element (or N == 0)
    // and is simply dropped.
    assert(Heap.size() <= N);
    assert(std::is_heap(Heap.begin(), Heap.end(), Greater));
    return AtCapacity;
  }

  // Consumes the container and returns the retained candidates, best first.
  std::vector<value_type> items() && {
    assert(Heap.size() <= N);
    // sort_heap with Greater leaves the range ordered best-to-worst.
    std::sort_heap(Heap.begin(), Heap.end(), Greater);
    return std::move(Heap);
  }

private:
  const size_t N;               // Capacity, fixed at construction.
  std::vector<value_type> Heap; // Min-heap under Greater: front() is the worst.
  Compare Greater;
};

/// Returns a string that sorts in the same order as (-Score, Tiebreak), for
/// LSP. (The highest score compares smallest so it sorts at the top).
/// \p Tiebreak is the secondary key, ordering entries whose scores compare
/// equal; it defaults to the empty string.
std::string sortText(float Score, llvm::StringRef Tiebreak = "");

} // namespace clangd
} // namespace clang

#endif
[clangd] Extract scoring/ranking logic, and shave yaks. Summary: Code completion scoring was embedded in CodeComplete.cpp, which is bad: - awkward to test. The mechanisms (extracting info from index/sema) can be unit-tested well, the policy (scoring) should be quantitatively measured. Neither was easily possible, and debugging was hard. The intermediate signal struct makes this easier. - hard to reuse. This is a bug in workspaceSymbols: it just presents the results in the index order, which is not sorted in practice, it needs to rank them! Also, index implementations care about scoring (both query-dependent and independent) in order to truncate result lists appropriately. The main yak shaved here is the build() function that had 3 variants across unit tests is unified in TestTU.h (rather than adding a 4th variant). Reviewers: ilya-biryukov Subscribers: klimek, mgorny, ioeric, MaskRay, jkorous, mgrang, cfe-commits Differential Revision: https://reviews.llvm.org/D46524 llvm-svn: 332378 2018-05-16 01:43:27 +08:00			`//===--- Quality.h - Ranking alternatives for ambiguous queries -- C++--===//`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===---------------------------------------------------------------------===//`
			`///`
			`/// Some operations such as code completion produce a set of candidates.`
			`/// Usually the user can choose between them, but we should put the best options`
			`/// at the top (they're easier to select, and more likely to be seen).`
			`///`
			`/// This file defines building blocks for ranking candidates.`
			`/// It's used by the features directly and also in the implementation of`
			`/// indexes, as indexes also need to heuristically limit their results.`
			`///`
			`/// The facilities here are:`
			`/// - retrieving scoring signals from e.g. indexes, AST, CodeCompletionString`
			`/// These are structured in a way that they can be debugged, and are fairly`
			`/// consistent regardless of the source.`
			`/// - compute scores from scoring signals. These are suitable for sorting.`
			`/// - sorting utilities like the TopN container.`
			`/// These could be split up further to isolate dependencies if we care.`
			`///`
			`//===---------------------------------------------------------------------===//`
			`#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_QUALITY_H`
			`#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_QUALITY_H`
			`#include "llvm/ADT/StringRef.h"`
			`#include <algorithm>`
			`#include <functional>`
			`#include <vector>`
			`namespace llvm {`
			`class raw_ostream;`
			`}`
			`namespace clang {`
			`class CodeCompletionResult;`
			`namespace clangd {`
			`struct Symbol;`

			`// Signals structs are designed to be aggregated from 0 or more sources.`
			`// A default instance has neutral signals, and sources are merged into it.`
			`// They can be dumped for debugging, and evaluate()d into a score.`

			`/// Attributes of a symbol that affect how much we like it.`
			`struct SymbolQualitySignals {`
			`unsigned SemaCCPriority = 0; // 1-80, 1 is best. 0 means absent.`
			`// FIXME: this is actually a mix of symbol`
			`// quality and relevance. Untangle this.`
			`bool Deprecated = false;`
			`unsigned References = 0;`

			`void merge(const CodeCompletionResult &SemaCCResult);`
			`void merge(const Symbol &IndexResult);`

			`// Condense these signals down to a single number, higher is better.`
			`float evaluate() const;`
			`};`
			`llvm::raw_ostream &operator<<(llvm::raw_ostream &,`
			`const SymbolQualitySignals &);`

			`/// Attributes of a symbol-query pair that affect how much we like it.`
			`struct SymbolRelevanceSignals {`
			`// 0-1 fuzzy-match score for unqualified name. Must be explicitly assigned.`
			`float NameMatch = 1;`
			`bool Forbidden = false; // Unavailable (e.g const) or inaccessible (private).`
[clangd] Boost scores for decls from current file in completion Summary: This should, arguably, give better ranking. Reviewers: ioeric, sammccall Reviewed By: sammccall Subscribers: mgorny, klimek, MaskRay, jkorous, mgrang, cfe-commits Differential Revision: https://reviews.llvm.org/D46943 llvm-svn: 333906 2018-06-04 22:50:59 +08:00			`/// Proximity between the best declaration and the query location. [0-1] score`
			`/// where 1 is closest`
			`float ProximityScore = 0;`
[clangd] Extract scoring/ranking logic, and shave yaks. Summary: Code completion scoring was embedded in CodeComplete.cpp, which is bad: - awkward to test. The mechanisms (extracting info from index/sema) can be unit-tested well, the policy (scoring) should be quantitatively measured. Neither was easily possible, and debugging was hard. The intermediate signal struct makes this easier. - hard to reuse. This is a bug in workspaceSymbols: it just presents the results in the index order, which is not sorted in practice, it needs to rank them! Also, index implementations care about scoring (both query-dependent and independent) in order to truncate result lists appropriately. The main yak shaved here is the build() function that had 3 variants across unit tests is unified in TestTU.h (rather than adding a 4th variant). Reviewers: ilya-biryukov Subscribers: klimek, mgorny, ioeric, MaskRay, jkorous, mgrang, cfe-commits Differential Revision: https://reviews.llvm.org/D46524 llvm-svn: 332378 2018-05-16 01:43:27 +08:00
			`void merge(const CodeCompletionResult &SemaResult);`

			`// Condense these signals down to a single number, higher is better.`
			`float evaluate() const;`
			`};`
			`llvm::raw_ostream &operator<<(llvm::raw_ostream &,`
			`const SymbolRelevanceSignals &);`

			`/// Combine symbol quality and relevance into a single score.`
			`float evaluateSymbolAndRelevance(float SymbolQuality, float SymbolRelevance);`

			`/// TopN<T> is a lossy container that preserves only the "best" N elements.`
			`template <typename T, typename Compare = std::greater<T>> class TopN {`
			`public:`
			`using value_type = T;`
			`TopN(size_t N, Compare Greater = Compare())`
			`: N(N), Greater(std::move(Greater)) {}`

			`// Adds a candidate to the set.`
			`// Returns true if a candidate was dropped to get back under N.`
			`bool push(value_type &&V) {`
			`bool Dropped = false;`
			`if (Heap.size() >= N) {`
			`Dropped = true;`
			`if (N > 0 && Greater(V, Heap.front())) {`
			`std::pop_heap(Heap.begin(), Heap.end(), Greater);`
			`Heap.back() = std::move(V);`
			`std::push_heap(Heap.begin(), Heap.end(), Greater);`
			`}`
			`} else {`
			`Heap.push_back(std::move(V));`
			`std::push_heap(Heap.begin(), Heap.end(), Greater);`
			`}`
			`assert(Heap.size() <= N);`
			`assert(std::is_heap(Heap.begin(), Heap.end(), Greater));`
			`return Dropped;`
			`}`

			`// Returns candidates from best to worst.`
			`std::vector<value_type> items() && {`
			`std::sort_heap(Heap.begin(), Heap.end(), Greater);`
			`assert(Heap.size() <= N);`
			`return std::move(Heap);`
			`}`

			`private:`
			`const size_t N;`
			`std::vector<value_type> Heap; // Min-heap, comparator is Greater.`
			`Compare Greater;`
			`};`

[clangd] clang-format the source code. NFC llvm-svn: 333537 2018-05-30 20:41:19 +08:00			`/// Returns a string that sorts in the same order as (-Score, Tiebreak), for`
			`/// LSP. (The highest score compares smallest so it sorts at the top).`
[clangd] Extract scoring/ranking logic, and shave yaks. Summary: Code completion scoring was embedded in CodeComplete.cpp, which is bad: - awkward to test. The mechanisms (extracting info from index/sema) can be unit-tested well, the policy (scoring) should be quantitatively measured. Neither was easily possible, and debugging was hard. The intermediate signal struct makes this easier. - hard to reuse. This is a bug in workspaceSymbols: it just presents the results in the index order, which is not sorted in practice, it needs to rank them! Also, index implementations care about scoring (both query-dependent and independent) in order to truncate result lists appropriately. The main yak shaved here is the build() function that had 3 variants across unit tests is unified in TestTU.h (rather than adding a 4th variant). Reviewers: ilya-biryukov Subscribers: klimek, mgorny, ioeric, MaskRay, jkorous, mgrang, cfe-commits Differential Revision: https://reviews.llvm.org/D46524 llvm-svn: 332378 2018-05-16 01:43:27 +08:00			`std::string sortText(float Score, llvm::StringRef Tiebreak = "");`

			`} // namespace clangd`
			`} // namespace clang`

			`#endif`