llvm-project/clang-tools-extra/clangd/index/SymbolID.h

//===--- SymbolID.h ----------------------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLID_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLID_H

#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>

namespace clang {
namespace clangd {

// Compact identifier for a single C++ symbol (class, function, method, ...).
//
// A USR (Unified Symbol Resolution) string can be long, particularly for
// functions whose signatures spell out lengthy types. Rather than keeping the
// USR itself, a SymbolID keeps a truncated SHA1(USR): a small fixed-size value
// that is intended to be distinct per symbol while costing far less memory
// than the full string.
//
// SymbolIDs are suitable as lookup keys in the symbol indexes.
class SymbolID {
public:
  // Number of hash bytes retained after truncation. A smaller value saves
  // memory; a larger one raises the number of symbols that can be handled.
  static constexpr size_t RawSize = 8;

  // The default-constructed ID has every hash byte zero and is the null ID.
  SymbolID() = default;
  // Builds the ID for the symbol named by USR (defined out of line).
  explicit SymbolID(llvm::StringRef USR);

  // Access to / construction from the raw hash bytes (defined out of line).
  llvm::StringRef raw() const;
  static SymbolID fromRaw(llvm::StringRef);

  // Hex encoded textual form, and the parser that goes with it.
  std::string str() const;
  static llvm::Expected<SymbolID> fromStr(llvm::StringRef);

  // An ID is null when it equals a default-constructed one, i.e. its stored
  // bytes are all zero.
  bool isNull() const { return HashValue == Bytes{}; }
  explicit operator bool() const { return !isNull(); }

  // Equality and ordering are those of the underlying byte array; the
  // ordering is lexicographic over the hash bytes.
  bool operator==(const SymbolID &Sym) const {
    return HashValue == Sym.HashValue;
  }
  bool operator!=(const SymbolID &Sym) const {
    return HashValue != Sym.HashValue;
  }
  bool operator<(const SymbolID &Sym) const {
    return HashValue < Sym.HashValue;
  }

private:
  using Bytes = std::array<uint8_t, RawSize>;

  // Value-initialized so that a default-constructed ID holds zeros rather
  // than indeterminate bytes.
  Bytes HashValue{};
};

// Hash of a SymbolID for use with LLVM's hashing facilities.
//
// The ID is already a (truncated) hash, so no further mixing is done: the
// leading sizeof(size_t) bytes of ID.raw() are reinterpreted as a size_t and
// returned as the hash code.
inline llvm::hash_code hash_value(const SymbolID &ID) {
  // We already have a good hash, just return the first bytes.
  // The assertion keeps the copy below from asking for more bytes than a
  // SymbolID stores (assumes raw() exposes RawSize bytes; its definition is
  // not in this header).
  static_assert(sizeof(size_t) <= SymbolID::RawSize,
                "size_t longer than SHA1!");
  size_t Result;
  // Copy instead of casting the pointer: raw().data() is a char pointer with
  // no alignment guarantee suitable for a direct size_t load.
  std::memcpy(&Result, ID.raw().data(), sizeof(Result));
  return llvm::hash_code(Result);
}

// Write SymbolID into the given stream. SymbolID is encoded as ID.str(),
// which SymbolID declares as its hex encoded string form.
// Only declared here; the definition lives outside this header.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SymbolID &ID);

} // namespace clangd
} // namespace clang

namespace llvm {
// DenseMapInfo specialization that lets SymbolID serve as a DenseMap key.
template <> struct DenseMapInfo<clang::clangd::SymbolID> {
  // Sentinel for an unused bucket: the SymbolID built from the fixed string
  // below. It is constructed once, on first use, and copied out afterwards.
  static clang::clangd::SymbolID getEmptyKey() {
    static const clang::clangd::SymbolID Empty("EMPTYKEY");
    return Empty;
  }
  // Sentinel for an erased bucket; built the same way from its own string.
  static clang::clangd::SymbolID getTombstoneKey() {
    static const clang::clangd::SymbolID Tombstone("TOMBSTONEKEY");
    return Tombstone;
  }
  // Narrows the SymbolID's hash code to the unsigned value DenseMap expects.
  static unsigned getHashValue(const clang::clangd::SymbolID &Sym) {
    const size_t Wide = hash_value(Sym);
    return static_cast<unsigned>(Wide);
  }
  // Keys match exactly when the SymbolIDs compare equal.
  static bool isEqual(const clang::clangd::SymbolID &LHS,
                      const clang::clangd::SymbolID &RHS) {
    return !(LHS != RHS);
  }
};
} // namespace llvm

#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLID_H
[clangd][NFC] Move SymbolID to a separate file Prerequisity for textDocument/SymbolInfo Differential Revision: https://reviews.llvm.org/D54799 llvm-svn: 347674 2018-11-28 00:40:34 +08:00			`//===--- SymbolID.h ----------------------------------------------- C++--===//`
			`//`
Update the file headers across all of the LLVM projects in the monorepo to reflect the new license. We understand that people may be surprised that we're moving the header entirely to discuss the new license. We checked this carefully with the Foundation's lawyer and we believe this is the correct approach. Essentially, all code in the project is now made available by the LLVM project under our new license, so you will see that the license headers include that license only. Some of our contributors have contributed code under our old license, and accordingly, we have retained a copy of our old license notice in the top-level files in each project and repository. llvm-svn: 351636 2019-01-19 16:50:56 +08:00			`// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.`
			`// See https://llvm.org/LICENSE.txt for license information.`
			`// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception`
[clangd][NFC] Move SymbolID to a separate file Prerequisity for textDocument/SymbolInfo Differential Revision: https://reviews.llvm.org/D54799 llvm-svn: 347674 2018-11-28 00:40:34 +08:00			`//`
			`//===----------------------------------------------------------------------===//`

			`#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLID_H`
			`#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLID_H`

			`#include "llvm/ADT/Hashing.h"`
			`#include "llvm/ADT/StringRef.h"`
			`#include "llvm/Support/Error.h"`
			`#include "llvm/Support/raw_ostream.h"`
			`#include <array>`
[clangd] Performance improvements and cleanup - Inline SymbolID hashing to header - Don't collect references for symbols without a SymbolID - Store referenced symbols, rather than separately storing decls and macros. - Don't defer ref collection to end of translation unit - Perform const_cast when updating reference counts (~0.5% saving) - Introduce caching for getSymbolID in SymbolCollector. (~30% saving) - Don't modify symbolslab if there's no definition location - Don't lex the whole file to deduce spelled tokens, just lex the relevant piece (~8%) Overall this achieves ~38% reduction in time spent inside SymbolCollector compared to baseline (on my machine :)). I'd expect the last optimization to affect dynamic index a lot more, I was testing with clangd-indexer on clangd subfolder of LLVM. As clangd-indexer runs indexing of whole TU at once, we indeed see almost every token from every source included in the TU (hence lexing full files vs just lexing referenced tokens are almost the same), whereas during dynamic indexing we mostly index main file symbols, but we would touch the files defining/declaring those symbols, and lex complete files for nothing, rather than just the token location. The last optimization is also a functional change (added test), previously we used raw tokens from syntax::tokenize, which didn't canonicalize trigraphs/newlines in identifiers, wheres Lexer::getSpelling canonicalizes them. Differential Revision: https://reviews.llvm.org/D122894 2022-04-08 15:56:43 +08:00			`#include <cstddef>`
[clangd] Value initialize SymbolIDs We were default initializing SymbolIDs before, which would leave indeterminate values in underlying std::array. This patch updates the underlying data initalization to be value-init and adds a way to check for validness of a SymbolID. Differential Revision: https://reviews.llvm.org/D90397 2020-10-29 23:04:53 +08:00			`#include <cstdint>`
[clangd][NFC] Move SymbolID to a separate file Prerequisity for textDocument/SymbolInfo Differential Revision: https://reviews.llvm.org/D54799 llvm-svn: 347674 2018-11-28 00:40:34 +08:00			`#include <string>`

			`namespace clang {`
			`namespace clangd {`

			`// The class identifies a particular C++ symbol (class, function, method, etc).`
			`//`
			`// As USRs (Unified Symbol Resolution) could be large, especially for functions`
			`// with long type arguments, SymbolID is using truncated SHA1(USR) values to`
			`// guarantee the uniqueness of symbols while using a relatively small amount of`
			`// memory (vs storing USRs directly).`
			`//`
			`// SymbolID can be used as key in the symbol indexes to lookup the symbol.`
			`class SymbolID {`
			`public:`
			`SymbolID() = default;`
			`explicit SymbolID(llvm::StringRef USR);`

			`bool operator==(const SymbolID &Sym) const {`
			`return HashValue == Sym.HashValue;`
			`}`
[clangd] Performance improvements and cleanup - Inline SymbolID hashing to header - Don't collect references for symbols without a SymbolID - Store referenced symbols, rather than separately storing decls and macros. - Don't defer ref collection to end of translation unit - Perform const_cast when updating reference counts (~0.5% saving) - Introduce caching for getSymbolID in SymbolCollector. (~30% saving) - Don't modify symbolslab if there's no definition location - Don't lex the whole file to deduce spelled tokens, just lex the relevant piece (~8%) Overall this achieves ~38% reduction in time spent inside SymbolCollector compared to baseline (on my machine :)). I'd expect the last optimization to affect dynamic index a lot more, I was testing with clangd-indexer on clangd subfolder of LLVM. As clangd-indexer runs indexing of whole TU at once, we indeed see almost every token from every source included in the TU (hence lexing full files vs just lexing referenced tokens are almost the same), whereas during dynamic indexing we mostly index main file symbols, but we would touch the files defining/declaring those symbols, and lex complete files for nothing, rather than just the token location. The last optimization is also a functional change (added test), previously we used raw tokens from syntax::tokenize, which didn't canonicalize trigraphs/newlines in identifiers, wheres Lexer::getSpelling canonicalizes them. Differential Revision: https://reviews.llvm.org/D122894 2022-04-08 15:56:43 +08:00			`bool operator!=(const SymbolID &Sym) const { return !(*this == Sym); }`
[clangd][NFC] Move SymbolID to a separate file Prerequisity for textDocument/SymbolInfo Differential Revision: https://reviews.llvm.org/D54799 llvm-svn: 347674 2018-11-28 00:40:34 +08:00			`bool operator<(const SymbolID &Sym) const {`
			`return HashValue < Sym.HashValue;`
			`}`

			`// The stored hash is truncated to RawSize bytes.`
			`// This trades off memory against the number of symbols we can handle.`
			`constexpr static size_t RawSize = 8;`
			`llvm::StringRef raw() const;`
			`static SymbolID fromRaw(llvm::StringRef);`

			`// Returns a hex encoded string.`
			`std::string str() const;`
			`static llvm::Expected<SymbolID> fromStr(llvm::StringRef);`

[clangd] Value initialize SymbolIDs We were default initializing SymbolIDs before, which would leave indeterminate values in underlying std::array. This patch updates the underlying data initalization to be value-init and adds a way to check for validness of a SymbolID. Differential Revision: https://reviews.llvm.org/D90397 2020-10-29 23:04:53 +08:00			`bool isNull() const { return *this == SymbolID(); }`
			`explicit operator bool() const { return !isNull(); }`

[clangd][NFC] Move SymbolID to a separate file Prerequisity for textDocument/SymbolInfo Differential Revision: https://reviews.llvm.org/D54799 llvm-svn: 347674 2018-11-28 00:40:34 +08:00			`private:`
[clangd] Value initialize SymbolIDs We were default initializing SymbolIDs before, which would leave indeterminate values in underlying std::array. This patch updates the underlying data initalization to be value-init and adds a way to check for validness of a SymbolID. Differential Revision: https://reviews.llvm.org/D90397 2020-10-29 23:04:53 +08:00			`std::array<uint8_t, RawSize> HashValue{};`
[clangd][NFC] Move SymbolID to a separate file Prerequisity for textDocument/SymbolInfo Differential Revision: https://reviews.llvm.org/D54799 llvm-svn: 347674 2018-11-28 00:40:34 +08:00			`};`

[clangd] Performance improvements and cleanup - Inline SymbolID hashing to header - Don't collect references for symbols without a SymbolID - Store referenced symbols, rather than separately storing decls and macros. - Don't defer ref collection to end of translation unit - Perform const_cast when updating reference counts (~0.5% saving) - Introduce caching for getSymbolID in SymbolCollector. (~30% saving) - Don't modify symbolslab if there's no definition location - Don't lex the whole file to deduce spelled tokens, just lex the relevant piece (~8%) Overall this achieves ~38% reduction in time spent inside SymbolCollector compared to baseline (on my machine :)). I'd expect the last optimization to affect dynamic index a lot more, I was testing with clangd-indexer on clangd subfolder of LLVM. As clangd-indexer runs indexing of whole TU at once, we indeed see almost every token from every source included in the TU (hence lexing full files vs just lexing referenced tokens are almost the same), whereas during dynamic indexing we mostly index main file symbols, but we would touch the files defining/declaring those symbols, and lex complete files for nothing, rather than just the token location. The last optimization is also a functional change (added test), previously we used raw tokens from syntax::tokenize, which didn't canonicalize trigraphs/newlines in identifiers, wheres Lexer::getSpelling canonicalizes them. Differential Revision: https://reviews.llvm.org/D122894 2022-04-08 15:56:43 +08:00			`inline llvm::hash_code hash_value(const SymbolID &ID) {`
			`// We already have a good hash, just return the first bytes.`
			`static_assert(sizeof(size_t) <= SymbolID::RawSize,`
			`"size_t longer than SHA1!");`
			`size_t Result;`
			`memcpy(&Result, ID.raw().data(), sizeof(size_t));`
			`return llvm::hash_code(Result);`
			`}`
[clangd][NFC] Move SymbolID to a separate file Prerequisity for textDocument/SymbolInfo Differential Revision: https://reviews.llvm.org/D54799 llvm-svn: 347674 2018-11-28 00:40:34 +08:00
			`// Write SymbolID into the given stream. SymbolID is encoded as ID.str().`
			`llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SymbolID &ID);`

			`} // namespace clangd`
			`} // namespace clang`

Moved DenseMap support for SymbolID into SymbolID.h llvm-svn: 355081 2019-02-28 19:00:44 +08:00			`namespace llvm {`
			`// Support SymbolIDs as DenseMap keys.`
			`template <> struct DenseMapInfo<clang::clangd::SymbolID> {`
			`static inline clang::clangd::SymbolID getEmptyKey() {`
			`static clang::clangd::SymbolID EmptyKey("EMPTYKEY");`
			`return EmptyKey;`
			`}`
			`static inline clang::clangd::SymbolID getTombstoneKey() {`
			`static clang::clangd::SymbolID TombstoneKey("TOMBSTONEKEY");`
			`return TombstoneKey;`
			`}`
			`static unsigned getHashValue(const clang::clangd::SymbolID &Sym) {`
			`return hash_value(Sym);`
			`}`
			`static bool isEqual(const clang::clangd::SymbolID &LHS,`
			`const clang::clangd::SymbolID &RHS) {`
			`return LHS == RHS;`
			`}`
			`};`
			`} // namespace llvm`

[clangd][NFC] Move SymbolID to a separate file Prerequisity for textDocument/SymbolInfo Differential Revision: https://reviews.llvm.org/D54799 llvm-svn: 347674 2018-11-28 00:40:34 +08:00			`#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLID_H`