2018-11-28 00:40:34 +08:00
|
|
|
//===--- SymbolID.h ----------------------------------------------*- C++-*-===//
|
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2018-11-28 00:40:34 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLID_H
|
|
|
|
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLID_H
|
|
|
|
|
|
|
|
#include "llvm/ADT/Hashing.h"
|
|
|
|
#include "llvm/ADT/StringRef.h"
|
|
|
|
#include "llvm/Support/Error.h"
|
|
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
#include <array>
|
[clangd] Performance improvements and cleanup
- Inline SymbolID hashing to header
- Don't collect references for symbols without a SymbolID
- Store referenced symbols, rather than separately storing decls and
macros.
- Don't defer ref collection to end of translation unit
- Perform const_cast when updating reference counts (~0.5% saving)
- Introduce caching for getSymbolID in SymbolCollector. (~30% saving)
- Don't modify symbolslab if there's no definition location
- Don't lex the whole file to deduce spelled tokens, just lex the
relevant piece (~8%)
Overall this achieves ~38% reduction in time spent inside
SymbolCollector compared to baseline (on my machine :)).
I'd expect the last optimization to affect dynamic index a lot more, I
was testing with clangd-indexer on clangd subfolder of LLVM. As
clangd-indexer runs indexing of whole TU at once, we indeed see almost
every token from every source included in the TU (hence lexing full
files vs just lexing referenced tokens are almost the same), whereas
during dynamic indexing we mostly index main file symbols, but we would
touch the files defining/declaring those symbols, and lex complete files
for nothing, rather than just the token location.
The last optimization is also a functional change (added test),
previously we used raw tokens from syntax::tokenize, which didn't
canonicalize trigraphs/newlines in identifiers, whereas
Lexer::getSpelling canonicalizes them.
Differential Revision: https://reviews.llvm.org/D122894
2022-04-08 15:56:43 +08:00
|
|
|
#include <cstddef>
|
2020-10-29 23:04:53 +08:00
|
|
|
#include <cstdint>
#include <cstring>
|
2018-11-28 00:40:34 +08:00
|
|
|
#include <string>
|
|
|
|
|
|
|
|
namespace clang {
|
|
|
|
namespace clangd {
|
|
|
|
|
|
|
|
// The class identifies a particular C++ symbol (class, function, method, etc).
|
|
|
|
//
|
|
|
|
// As USRs (Unified Symbol Resolution) could be large, especially for functions
|
|
|
|
// with long type arguments, SymbolID is using truncated SHA1(USR) values to
|
|
|
|
// guarantee the uniqueness of symbols while using a relatively small amount of
|
|
|
|
// memory (vs storing USRs directly).
|
|
|
|
//
|
|
|
|
// SymbolID can be used as key in the symbol indexes to lookup the symbol.
|
|
|
|
class SymbolID {
|
|
|
|
public:
|
|
|
|
SymbolID() = default;
|
|
|
|
explicit SymbolID(llvm::StringRef USR);
|
|
|
|
|
|
|
|
bool operator==(const SymbolID &Sym) const {
|
|
|
|
return HashValue == Sym.HashValue;
|
|
|
|
}
|
[clangd] Performance improvements and cleanup
- Inline SymbolID hashing to header
- Don't collect references for symbols without a SymbolID
- Store referenced symbols, rather than separately storing decls and
macros.
- Don't defer ref collection to end of translation unit
- Perform const_cast when updating reference counts (~0.5% saving)
- Introduce caching for getSymbolID in SymbolCollector. (~30% saving)
- Don't modify symbolslab if there's no definition location
- Don't lex the whole file to deduce spelled tokens, just lex the
relevant piece (~8%)
Overall this achieves ~38% reduction in time spent inside
SymbolCollector compared to baseline (on my machine :)).
I'd expect the last optimization to affect dynamic index a lot more, I
was testing with clangd-indexer on clangd subfolder of LLVM. As
clangd-indexer runs indexing of whole TU at once, we indeed see almost
every token from every source included in the TU (hence lexing full
files vs just lexing referenced tokens are almost the same), whereas
during dynamic indexing we mostly index main file symbols, but we would
touch the files defining/declaring those symbols, and lex complete files
for nothing, rather than just the token location.
The last optimization is also a functional change (added test),
previously we used raw tokens from syntax::tokenize, which didn't
canonicalize trigraphs/newlines in identifiers, wheres
Lexer::getSpelling canonicalizes them.
Differential Revision: https://reviews.llvm.org/D122894
2022-04-08 15:56:43 +08:00
|
|
|
bool operator!=(const SymbolID &Sym) const { return !(*this == Sym); }
|
2018-11-28 00:40:34 +08:00
|
|
|
bool operator<(const SymbolID &Sym) const {
|
|
|
|
return HashValue < Sym.HashValue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The stored hash is truncated to RawSize bytes.
|
|
|
|
// This trades off memory against the number of symbols we can handle.
|
|
|
|
constexpr static size_t RawSize = 8;
|
|
|
|
llvm::StringRef raw() const;
|
|
|
|
static SymbolID fromRaw(llvm::StringRef);
|
|
|
|
|
|
|
|
// Returns a hex encoded string.
|
|
|
|
std::string str() const;
|
|
|
|
static llvm::Expected<SymbolID> fromStr(llvm::StringRef);
|
|
|
|
|
2020-10-29 23:04:53 +08:00
|
|
|
bool isNull() const { return *this == SymbolID(); }
|
|
|
|
explicit operator bool() const { return !isNull(); }
|
|
|
|
|
2018-11-28 00:40:34 +08:00
|
|
|
private:
|
2020-10-29 23:04:53 +08:00
|
|
|
std::array<uint8_t, RawSize> HashValue{};
|
2018-11-28 00:40:34 +08:00
|
|
|
};
|
|
|
|
|
[clangd] Performance improvements and cleanup
- Inline SymbolID hashing to header
- Don't collect references for symbols without a SymbolID
- Store referenced symbols, rather than separately storing decls and
macros.
- Don't defer ref collection to end of translation unit
- Perform const_cast when updating reference counts (~0.5% saving)
- Introduce caching for getSymbolID in SymbolCollector. (~30% saving)
- Don't modify symbolslab if there's no definition location
- Don't lex the whole file to deduce spelled tokens, just lex the
relevant piece (~8%)
Overall this achieves ~38% reduction in time spent inside
SymbolCollector compared to baseline (on my machine :)).
I'd expect the last optimization to affect dynamic index a lot more, I
was testing with clangd-indexer on clangd subfolder of LLVM. As
clangd-indexer runs indexing of whole TU at once, we indeed see almost
every token from every source included in the TU (hence lexing full
files vs just lexing referenced tokens are almost the same), whereas
during dynamic indexing we mostly index main file symbols, but we would
touch the files defining/declaring those symbols, and lex complete files
for nothing, rather than just the token location.
The last optimization is also a functional change (added test),
previously we used raw tokens from syntax::tokenize, which didn't
canonicalize trigraphs/newlines in identifiers, whereas
Lexer::getSpelling canonicalizes them.
Differential Revision: https://reviews.llvm.org/D122894
2022-04-08 15:56:43 +08:00
|
|
|
// Computes the llvm::hash_code of a SymbolID.
//
// No additional mixing is performed: the leading sizeof(size_t) bytes of
// ID.raw() are reinterpreted as the hash. The static_assert guarantees that
// the ID holds at least that many bytes.
inline llvm::hash_code hash_value(const SymbolID &ID) {
  // We already have a good hash, just return the first bytes.
  static_assert(sizeof(size_t) <= SymbolID::RawSize,
                "size_t longer than SHA1!");
  size_t Result;
  // Copy the bytes instead of casting the pointer, so nothing is assumed
  // about the alignment of the raw buffer.
  std::memcpy(&Result, ID.raw().data(), sizeof(size_t));
  return llvm::hash_code(Result);
}
|
2018-11-28 00:40:34 +08:00
|
|
|
|
|
|
|
// Write SymbolID into the given stream. SymbolID is encoded as ID.str().
|
|
|
|
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const SymbolID &ID);
|
|
|
|
|
|
|
|
} // namespace clangd
|
|
|
|
} // namespace clang
|
|
|
|
|
2019-02-28 19:00:44 +08:00
|
|
|
namespace llvm {
|
|
|
|
// Support SymbolIDs as DenseMap keys.
|
|
|
|
template <> struct DenseMapInfo<clang::clangd::SymbolID> {
|
|
|
|
static inline clang::clangd::SymbolID getEmptyKey() {
|
|
|
|
static clang::clangd::SymbolID EmptyKey("EMPTYKEY");
|
|
|
|
return EmptyKey;
|
|
|
|
}
|
|
|
|
static inline clang::clangd::SymbolID getTombstoneKey() {
|
|
|
|
static clang::clangd::SymbolID TombstoneKey("TOMBSTONEKEY");
|
|
|
|
return TombstoneKey;
|
|
|
|
}
|
|
|
|
static unsigned getHashValue(const clang::clangd::SymbolID &Sym) {
|
|
|
|
return hash_value(Sym);
|
|
|
|
}
|
|
|
|
static bool isEqual(const clang::clangd::SymbolID &LHS,
|
|
|
|
const clang::clangd::SymbolID &RHS) {
|
|
|
|
return LHS == RHS;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
} // namespace llvm
|
|
|
|
|
2018-11-28 00:40:34 +08:00
|
|
|
#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_SYMBOLID_H
|