[clangd] Define a compact binary serialization format for symbol slab/index.
Summary:
This is intended to replace the current YAML format for general use.
It's ~10x more compact than YAML, and ~40% more compact than gzipped YAML:
llvmidx.riff = 20M, llvmidx.yaml = 272M, llvmidx.yaml.gz = 32M
It's also simpler/faster to read and write.
The format is a RIFF container (chunks of (type, size, data)) with:
- a compressed string table
- simple binary encoding of symbols (with varints for compactness)
It can be extended to include occurrences, Dex posting lists, etc.
There's no rich backwards-compatibility scheme, but a version number is included
so we can detect incompatible files and do ad-hoc back-compat.
Alternatives considered:
- compressed YAML or JSON: bulky and slow to load
- llvm bitstream: confusing model and libraries are hard to use. My attempt
produced slightly larger files, and the code was longer and slower.
- protobuf or similar: would be really nice (esp for back-compat) but the
dependency is a big hassle
- ad-hoc binary format without a container: it seems clear we're going
to add posting lists and occurrences here, and that they will benefit
from sharing a string table. The container makes it easy to debug
these pieces in isolation, and make them optional.
Reviewers: ioeric
Subscribers: mgorny, ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, cfe-commits
Differential Revision: https://reviews.llvm.org/D51585
llvm-svn: 341375
2018-09-05 00:16:50 +08:00
|
|
|
//===-- Serialization.cpp - Binary serialization of index data ------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "Serialization.h"
|
2018-09-07 02:52:26 +08:00
|
|
|
#include "Index.h"
|
2018-09-07 17:40:36 +08:00
|
|
|
#include "RIFF.h"
|
[clangd] Define a compact binary serialization format for symbol slab/index.
Summary:
This is intended to replace the current YAML format for general use.
It's ~10x more compact than YAML, and ~40% more compact than gzipped YAML:
llvmidx.riff = 20M, llvmidx.yaml = 272M, llvmidx.yaml.gz = 32M
It's also simpler/faster to read and write.
The format is a RIFF container (chunks of (type, size, data)) with:
- a compressed string table
- simple binary encoding of symbols (with varints for compactness)
It can be extended to include occurrences, Dex posting lists, etc.
There's no rich backwards-compatibility scheme, but a version number is included
so we can detect incompatible files and do ad-hoc back-compat.
Alternatives considered:
- compressed YAML or JSON: bulky and slow to load
- llvm bitstream: confusing model and libraries are hard to use. My attempt
produced slightly larger files, and the code was longer and slower.
- protobuf or similar: would be really nice (esp for back-compat) but the
dependency is a big hassle
- ad-hoc binary format without a container: it seems clear we're going
to add posting lists and occurrences here, and that they will benefit
from sharing a string table. The container makes it easy to debug
these pieces in isolation, and make them optional.
Reviewers: ioeric
Subscribers: mgorny, ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, cfe-commits
Differential Revision: https://reviews.llvm.org/D51585
llvm-svn: 341375
2018-09-05 00:16:50 +08:00
|
|
|
#include "llvm/Support/Compression.h"
|
|
|
|
#include "llvm/Support/Endian.h"
|
|
|
|
#include "llvm/Support/Error.h"
|
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
namespace clang {
|
|
|
|
namespace clangd {
|
|
|
|
namespace {
|
|
|
|
// Builds an llvm::Error that carries only a human-readable message.
// The error code is the "inconvertible" placeholder, i.e. callers are not
// expected to map this error to a std::error_code.
Error makeError(const Twine &Msg) {
  auto Code = inconvertibleErrorCode();
  return make_error<StringError>(Msg, Code);
}
|
|
|
|
|
|
|
|
// IO PRIMITIVES
|
|
|
|
// We use little-endian 32 bit ints, sometimes with variable-length encoding.
|
|
|
|
|
|
|
|
// Splits the first N bytes off the front of Data.
// Returns the removed prefix and advances Data past it.
StringRef consume(StringRef &Data, int N) {
  const StringRef Head = Data.take_front(N);
  Data = Data.drop_front(N);
  return Head;
}
|
|
|
|
|
|
|
|
// Reads a single byte from the front of Data and advances Data past it.
// No bounds check is performed here; Data must be non-empty.
uint8_t consume8(StringRef &Data) {
  const uint8_t Byte = Data.front();
  Data = Data.drop_front();
  return Byte;
}
|
|
|
|
|
|
|
|
// Reads a little-endian 32-bit integer from the front of Data and advances
// Data by four bytes. No bounds check is performed here; the caller must make
// sure at least four bytes are available.
uint32_t consume32(StringRef &Data) {
  const uint32_t Value = support::endian::read32le(Data.bytes_begin());
  Data = Data.drop_front(4);
  return Value;
}
|
|
|
|
|
|
|
|
// Writes I to OS as a fixed-width, little-endian 32-bit integer (4 bytes).
void write32(uint32_t I, raw_ostream &OS) {
  char Encoded[4];
  support::endian::write32le(Encoded, I);
  OS.write(Encoded, sizeof(Encoded));
}
|
|
|
|
|
|
|
|
// Variable-length int encoding (varint) uses the bottom 7 bits of each byte
|
|
|
|
// to encode the number, and the top bit to indicate whether more bytes follow.
|
|
|
|
// e.g. 9a 2f means [0x1a and keep reading, 0x2f and stop].
|
|
|
|
// This represents 0x1a | 0x2f<<7 = 6042.
|
|
|
|
// A 32-bit integer takes 1-5 bytes to encode; small numbers are more compact.
|
|
|
|
void writeVar(uint32_t I, raw_ostream &OS) {
|
|
|
|
constexpr static uint8_t More = 1 << 7;
|
|
|
|
if (LLVM_LIKELY(I < 1 << 7)) {
|
|
|
|
OS.write(I);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
for (;;) {
|
|
|
|
OS.write(I | More);
|
|
|
|
I >>= 7;
|
|
|
|
if (I < 1 << 7) {
|
|
|
|
OS.write(I);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decodes one varint (see writeVar) from the front of Data and advances Data
// past the bytes that were read. At most five bytes are consumed.
//
// No bounds checks are performed here: the caller must guarantee that enough
// bytes are available (readSymbol does this by padding short buffers).
uint32_t consumeVar(StringRef &Data) {
  constexpr static uint32_t More = 1u << 7;    // Continuation flag.
  constexpr static uint32_t Payload = More - 1; // Low 7 bits carry the value.

  // Keep the byte in a 32-bit unsigned. With a uint8_t, the mask-and-shift
  // below would be evaluated in (signed) int after integer promotion, and
  // shifting a payload into the sign bit at Shift == 28 is undefined.
  uint32_t B = consume8(Data);
  if (LLVM_LIKELY(!(B & More)))
    return B;
  uint32_t Val = B & Payload;
  for (int Shift = 7; (B & More) && Shift < 32; Shift += 7) {
    B = consume8(Data);
    // Unsigned shift: bits that do not fit in 32 bits are simply discarded.
    Val |= (B & Payload) << Shift;
  }
  return Val;
}
|
|
|
|
|
|
|
|
// STRING TABLE ENCODING
|
|
|
|
// Index data has many string fields, and many strings are identical.
|
|
|
|
// We store each string once, and refer to them by index.
|
|
|
|
//
|
|
|
|
// The string table's format is:
|
2018-09-05 21:17:47 +08:00
|
|
|
// - UncompressedSize : uint32 (or 0 for no compression)
|
[clangd] Define a compact binary serialization format for symbol slab/index.
Summary:
This is intended to replace the current YAML format for general use.
It's ~10x more compact than YAML, and ~40% more compact than gzipped YAML:
llvmidx.riff = 20M, llvmidx.yaml = 272M, llvmidx.yaml.gz = 32M
It's also simpler/faster to read and write.
The format is a RIFF container (chunks of (type, size, data)) with:
- a compressed string table
- simple binary encoding of symbols (with varints for compactness)
It can be extended to include occurrences, Dex posting lists, etc.
There's no rich backwards-compatibility scheme, but a version number is included
so we can detect incompatible files and do ad-hoc back-compat.
Alternatives considered:
- compressed YAML or JSON: bulky and slow to load
- llvm bitstream: confusing model and libraries are hard to use. My attempt
produced slightly larger files, and the code was longer and slower.
- protobuf or similar: would be really nice (esp for back-compat) but the
dependency is a big hassle
- ad-hoc binary format without a container: it seems clear we're going
to add posting lists and occurrences here, and that they will benefit
from sharing a string table. The container makes it easy to debug
these pieces in isolation, and make them optional.
Reviewers: ioeric
Subscribers: mgorny, ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, cfe-commits
Differential Revision: https://reviews.llvm.org/D51585
llvm-svn: 341375
2018-09-05 00:16:50 +08:00
|
|
|
// - CompressedData : byte[CompressedSize]
|
|
|
|
//
|
|
|
|
// CompressedData is a zlib-compressed byte[UncompressedSize].
|
|
|
|
// It contains a sequence of null-terminated strings, e.g. "foo\0bar\0".
|
|
|
|
// These are sorted to improve compression.
|
|
|
|
|
|
|
|
// Maps each string to a canonical representation.
|
|
|
|
// Strings remain owned externally (e.g. by SymbolSlab).
|
|
|
|
class StringTableOut {
|
|
|
|
DenseSet<StringRef> Unique;
|
|
|
|
std::vector<StringRef> Sorted;
|
|
|
|
// Since strings are interned, look up can be by pointer.
|
|
|
|
DenseMap<std::pair<const char *, size_t>, unsigned> Index;
|
|
|
|
|
|
|
|
public:
|
2018-09-05 21:17:47 +08:00
|
|
|
StringTableOut() {
|
|
|
|
// Ensure there's at least one string in the table.
|
|
|
|
// Table size zero is reserved to indicate no compression.
|
|
|
|
Unique.insert("");
|
|
|
|
}
|
[clangd] Define a compact binary serialization fomat for symbol slab/index.
Summary:
This is intended to replace the current YAML format for general use.
It's ~10x more compact than YAML, and ~40% more compact than gzipped YAML:
llvmidx.riff = 20M, llvmidx.yaml = 272M, llvmidx.yaml.gz = 32M
It's also simpler/faster to read and write.
The format is a RIFF container (chunks of (type, size, data)) with:
- a compressed string table
- simple binary encoding of symbols (with varints for compactness)
It can be extended to include occurrences, Dex posting lists, etc.
There's no rich backwards-compatibility scheme, but a version number is included
so we can detect incompatible files and do ad-hoc back-compat.
Alternatives considered:
- compressed YAML or JSON: bulky and slow to load
- llvm bitstream: confusing model and libraries are hard to use. My attempt
produced slightly larger files, and the code was longer and slower.
- protobuf or similar: would be really nice (esp for back-compat) but the
dependency is a big hassle
- ad-hoc binary format without a container: it seems clear we're going
to add posting lists and occurrences here, and that they will benefit
from sharing a string table. The container makes it easy to debug
these pieces in isolation, and make them optional.
Reviewers: ioeric
Subscribers: mgorny, ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, cfe-commits
Differential Revision: https://reviews.llvm.org/D51585
llvm-svn: 341375
2018-09-05 00:16:50 +08:00
|
|
|
// Add a string to the table. Overwrites S if an identical string exists.
|
|
|
|
void intern(StringRef &S) { S = *Unique.insert(S).first; };
|
|
|
|
// Finalize the table and write it to OS. No more strings may be added.
|
|
|
|
void finalize(raw_ostream &OS) {
|
|
|
|
Sorted = {Unique.begin(), Unique.end()};
|
|
|
|
std::sort(Sorted.begin(), Sorted.end());
|
|
|
|
for (unsigned I = 0; I < Sorted.size(); ++I)
|
|
|
|
Index.try_emplace({Sorted[I].data(), Sorted[I].size()}, I);
|
|
|
|
|
|
|
|
std::string RawTable;
|
|
|
|
for (StringRef S : Sorted) {
|
|
|
|
RawTable.append(S);
|
|
|
|
RawTable.push_back(0);
|
|
|
|
}
|
2018-09-05 21:17:47 +08:00
|
|
|
if (zlib::isAvailable()) {
|
|
|
|
SmallString<1> Compressed;
|
|
|
|
cantFail(zlib::compress(RawTable, Compressed));
|
|
|
|
write32(RawTable.size(), OS);
|
|
|
|
OS << Compressed;
|
|
|
|
} else {
|
|
|
|
write32(0, OS); // No compression.
|
|
|
|
OS << RawTable;
|
|
|
|
}
|
[clangd] Define a compact binary serialization fomat for symbol slab/index.
Summary:
This is intended to replace the current YAML format for general use.
It's ~10x more compact than YAML, and ~40% more compact than gzipped YAML:
llvmidx.riff = 20M, llvmidx.yaml = 272M, llvmidx.yaml.gz = 32M
It's also simpler/faster to read and write.
The format is a RIFF container (chunks of (type, size, data)) with:
- a compressed string table
- simple binary encoding of symbols (with varints for compactness)
It can be extended to include occurrences, Dex posting lists, etc.
There's no rich backwards-compatibility scheme, but a version number is included
so we can detect incompatible files and do ad-hoc back-compat.
Alternatives considered:
- compressed YAML or JSON: bulky and slow to load
- llvm bitstream: confusing model and libraries are hard to use. My attempt
produced slightly larger files, and the code was longer and slower.
- protobuf or similar: would be really nice (esp for back-compat) but the
dependency is a big hassle
- ad-hoc binary format without a container: it seems clear we're going
to add posting lists and occurrences here, and that they will benefit
from sharing a string table. The container makes it easy to debug
these pieces in isolation, and make them optional.
Reviewers: ioeric
Subscribers: mgorny, ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, cfe-commits
Differential Revision: https://reviews.llvm.org/D51585
llvm-svn: 341375
2018-09-05 00:16:50 +08:00
|
|
|
}
|
|
|
|
// Get the ID of an string, which must be interned. Table must be finalized.
|
|
|
|
unsigned index(StringRef S) const {
|
|
|
|
assert(!Sorted.empty() && "table not finalized");
|
|
|
|
assert(Index.count({S.data(), S.size()}) && "string not interned");
|
|
|
|
return Index.find({S.data(), S.size()})->second;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
struct StringTableIn {
|
|
|
|
BumpPtrAllocator Arena;
|
|
|
|
std::vector<StringRef> Strings;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Parses a serialized string table.
//
// Data starts with a little-endian uint32 UncompressedSize. If it is zero the
// remainder is the raw table, otherwise the remainder is zlib-compressed and
// inflates to UncompressedSize bytes. The raw table is a sequence of
// NUL-terminated strings.
//
// Returns the decoded table (strings are copied into the table's own arena),
// or an error if the header is truncated, the data is compressed but zlib is
// not available in this build, decompression fails, or the final string is
// not NUL-terminated.
Expected<StringTableIn> readStringTable(StringRef Data) {
  if (Data.size() < 4)
    return makeError("Bad string table: not enough metadata");
  size_t UncompressedSize = consume32(Data);

  StringRef Uncompressed;
  SmallString<1> UncompressedStorage;
  if (UncompressedSize == 0) { // No compression
    Uncompressed = Data;
  } else {
    // The file may have been produced by a build that has zlib while this one
    // does not. Report that as an error instead of calling into an
    // unavailable zlib.
    if (!zlib::isAvailable())
      return makeError("Compressed string table, but zlib is unavailable");
    if (Error E =
            llvm::zlib::uncompress(Data, UncompressedStorage, UncompressedSize))
      return std::move(E);
    Uncompressed = UncompressedStorage;
  }

  StringTableIn Table;
  StringSaver Saver(Table.Arena);
  for (StringRef Rest = Uncompressed; !Rest.empty();) {
    size_t Len = Rest.find('\0');
    if (Len == StringRef::npos)
      return makeError("Bad string table: not null terminated");
    Table.Strings.push_back(Saver.save(Rest.take_front(Len)));
    // Skip the string and its terminator. Len stays a size_t throughout, so
    // long entries are not narrowed to int.
    Rest = Rest.drop_front(Len + 1);
  }
  // Explicit move: converts the local into the Expected<> return value.
  return std::move(Table);
}
|
|
|
|
|
|
|
|
// SYMBOL ENCODING
|
|
|
|
// Each field of clangd::Symbol is encoded in turn (see implementation).
|
|
|
|
// - StringRef fields encode as varint (index into the string table)
|
|
|
|
// - enums encode as the underlying type
|
|
|
|
// - most numbers encode as varint
|
|
|
|
|
|
|
|
// The reader assumes that no encoded symbol is longer than this many bytes:
// readSymbol pads short buffers up to this size so it can skip bounds checks.
constexpr size_t SymbolSizeBound = 512;
// Upper limit on the number of include headers stored per symbol, which keeps
// the encoded size of a symbol bounded.
constexpr unsigned MaxIncludes = 50;
|
|
|
|
|
|
|
|
void writeSymbol(const Symbol &Sym, const StringTableOut &Strings,
|
|
|
|
raw_ostream &OS) {
|
|
|
|
auto StartOffset = OS.tell();
|
|
|
|
OS << Sym.ID.raw(); // TODO: once we start writing xrefs and posting lists,
|
|
|
|
// symbol IDs should probably be in a string table.
|
|
|
|
OS.write(static_cast<uint8_t>(Sym.SymInfo.Kind));
|
|
|
|
OS.write(static_cast<uint8_t>(Sym.SymInfo.Lang));
|
|
|
|
writeVar(Strings.index(Sym.Name), OS);
|
|
|
|
writeVar(Strings.index(Sym.Scope), OS);
|
|
|
|
for (const auto &Loc : {Sym.Definition, Sym.CanonicalDeclaration}) {
|
|
|
|
writeVar(Strings.index(Loc.FileURI), OS);
|
|
|
|
for (const auto &Endpoint : {Loc.Start, Loc.End}) {
|
|
|
|
writeVar(Endpoint.Line, OS);
|
|
|
|
writeVar(Endpoint.Column, OS);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
writeVar(Sym.References, OS);
|
2018-09-07 02:52:26 +08:00
|
|
|
OS.write(static_cast<uint8_t>(Sym.Flags));
|
[clangd] Define a compact binary serialization fomat for symbol slab/index.
Summary:
This is intended to replace the current YAML format for general use.
It's ~10x more compact than YAML, and ~40% more compact than gzipped YAML:
llvmidx.riff = 20M, llvmidx.yaml = 272M, llvmidx.yaml.gz = 32M
It's also simpler/faster to read and write.
The format is a RIFF container (chunks of (type, size, data)) with:
- a compressed string table
- simple binary encoding of symbols (with varints for compactness)
It can be extended to include occurrences, Dex posting lists, etc.
There's no rich backwards-compatibility scheme, but a version number is included
so we can detect incompatible files and do ad-hoc back-compat.
Alternatives considered:
- compressed YAML or JSON: bulky and slow to load
- llvm bitstream: confusing model and libraries are hard to use. My attempt
produced slightly larger files, and the code was longer and slower.
- protobuf or similar: would be really nice (esp for back-compat) but the
dependency is a big hassle
- ad-hoc binary format without a container: it seems clear we're going
to add posting lists and occurrences here, and that they will benefit
from sharing a string table. The container makes it easy to debug
these pieces in isolation, and make them optional.
Reviewers: ioeric
Subscribers: mgorny, ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, cfe-commits
Differential Revision: https://reviews.llvm.org/D51585
llvm-svn: 341375
2018-09-05 00:16:50 +08:00
|
|
|
OS.write(static_cast<uint8_t>(Sym.Origin));
|
|
|
|
writeVar(Strings.index(Sym.Signature), OS);
|
|
|
|
writeVar(Strings.index(Sym.CompletionSnippetSuffix), OS);
|
|
|
|
writeVar(Strings.index(Sym.Documentation), OS);
|
|
|
|
writeVar(Strings.index(Sym.ReturnType), OS);
|
|
|
|
|
|
|
|
auto WriteInclude = [&](const Symbol::IncludeHeaderWithReferences &Include) {
|
|
|
|
writeVar(Strings.index(Include.IncludeHeader), OS);
|
|
|
|
writeVar(Include.References, OS);
|
|
|
|
};
|
|
|
|
// There are almost certainly few includes, so we can just write them.
|
|
|
|
if (LLVM_LIKELY(Sym.IncludeHeaders.size() <= MaxIncludes)) {
|
|
|
|
writeVar(Sym.IncludeHeaders.size(), OS);
|
|
|
|
for (const auto &Include : Sym.IncludeHeaders)
|
|
|
|
WriteInclude(Include);
|
|
|
|
} else {
|
|
|
|
// If there are too many, make sure we truncate the least important.
|
|
|
|
using Pointer = const Symbol::IncludeHeaderWithReferences *;
|
|
|
|
std::vector<Pointer> Pointers;
|
|
|
|
for (const auto &Include : Sym.IncludeHeaders)
|
|
|
|
Pointers.push_back(&Include);
|
|
|
|
std::sort(Pointers.begin(), Pointers.end(), [](Pointer L, Pointer R) {
|
|
|
|
return L->References > R->References;
|
|
|
|
});
|
|
|
|
Pointers.resize(MaxIncludes);
|
|
|
|
|
|
|
|
writeVar(MaxIncludes, OS);
|
|
|
|
for (Pointer P : Pointers)
|
|
|
|
WriteInclude(*P);
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(OS.tell() - StartOffset < SymbolSizeBound && "Symbol length unsafe!");
|
|
|
|
(void)StartOffset; // Unused in NDEBUG;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Decodes one symbol from the front of Data and advances Data past it.
// String fields are stored as indexes into Strings; the decoded symbol's
// StringRefs point at the entries of that table.
//
// Returns the symbol, or an error if a string index is out of range, the
// include-header count exceeds MaxIncludes, or decoding ran past the end of
// the available data.
Expected<Symbol> readSymbol(StringRef &Data, const StringTableIn &Strings) {
  // Usually we can skip bounds checks because the buffer is huge.
  // Near the end of the buffer, this would be unsafe. In this rare case, copy
  // the data into a bigger (zero-padded) buffer so we can again skip the
  // checks, then verify afterwards that only real data was consumed.
  if (LLVM_UNLIKELY(Data.size() < SymbolSizeBound)) {
    std::string Buf(Data);
    Buf.resize(SymbolSizeBound);
    StringRef ExtendedData = Buf;
    auto Ret = readSymbol(ExtendedData, Strings);
    unsigned BytesRead = Buf.size() - ExtendedData.size();
    if (BytesRead > Data.size()) {
      // The nested read may itself have failed. Its error must be handled
      // before Ret is destroyed, so drop it explicitly in favour of the
      // overrun error reported here.
      if (!Ret)
        consumeError(Ret.takeError());
      return makeError("read past end of data");
    }
    Data = Data.drop_front(BytesRead);
    return Ret;
  }

// Reads a varint string index and stores the referenced string into Field.
// Returns from readSymbol with an error if the index is out of range.
#define READ_STRING(Field)                                                     \
  do {                                                                         \
    auto StringIndex = consumeVar(Data);                                       \
    if (LLVM_UNLIKELY(StringIndex >= Strings.Strings.size()))                  \
      return makeError("Bad string index");                                    \
    Field = Strings.Strings[StringIndex];                                      \
  } while (0)

  // Fields are read in exactly the order writeSymbol emits them.
  Symbol Sym;
  Sym.ID = SymbolID::fromRaw(consume(Data, 20));
  Sym.SymInfo.Kind = static_cast<index::SymbolKind>(consume8(Data));
  Sym.SymInfo.Lang = static_cast<index::SymbolLanguage>(consume8(Data));
  READ_STRING(Sym.Name);
  READ_STRING(Sym.Scope);
  for (SymbolLocation *Loc : {&Sym.Definition, &Sym.CanonicalDeclaration}) {
    READ_STRING(Loc->FileURI);
    for (auto &Endpoint : {&Loc->Start, &Loc->End}) {
      Endpoint->Line = consumeVar(Data);
      Endpoint->Column = consumeVar(Data);
    }
  }
  Sym.References = consumeVar(Data);
  Sym.Flags = static_cast<Symbol::SymbolFlag>(consume8(Data));
  Sym.Origin = static_cast<SymbolOrigin>(consume8(Data));
  READ_STRING(Sym.Signature);
  READ_STRING(Sym.CompletionSnippetSuffix);
  READ_STRING(Sym.Documentation);
  READ_STRING(Sym.ReturnType);
  unsigned IncludeHeaderN = consumeVar(Data);
  // Reject counts the writer would never produce before resizing.
  if (IncludeHeaderN > MaxIncludes)
    return makeError("too many IncludeHeaders");
  Sym.IncludeHeaders.resize(IncludeHeaderN);
  for (auto &I : Sym.IncludeHeaders) {
    READ_STRING(I.IncludeHeader);
    I.References = consumeVar(Data);
  }

#undef READ_STRING
  // Explicit move: converts the local into the Expected<> return value.
  return std::move(Sym);
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// FILE ENCODING
|
|
|
|
// A file is a RIFF chunk with type 'CdIx'.
|
|
|
|
// It contains the sections:
|
|
|
|
// - meta: version number
|
|
|
|
// - stri: string table
|
|
|
|
// - symb: symbols
|
|
|
|
|
|
|
|
// Format version, stored in the "meta" chunk.
// Versioning is deliberately simple: readIndexFile rejects any file whose
// version differs from this one. Bump the number whenever the format changes
// incompatibly, so that previously stored data is invalidated. Some backward
// compatibility may be added later.
constexpr static uint32_t Version = 3;
|
[clangd] Define a compact binary serialization fomat for symbol slab/index.
Summary:
This is intended to replace the current YAML format for general use.
It's ~10x more compact than YAML, and ~40% more compact than gzipped YAML:
llvmidx.riff = 20M, llvmidx.yaml = 272M, llvmidx.yaml.gz = 32M
It's also simpler/faster to read and write.
The format is a RIFF container (chunks of (type, size, data)) with:
- a compressed string table
- simple binary encoding of symbols (with varints for compactness)
It can be extended to include occurrences, Dex posting lists, etc.
There's no rich backwards-compatibility scheme, but a version number is included
so we can detect incompatible files and do ad-hoc back-compat.
Alternatives considered:
- compressed YAML or JSON: bulky and slow to load
- llvm bitstream: confusing model and libraries are hard to use. My attempt
produced slightly larger files, and the code was longer and slower.
- protobuf or similar: would be really nice (esp for back-compat) but the
dependency is a big hassle
- ad-hoc binary format without a container: it seems clear we're going
to add posting lists and occurrences here, and that they will benefit
from sharing a string table. The container makes it easy to debug
these pieces in isolation, and make them optional.
Reviewers: ioeric
Subscribers: mgorny, ilya-biryukov, MaskRay, jkorous, mgrang, arphaman, kadircet, cfe-commits
Differential Revision: https://reviews.llvm.org/D51585
llvm-svn: 341375
2018-09-05 00:16:50 +08:00
|
|
|
|
|
|
|
// Parses a serialized index file.
// Data must be a RIFF file of type 'CdIx' containing a "meta" chunk with the
// current Version and a "stri" chunk with the string table. A "symb" chunk is
// optional; when present, its symbols are decoded into the result.
// Returns an error if the container, the version, the string table or any
// symbol cannot be read.
Expected<IndexFileIn> readIndexFile(StringRef Data) {
  auto RIFF = riff::readFile(Data);
  if (!RIFF)
    return RIFF.takeError();
  if (RIFF->Type != riff::fourCC("CdIx"))
    return makeError("wrong RIFF type");

  // Index the chunks by ID. If an ID occurs more than once, the first wins.
  StringMap<StringRef> Chunks;
  for (const auto &Chunk : RIFF->Chunks) {
    StringRef ID(Chunk.ID.data(), Chunk.ID.size());
    Chunks.try_emplace(ID, Chunk.Data);
  }

  for (StringRef RequiredChunk : {"meta", "stri"}) {
    if (Chunks.count(RequiredChunk) == 0)
      return makeError("missing required chunk " + RequiredChunk);
  }

  // Only files written with exactly the current version are accepted.
  StringRef Meta = Chunks.lookup("meta");
  if (Meta.size() < 4)
    return makeError("wrong version");
  if (consume32(Meta) != Version)
    return makeError("wrong version");

  auto Strings = readStringTable(Chunks.lookup("stri"));
  if (!Strings)
    return Strings.takeError();

  IndexFileIn Result;
  if (Chunks.count("symb") == 0)
    return std::move(Result);

  // Symbols are stored back to back; decode until the chunk is exhausted.
  StringRef SymbolData = Chunks.lookup("symb");
  SymbolSlab::Builder Symbols;
  while (!SymbolData.empty()) {
    auto Sym = readSymbol(SymbolData, *Strings);
    if (!Sym)
      return Sym.takeError();
    Symbols.insert(*Sym);
  }
  Result.Symbols = std::move(Symbols).build();
  return std::move(Result);
}
|
|
|
|
|
|
|
|
// Serializes an index file to OS as a RIFF container of type 'CdIx' with the
// chunks "meta" (version), "stri" (string table) and "symb" (symbols), in that
// order. Data.Symbols must be set.
raw_ostream &operator<<(raw_ostream &OS, const IndexFileOut &Data) {
  assert(Data.Symbols && "An index file without symbols makes no sense!");
  riff::File Container;
  Container.Type = riff::fourCC("CdIx");

  // meta: just the format version.
  SmallString<4> MetaSection;
  {
    raw_svector_ostream Out(MetaSection);
    write32(Version, Out);
  }
  Container.Chunks.push_back({riff::fourCC("meta"), MetaSection});

  // Work on copies of the symbols: interning rewrites their StringRefs to
  // point at the canonical instance of each string.
  StringTableOut Strings;
  std::vector<Symbol> Symbols;
  for (const auto &Original : *Data.Symbols) {
    Symbols.emplace_back(Original);
    Symbol &Copy = Symbols.back();
    visitStrings(Copy, [&](StringRef &S) { Strings.intern(S); });
  }

  // stri: the table must be finalized before any symbol is written, since
  // writeSymbol looks up string IDs.
  std::string StringSection;
  {
    raw_string_ostream Out(StringSection);
    Strings.finalize(Out);
  }
  Container.Chunks.push_back({riff::fourCC("stri"), StringSection});

  // symb: all symbols, back to back.
  std::string SymbolSection;
  {
    raw_string_ostream Out(SymbolSection);
    for (const auto &Sym : Symbols)
      writeSymbol(Sym, Strings, Out);
  }
  Container.Chunks.push_back({riff::fourCC("symb"), SymbolSection});

  return OS << Container;
}
|
|
|
|
|
|
|
|
} // namespace clangd
|
|
|
|
} // namespace clang
|