llvm-project/clang-tools-extra/clangd/index/Serialization.cpp

//===-- Serialization.cpp - Binary serialization of index data ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "Serialization.h"
#include "Index.h"
#include "RIFF.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"

using namespace llvm;
namespace clang {
namespace clangd {
namespace {
Error makeError(const Twine &Msg) {
  return make_error<StringError>(Msg, inconvertibleErrorCode());
}

// IO PRIMITIVES
// We use little-endian 32 bit ints, sometimes with variable-length encoding.

StringRef consume(StringRef &Data, int N) {
  StringRef Ret = Data.take_front(N);
  Data = Data.drop_front(N);
  return Ret;
}

uint8_t consume8(StringRef &Data) {
  uint8_t Ret = Data.front();
  Data = Data.drop_front();
  return Ret;
}

uint32_t consume32(StringRef &Data) {
  auto Ret = support::endian::read32le(Data.bytes_begin());
  Data = Data.drop_front(4);
  return Ret;
}

void write32(uint32_t I, raw_ostream &OS) {
  char buf[4];
  support::endian::write32le(buf, I);
  OS.write(buf, sizeof(buf));
}

// Variable-length int encoding (varint) uses the bottom 7 bits of each byte
// to encode the number, and the top bit to indicate whether more bytes follow.
// e.g. 9a 2f means [0x1a and keep reading, 0x2f and stop].
// This represents 0x1a | 0x2f<<7 = 6042.
// A 32-bit integer takes 1-5 bytes to encode; small numbers are more compact.
void writeVar(uint32_t I, raw_ostream &OS) {
  constexpr static uint8_t More = 1 << 7;
  if (LLVM_LIKELY(I < 1 << 7)) {
    OS.write(I);
    return;
  }
  for (;;) {
    OS.write(I | More);
    I >>= 7;
    if (I < 1 << 7) {
      OS.write(I);
      return;
    }
  }
}

uint32_t consumeVar(StringRef &Data) {
  constexpr static uint8_t More = 1 << 7;
  uint8_t B = consume8(Data);
  if (LLVM_LIKELY(!(B & More)))
    return B;
  uint32_t Val = B & ~More;
  for (int Shift = 7; B & More && Shift < 32; Shift += 7) {
    B = consume8(Data);
    Val |= (B & ~More) << Shift;
  }
  return Val;
}

// STRING TABLE ENCODING
// Index data has many string fields, and many strings are identical.
// We store each string once, and refer to them by index.
//
// The string table's format is:
//   - UncompressedSize : uint32 (or 0 for no compression)
//   - CompressedData   : byte[CompressedSize]
//
// CompressedData is a zlib-compressed byte[UncompressedSize].
// It contains a sequence of null-terminated strings, e.g. "foo\0bar\0".
// These are sorted to improve compression.

// Maps each string to a canonical representation.
// Strings remain owned externally (e.g. by SymbolSlab).
class StringTableOut {
  DenseSet<StringRef> Unique;
  std::vector<StringRef> Sorted;
  // Since strings are interned, look up can be by pointer.
  DenseMap<std::pair<const char *, size_t>, unsigned> Index;

public:
  StringTableOut() {
    // Ensure there's at least one string in the table.
    // Table size zero is reserved to indicate no compression.
    Unique.insert("");
  }
  // Add a string to the table. Overwrites S if an identical string exists.
  void intern(StringRef &S) { S = *Unique.insert(S).first; };
  // Finalize the table and write it to OS. No more strings may be added.
  void finalize(raw_ostream &OS) {
    Sorted = {Unique.begin(), Unique.end()};
    std::sort(Sorted.begin(), Sorted.end());
    for (unsigned I = 0; I < Sorted.size(); ++I)
      Index.try_emplace({Sorted[I].data(), Sorted[I].size()}, I);

    std::string RawTable;
    for (StringRef S : Sorted) {
      RawTable.append(S);
      RawTable.push_back(0);
    }
    if (zlib::isAvailable()) {
      SmallString<1> Compressed;
      cantFail(zlib::compress(RawTable, Compressed));
      write32(RawTable.size(), OS);
      OS << Compressed;
    } else {
      write32(0, OS); // No compression.
      OS << RawTable;
    }
  }
  // Get the ID of an string, which must be interned. Table must be finalized.
  unsigned index(StringRef S) const {
    assert(!Sorted.empty() && "table not finalized");
    assert(Index.count({S.data(), S.size()}) && "string not interned");
    return Index.find({S.data(), S.size()})->second;
  }
};

struct StringTableIn {
  BumpPtrAllocator Arena;
  std::vector<StringRef> Strings;
};

Expected<StringTableIn> readStringTable(StringRef Data) {
  if (Data.size() < 4)
    return makeError("Bad string table: not enough metadata");
  size_t UncompressedSize = consume32(Data);

  StringRef Uncompressed;
  SmallString<1> UncompressedStorage;
  if (UncompressedSize == 0) // No compression
    Uncompressed = Data;
  else {
    if (Error E =
            llvm::zlib::uncompress(Data, UncompressedStorage, UncompressedSize))
      return std::move(E);
    Uncompressed = UncompressedStorage;
  }

  StringTableIn Table;
  StringSaver Saver(Table.Arena);
  for (StringRef Rest = Uncompressed; !Rest.empty();) {
    auto Len = Rest.find(0);
    if (Len == StringRef::npos)
      return makeError("Bad string table: not null terminated");
    Table.Strings.push_back(Saver.save(consume(Rest, Len)));
    Rest = Rest.drop_front();
  }
  return std::move(Table);
}

// SYMBOL ENCODING
// Each field of clangd::Symbol is encoded in turn (see implementation).
//  - StringRef fields encode as varint (index into the string table)
//  - enums encode as the underlying type
//  - most numbers encode as varint

// It's useful to the implementation to assume symbols have a bounded size.
constexpr size_t SymbolSizeBound = 512;
// To ensure the bounded size, restrict the number of include headers stored.
constexpr unsigned MaxIncludes = 50;

void writeSymbol(const Symbol &Sym, const StringTableOut &Strings,
                 raw_ostream &OS) {
  auto StartOffset = OS.tell();
  OS << Sym.ID.raw(); // TODO: once we start writing xrefs and posting lists,
                      // symbol IDs should probably be in a string table.
  OS.write(static_cast<uint8_t>(Sym.SymInfo.Kind));
  OS.write(static_cast<uint8_t>(Sym.SymInfo.Lang));
  writeVar(Strings.index(Sym.Name), OS);
  writeVar(Strings.index(Sym.Scope), OS);
  for (const auto &Loc : {Sym.Definition, Sym.CanonicalDeclaration}) {
    writeVar(Strings.index(Loc.FileURI), OS);
    for (const auto &Endpoint : {Loc.Start, Loc.End}) {
      writeVar(Endpoint.Line, OS);
      writeVar(Endpoint.Column, OS);
    }
  }
  writeVar(Sym.References, OS);
  OS.write(static_cast<uint8_t>(Sym.Flags));
  OS.write(static_cast<uint8_t>(Sym.Origin));
  writeVar(Strings.index(Sym.Signature), OS);
  writeVar(Strings.index(Sym.CompletionSnippetSuffix), OS);
  writeVar(Strings.index(Sym.Documentation), OS);
  writeVar(Strings.index(Sym.ReturnType), OS);

  auto WriteInclude = [&](const Symbol::IncludeHeaderWithReferences &Include) {
    writeVar(Strings.index(Include.IncludeHeader), OS);
    writeVar(Include.References, OS);
  };
  // There are almost certainly few includes, so we can just write them.
  if (LLVM_LIKELY(Sym.IncludeHeaders.size() <= MaxIncludes)) {
    writeVar(Sym.IncludeHeaders.size(), OS);
    for (const auto &Include : Sym.IncludeHeaders)
      WriteInclude(Include);
  } else {
    // If there are too many, make sure we truncate the least important.
    using Pointer = const Symbol::IncludeHeaderWithReferences *;
    std::vector<Pointer> Pointers;
    for (const auto &Include : Sym.IncludeHeaders)
      Pointers.push_back(&Include);
    std::sort(Pointers.begin(), Pointers.end(), [](Pointer L, Pointer R) {
      return L->References > R->References;
    });
    Pointers.resize(MaxIncludes);

    writeVar(MaxIncludes, OS);
    for (Pointer P : Pointers)
      WriteInclude(*P);
  }

  assert(OS.tell() - StartOffset < SymbolSizeBound && "Symbol length unsafe!");
  (void)StartOffset; // Unused in NDEBUG;
}

Expected<Symbol> readSymbol(StringRef &Data, const StringTableIn &Strings) {
  // Usually we can skip bounds checks because the buffer is huge.
  // Near the end of the buffer, this would be unsafe. In this rare case, copy
  // the data into a bigger buffer so we can again skip the checks.
  if (LLVM_UNLIKELY(Data.size() < SymbolSizeBound)) {
    std::string Buf(Data);
    Buf.resize(SymbolSizeBound);
    StringRef ExtendedData = Buf;
    auto Ret = readSymbol(ExtendedData, Strings);
    unsigned BytesRead = Buf.size() - ExtendedData.size();
    if (BytesRead > Data.size())
      return makeError("read past end of data");
    Data = Data.drop_front(BytesRead);
    return Ret;
  }

#define READ_STRING(Field)                                                     \
  do {                                                                         \
    auto StringIndex = consumeVar(Data);                                       \
    if (LLVM_UNLIKELY(StringIndex >= Strings.Strings.size()))                  \
      return makeError("Bad string index");                                    \
    Field = Strings.Strings[StringIndex];                                      \
  } while (0)

  Symbol Sym;
  Sym.ID = SymbolID::fromRaw(consume(Data, 20));
  Sym.SymInfo.Kind = static_cast<index::SymbolKind>(consume8(Data));
  Sym.SymInfo.Lang = static_cast<index::SymbolLanguage>(consume8(Data));
  READ_STRING(Sym.Name);
  READ_STRING(Sym.Scope);
  for (SymbolLocation *Loc : {&Sym.Definition, &Sym.CanonicalDeclaration}) {
    READ_STRING(Loc->FileURI);
    for (auto &Endpoint : {&Loc->Start, &Loc->End}) {
      Endpoint->Line = consumeVar(Data);
      Endpoint->Column = consumeVar(Data);
    }
  }
  Sym.References = consumeVar(Data);
  Sym.Flags = static_cast<Symbol::SymbolFlag>(consume8(Data));
  Sym.Origin = static_cast<SymbolOrigin>(consume8(Data));
  READ_STRING(Sym.Signature);
  READ_STRING(Sym.CompletionSnippetSuffix);
  READ_STRING(Sym.Documentation);
  READ_STRING(Sym.ReturnType);
  unsigned IncludeHeaderN = consumeVar(Data);
  if (IncludeHeaderN > MaxIncludes)
    return makeError("too many IncludeHeaders");
  Sym.IncludeHeaders.resize(IncludeHeaderN);
  for (auto &I : Sym.IncludeHeaders) {
    READ_STRING(I.IncludeHeader);
    I.References = consumeVar(Data);
  }

#undef READ_STRING
  return std::move(Sym);
}

} // namespace

// FILE ENCODING
// A file is a RIFF chunk with type 'CdIx'.
// It contains the sections:
//   - meta: version number
//   - stri: string table
//   - symb: symbols

// The current versioning scheme is simple - non-current versions are rejected.
// If you make a breaking change, bump this version number to invalidate stored
// data. Later we may want to support some backward compatibility.
constexpr static uint32_t Version = 3;

Expected<IndexFileIn> readIndexFile(StringRef Data) {
  auto RIFF = riff::readFile(Data);
  if (!RIFF)
    return RIFF.takeError();
  if (RIFF->Type != riff::fourCC("CdIx"))
    return makeError("wrong RIFF type");
  StringMap<StringRef> Chunks;
  for (const auto &Chunk : RIFF->Chunks)
    Chunks.try_emplace(StringRef(Chunk.ID.data(), Chunk.ID.size()), Chunk.Data);

  for (StringRef RequiredChunk : {"meta", "stri"})
    if (!Chunks.count(RequiredChunk))
      return makeError("missing required chunk " + RequiredChunk);

  StringRef Meta = Chunks.lookup("meta");
  if (Meta.size() < 4 || consume32(Meta) != Version)
    return makeError("wrong version");

  auto Strings = readStringTable(Chunks.lookup("stri"));
  if (!Strings)
    return Strings.takeError();

  IndexFileIn Result;
  if (Chunks.count("symb")) {
    StringRef SymbolData = Chunks.lookup("symb");
    SymbolSlab::Builder Symbols;
    while (!SymbolData.empty())
      if (auto Sym = readSymbol(SymbolData, *Strings))
        Symbols.insert(*Sym);
      else
        return Sym.takeError();
    Result.Symbols = std::move(Symbols).build();
  }
  return std::move(Result);
}

raw_ostream &operator<<(raw_ostream &OS, const IndexFileOut &Data) {
  assert(Data.Symbols && "An index file without symbols makes no sense!");
  riff::File RIFF;
  RIFF.Type = riff::fourCC("CdIx");

  SmallString<4> Meta;
  {
    raw_svector_ostream MetaOS(Meta);
    write32(Version, MetaOS);
  }
  RIFF.Chunks.push_back({riff::fourCC("meta"), Meta});

  StringTableOut Strings;
  std::vector<Symbol> Symbols;
  for (const auto &Sym : *Data.Symbols) {
    Symbols.emplace_back(Sym);
    visitStrings(Symbols.back(), [&](StringRef &S) { Strings.intern(S); });
  }

  std::string StringSection;
  {
    raw_string_ostream StringOS(StringSection);
    Strings.finalize(StringOS);
  }
  RIFF.Chunks.push_back({riff::fourCC("stri"), StringSection});

  std::string SymbolSection;
  {
    raw_string_ostream SymbolOS(SymbolSection);
    for (const auto &Sym : Symbols)
      writeSymbol(Sym, Strings, SymbolOS);
  }
  RIFF.Chunks.push_back({riff::fourCC("symb"), SymbolSection});

  return OS << RIFF;
}

} // namespace clangd
} // namespace clang