forked from OSchip/llvm-project
[sanitizer] Add Lempel–Ziv–Welch encoder/decoder
It's very simple, fast and efficient for the stack depot compression if used on entire pointers. Reviewed By: morehouse, kstoimenov Differential Revision: https://reviews.llvm.org/D114918
This commit is contained in:
parent
5f1d1854eb
commit
98bb198693
|
@ -155,6 +155,7 @@ set(SANITIZER_IMPL_HEADERS
|
|||
sanitizer_linux.h
|
||||
sanitizer_list.h
|
||||
sanitizer_local_address_space_view.h
|
||||
sanitizer_lzw.h
|
||||
sanitizer_mac.h
|
||||
sanitizer_malloc_mac.inc
|
||||
sanitizer_mutex.h
|
||||
|
|
|
@ -0,0 +1,159 @@
|
|||
//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Lempel–Ziv–Welch encoding/decoding
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef SANITIZER_LZW_H
|
||||
#define SANITIZER_LZW_H
|
||||
|
||||
#include "sanitizer_dense_map.h"
|
||||
|
||||
namespace __sanitizer {
|
||||
|
||||
using LzwCodeType = u32;
|
||||
|
||||
template <class T, class ItIn, class ItOut>
|
||||
ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
|
||||
using Substring =
|
||||
detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
|
||||
|
||||
// Sentinel value for substrings of len 1.
|
||||
static constexpr LzwCodeType kNoPrefix =
|
||||
Min(DenseMapInfo<Substring>::getEmptyKey().first,
|
||||
DenseMapInfo<Substring>::getTombstoneKey().first) -
|
||||
1;
|
||||
DenseMap<Substring, LzwCodeType> prefix_to_code;
|
||||
{
|
||||
// Add all substring of len 1 as initial dictionary.
|
||||
InternalMmapVector<T> dict_len1;
|
||||
for (auto it = begin; it != end; ++it)
|
||||
if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
|
||||
dict_len1.push_back(*it);
|
||||
|
||||
// Slightly helps with later delta encoding.
|
||||
Sort(dict_len1.data(), dict_len1.size());
|
||||
|
||||
// For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
|
||||
// just generate them.
|
||||
*out = dict_len1.size();
|
||||
++out;
|
||||
|
||||
for (uptr i = 0; i != dict_len1.size(); ++i) {
|
||||
// Remap after the Sort.
|
||||
prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
|
||||
*out = dict_len1[i];
|
||||
++out;
|
||||
}
|
||||
CHECK_EQ(prefix_to_code.size(), dict_len1.size());
|
||||
}
|
||||
|
||||
if (begin == end)
|
||||
return out;
|
||||
|
||||
// Main LZW encoding loop.
|
||||
LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
|
||||
++begin;
|
||||
for (auto it = begin; it != end; ++it) {
|
||||
// Extend match with the new item.
|
||||
auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
|
||||
if (ins.second) {
|
||||
// This is a new substring, but emit the code for the current match
|
||||
// (before extend). This allows LZW decoder to recover the dictionary.
|
||||
*out = match;
|
||||
++out;
|
||||
// Reset the match to a single item, which must be already in the map.
|
||||
match = prefix_to_code.find({kNoPrefix, *it})->second;
|
||||
} else {
|
||||
// Already known, use as the current match.
|
||||
match = ins.first->second;
|
||||
}
|
||||
}
|
||||
|
||||
*out = match;
|
||||
++out;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
template <class T, class ItIn, class ItOut>
|
||||
ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
|
||||
if (begin == end)
|
||||
return out;
|
||||
|
||||
// Load dictionary of len 1 substrings. Theses correspont to lowest codes.
|
||||
InternalMmapVector<T> dict_len1(*begin);
|
||||
++begin;
|
||||
|
||||
if (begin == end)
|
||||
return out;
|
||||
|
||||
for (auto& v : dict_len1) {
|
||||
v = *begin;
|
||||
++begin;
|
||||
}
|
||||
|
||||
// Substrings of len 2 and up. Indexes are shifted because [0,
|
||||
// dict_len1.size()) stored in dict_len1. Substings get here after being
|
||||
// emitted to the output, so we can use output position.
|
||||
InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
|
||||
code_to_substr;
|
||||
|
||||
// Copies already emitted substrings into the output again.
|
||||
auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
|
||||
if (code < dict_len1.size()) {
|
||||
*out = dict_len1[code];
|
||||
++out;
|
||||
return out;
|
||||
}
|
||||
const auto& s = code_to_substr[code - dict_len1.size()];
|
||||
|
||||
for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
|
||||
return out;
|
||||
};
|
||||
|
||||
// Returns lens of the substring with the given code.
|
||||
auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
|
||||
if (code < dict_len1.size())
|
||||
return 1;
|
||||
const auto& s = code_to_substr[code - dict_len1.size()];
|
||||
return s.second - s.first;
|
||||
};
|
||||
|
||||
// Main LZW decoding loop.
|
||||
LzwCodeType prev_code = *begin;
|
||||
++begin;
|
||||
out = copy(prev_code, out);
|
||||
for (auto it = begin; it != end; ++it) {
|
||||
LzwCodeType code = *it;
|
||||
auto start = out;
|
||||
if (code == dict_len1.size() + code_to_substr.size()) {
|
||||
// Special LZW case. The code is not in the dictionary yet. This is
|
||||
// possible only when the new substring is the same as previous one plus
|
||||
// the first item of the previous substring. We can emit that in two
|
||||
// steps.
|
||||
out = copy(prev_code, out);
|
||||
*out = *start;
|
||||
++out;
|
||||
} else {
|
||||
out = copy(code, out);
|
||||
}
|
||||
|
||||
// Every time encoded emits the code, it also creates substing of len + 1
|
||||
// including the first item of the just emmited substring. Do the same here.
|
||||
uptr len = code_to_len(prev_code);
|
||||
code_to_substr.push_back({start - len, start + 1});
|
||||
|
||||
prev_code = code;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
} // namespace __sanitizer
|
||||
#endif
|
|
@ -27,6 +27,7 @@ set(SANITIZER_UNITTESTS
|
|||
sanitizer_libc_test.cpp
|
||||
sanitizer_linux_test.cpp
|
||||
sanitizer_list_test.cpp
|
||||
sanitizer_lzw_test.cpp
|
||||
sanitizer_mac_test.cpp
|
||||
sanitizer_mutex_test.cpp
|
||||
sanitizer_nolibc_test.cpp
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
//===-- sanitizer_lzw_test.cpp ----------------------------------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "sanitizer_common/sanitizer_lzw.h"
|
||||
|
||||
#include <iterator>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "sanitizer_hash.h"
|
||||
|
||||
namespace __sanitizer {
|
||||
|
||||
template <typename T>
|
||||
struct LzwTest : public ::testing::Test {
|
||||
template <typename Generator>
|
||||
void Run(size_t n, Generator gen) {
|
||||
std::vector<T> data(n);
|
||||
std::generate(data.begin(), data.end(), gen);
|
||||
|
||||
std::vector<u64> lzw;
|
||||
LzwEncode<T>(data.begin(), data.end(), std::back_inserter(lzw));
|
||||
|
||||
std::vector<T> unlzw(data.size() * 2);
|
||||
auto unlzw_end = LzwDecode<T>(lzw.begin(), lzw.end(), unlzw.data());
|
||||
unlzw.resize(unlzw_end - unlzw.data());
|
||||
|
||||
EXPECT_EQ(data, unlzw);
|
||||
}
|
||||
};
|
||||
|
||||
static constexpr size_t kSizes[] = {0, 1, 2, 7, 13, 32, 129, 10000};
|
||||
|
||||
using LzwTestTypes = ::testing::Types<u8, u16, u32, u64>;
|
||||
TYPED_TEST_SUITE(LzwTest, LzwTestTypes, );
|
||||
|
||||
TYPED_TEST(LzwTest, Same) {
|
||||
MurMur2Hash64Builder h(0);
|
||||
for (size_t sz : kSizes) {
|
||||
u64 v = 0;
|
||||
for (size_t i = 0; i < 100 && !this->HasFailure(); ++i) {
|
||||
this->Run(sz, [&] { return v; });
|
||||
h.add(i);
|
||||
v = h.get();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TYPED_TEST(LzwTest, Increment) {
|
||||
MurMur2Hash64Builder h(0);
|
||||
for (size_t sz : kSizes) {
|
||||
u64 v = 0;
|
||||
for (size_t i = 0; i < 100 && !this->HasFailure(); ++i) {
|
||||
this->Run(sz, [&v] { return v++; });
|
||||
h.add(i);
|
||||
v = h.get();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TYPED_TEST(LzwTest, IncrementMod) {
|
||||
MurMur2Hash64Builder h(0);
|
||||
for (size_t sz : kSizes) {
|
||||
u64 v = 0;
|
||||
for (size_t i = 1; i < 16 && !this->HasFailure(); ++i) {
|
||||
this->Run(sz, [&] { return v++ % i; });
|
||||
h.add(i);
|
||||
v = h.get();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TYPED_TEST(LzwTest, RandomLimited) {
|
||||
for (size_t sz : kSizes) {
|
||||
for (size_t i = 1; i < 1000 && !this->HasFailure(); i *= 2) {
|
||||
u64 v = 0;
|
||||
this->Run(sz, [&] {
|
||||
MurMur2Hash64Builder h(v % i /* Keep unique set limited */);
|
||||
v = h.get();
|
||||
return v;
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace __sanitizer
|
Loading…
Reference in New Issue