size_t GetSizeOfHeadData(bool heapPageData)
{
    if (heapPageData) {
        return SizeOfHeapPageHeaderData;
    } else {
        return SizeOfPageHeaderData;
    }
}

// Some itemids may not be valid, so count only the normal ones.
uint16 HeapPageCalcRealRowCnt(char *buf) {
    HeapPageHeaderData *page = (HeapPageHeaderData *)buf;
    uint16 cnt = 0;
    uint16 i;
    uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData);

    for (i = 0; i < row_cnt; i++) {
        if (ItemIdIsNormal(GET_ITEMID_BY_IDX(buf, i))) {
            cnt++;
        }
    }
    return cnt;
}

void DecompressDeconvertRows(char *buf, char *aux_buf, int16 *real_order, uint16 max_row_len, uint16 real_row_cnt) {
    errno_t ret;
    HeapPageHeaderData *page = (HeapPageHeaderData *)buf;
    uint16 row_cnt = real_row_cnt;
    uint32 total_size = page->pd_special - page->pd_upper;
    char *copy_begin = buf + page->pd_upper;
    char *row;
    uint16 i, j, k, cur, up, row_size;

    ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ);
    securec_check(ret, "", "");

    k = 0;
    for (i = 0; i < max_row_len; i++) {
        for (j = 0; j < row_cnt; j++) {
            up = (j == (row_cnt - 1)) ? page->pd_special : GET_ITEMID_BY_IDX(buf, (real_order[j + 1]))->lp_off;
            cur = GET_ITEMID_BY_IDX(buf, (real_order[j]))->lp_off;
            row_size = up - cur;
            row = aux_buf + cur;
            if (i < row_size) {
                row[i] = copy_begin[k++]; // undo the column-major reshape done at compression time
            }
        }
    }

    if (k != total_size) {
        printf("ERROR!!! pg_deconvert_rows error...!!!\n");
        ASSERT(0);
        return;
    }

    // copy aux_buf back to the page buffer
    ret = memcpy_sp(copy_begin, total_size, aux_buf + page->pd_upper, total_size);
    securec_check(ret, "", "");
    return;
}

// 1: orders the itemids by tuple offset, i.e. in ascending order.
// 2: stores every valid itemid's index.
// 3: the itemids themselves may not be stored in offset order.
void CompressConvertItemRealOrder(char *buf, int16 *real_order, uint16 real_row_cnt) {
    HeapPageHeaderData *page = (HeapPageHeaderData *)buf;
    uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData);
    ItemIdData *begin = (ItemIdData *)(buf + GetPageHeaderSize(page));
    int16 *link_order = real_order + real_row_cnt;

    int16 i, head, curr, prev;
    int16 end = -1; // invalid index

    head = end;
    // itemids are most likely stored in descending offset order; invalid itemids are ignored
    for (i = 0; i < row_cnt; i++) {
        if (!ItemIdIsNormal(begin + i)) {
            continue;
        }

        if (head == end) { // set the head idx, insert the first
            link_order[i] = end;
            head = i;
            continue;
        }

        if ((begin + i)->lp_off < (begin + head)->lp_off) {
            link_order[i] = head; // update the head idx
            head = i;
            continue;
        }

        prev = head;
        curr = link_order[head];
        while ((curr != end) && ((begin + i)->lp_off > (begin + curr)->lp_off)) {
            prev = curr;
            curr = link_order[curr];
        }

        link_order[prev] = i;
        link_order[i] = curr;
    }

    // flatten the linked list into the array
    curr = head;
    for (i = 0; i < real_row_cnt; i++) {
        real_order[i] = curr;
        curr = link_order[curr];
    }

    if (curr != end) {
        printf("ERROR!!! pre_convert_real_order error...!!!\n");
        ASSERT(0);
        return;
    }
}

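/*
 * Illustrative example (hypothetical offsets, not taken from a real page):
 * suppose three normal itemids with lp_off values
 *     idx 0 -> 300, idx 1 -> 100, idx 2 -> 200.
 * The insertion sort by offset builds the linked list 1 -> 2 -> 0, so the
 * flattened result is real_order = {1, 2, 0}: real_order[0] is the index of
 * the itemid whose tuple sits lowest on the page. link_order occupies the
 * second half of the same allocation, as sketched in CompressConvertCheck().
 */
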
int DecompressPage(const char* src, char* dst, uint8 algorithm)
{
    if (PageIs8BXidHeapVersion(src)) {
        return TemplateDecompressPage<true>(src, dst, algorithm);
    } else {
        return TemplateDecompressPage<false>(src, dst, algorithm);
    }
}

void cprs_diff_deconvert_rows(char *buf, uint32 offset, uint16 min_row_len, uint16 real_row_cnt) {
    uint16 row_cnt = real_row_cnt;
    uint32 common_size = min_row_len;
    uint8 *copy_begin = (uint8 *)(buf + offset);
    uint16 i, j;

    for (i = 0; i < common_size; i++) {
        for (j = 1; j < row_cnt; j++) {
            copy_begin[i * row_cnt + j] += copy_begin[i * row_cnt + (j - 1)];
        }
    }
    return;
}

// Checks that all row sizes differ by at most MIN_DIFF_SIZE bytes.
bool CompressConvertCheck(char *buf, int16 **real_order, uint16 *max_row_len, uint16 *min_row_len, uint16 *real_row_cnt) {
    HeapPageHeaderData *page = (HeapPageHeaderData *)buf;
    uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData);
    int16 i, row_size;
    ItemIdData *ptr = NULL;
    uint16 up = page->pd_special;
    uint16 min_size = GS_INVALID_ID16;
    uint16 max_size = 0;
    errno_t ret;
    if (page->pd_lower < GetPageHeaderSize(page) || (page->pd_lower > page->pd_upper)) {
        return false;
    }

    uint16 normal_row_cnt = HeapPageCalcRealRowCnt(buf);
    if (normal_row_cnt < MIN_CONVERT_CNT) { // no need to convert
        return false;
    }

    // allocate a buffer to store the real tuple order:
    /*
        --------------------------|--------------------------
        xxxxxxxxxxxxxxxxxxxxxxxxxx|xxxxxxxxxxxxxxxxxxxxxxxxxx
        --------------------------|--------------------------
    */
    // the first half holds the real order array, the second half holds the links.
    *real_order = (int16 *)malloc(sizeof(uint16) * row_cnt * 2);
    if (*real_order == NULL) {
        printf("ERROR!!! malloc for real_order failed...!!!\n");
        return false;
    }
    ret = memset_sp(*real_order, sizeof(uint16) * row_cnt * 2, 0, sizeof(uint16) * row_cnt * 2);
    securec_check(ret, "", "");

    // order the ItemIds by tuple offset.
    CompressConvertItemRealOrder(buf, *real_order, normal_row_cnt);

    // do the check: inspect the sizes of all tuples.
    for (i = normal_row_cnt - 1; i >= 0; i--) {
        ptr = GET_ITEMID_BY_IDX(buf, ((*real_order)[i]));

        row_size = up - ptr->lp_off;
        if (row_size < MIN_CONVERT_CNT * 2) {
            return false;
        }

        min_size = (row_size < min_size) ? row_size : min_size;
        max_size = (row_size > max_size) ? row_size : max_size;

        if ((max_size - min_size) > MIN_DIFF_SIZE) { // no need to convert
            return false;
        }
        up = ptr->lp_off;
    }

    // record the common row sizes.
    *max_row_len = max_size;
    *min_row_len = min_size;
    *real_row_cnt = normal_row_cnt;
    return true;
}

void DecompressDeconvertItemIds(char *buf, char *aux_buf) {
    errno_t ret;
    HeapPageHeaderData *page = (HeapPageHeaderData *)buf;
    uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData);
    uint32 total_size = row_cnt * sizeof(ItemIdData);
    char *copy_begin = buf + GetPageHeaderSize(page);
    uint16 i, j, k;

    // clear aux_buf
    ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ);
    securec_check(ret, "", "");

    k = 0;
    for (i = 0; i < sizeof(ItemIdData); i++) {
        for (j = 0; j < row_cnt; j++) {
            aux_buf[j * sizeof(ItemIdData) + i] = copy_begin[k++];
        }
    }

    // copy aux_buf back to the page buffer
    ret = memcpy_sp(copy_begin, total_size, aux_buf, total_size);
    securec_check(ret, "", "");
    return;
}

void DecompressDeconvertOnePage(char *buf, char *aux_buf, bool diff_convert) {
    uint16 max_row_len = 0;
    uint16 min_row_len = 0;
    int16 *real_order = NULL; // itemids are sometimes out of order; we must find the real order
    uint16 real_row_cnt = 0;

    if (diff_convert) {
        cprs_diff_deconvert_rows(buf, GetPageHeaderSize(buf), sizeof(ItemIdData),
            (((HeapPageHeaderData *)buf)->pd_lower - GetPageHeaderSize(buf)) / sizeof(ItemIdData));
    }

    // ======= first, rearrange the itemids.
    DecompressDeconvertItemIds(buf, aux_buf);

    if (!CompressConvertCheck(buf, &real_order, &max_row_len, &min_row_len, &real_row_cnt)) {
        if (real_order != NULL) {
            free(real_order);
        }
        ASSERT(0);
        return;
    }

    // ======= and last, the tuples
    if (diff_convert) {
        cprs_diff_deconvert_rows(buf, ((HeapPageHeaderData *)buf)->pd_upper, min_row_len, real_row_cnt);
    }
    DecompressDeconvertRows(buf, aux_buf, real_order, max_row_len, real_row_cnt);

    if (real_order != NULL) {
        free(real_order);
    }
    return;
}

void DecompressPageDeconvert(char *src, bool diff_convert)
{
    char *aux_buf = NULL;
    errno_t rc;

    aux_buf = (char *)malloc(BLCKSZ);
    if (aux_buf == NULL) {
        // add log
        return;
    }
    rc = memset_s(aux_buf, BLCKSZ, 0, BLCKSZ);
    securec_check(rc, "", "");

    // do convert
    DecompressDeconvertOnePage(src, aux_buf, diff_convert);

    if (aux_buf != NULL) {
        free(aux_buf);
    }
}

/**
 * DecompressPage() -- Decompress one compressed page.
 *  return the size of the decompressed page, which should be BLCKSZ, or
 *         -1 for a decompression error
 *         -2 for an unrecognized compression algorithm
 *
 *  note: The size of dst must be greater than or equal to BLCKSZ.
 */
template<bool heapPageData>
int TemplateDecompressPage(const char* src, char* dst, uint8 algorithm)
{
    int decompressed_size;
    char* data;
    uint32 size;
    bool byte_convert, diff_convert;
    size_t sizeOfPageHeaderData = GetSizeOfHeadData(heapPageData);
    int rc = memcpy_s(dst, sizeOfPageHeaderData, src, sizeOfPageHeaderData);
    securec_check(rc, "", "");

    if (heapPageData) {
        data = ((HeapPageCompressData*) src)->data;
        size = ((HeapPageCompressData*) src)->size;
        byte_convert = ((HeapPageCompressData*) src)->byte_convert;
        diff_convert = ((HeapPageCompressData*) src)->diff_convert;
    } else {
        data = ((PageCompressData*) src)->data;
        size = ((PageCompressData*) src)->size;
        byte_convert = ((PageCompressData*) src)->byte_convert;
        diff_convert = ((PageCompressData*) src)->diff_convert;
    }

    switch (algorithm) {
        case COMPRESS_ALGORITHM_PGLZ:
            decompressed_size = lz_decompress(
                data, size, dst + sizeOfPageHeaderData, BLCKSZ - sizeOfPageHeaderData, false);
            break;
        case COMPRESS_ALGORITHM_ZSTD:
            decompressed_size =
                ZSTD_decompress(dst + sizeOfPageHeaderData, BLCKSZ - sizeOfPageHeaderData, data, size);

            if (ZSTD_isError(decompressed_size)) {
                return -1;
            }

            break;
        default:
            return COMPRESS_UNSUPPORTED_ERROR;
            break;
    }

    if (byte_convert) {
        // deconvert dst
        DecompressPageDeconvert(dst, diff_convert);
    }

    return sizeOfPageHeaderData + decompressed_size;
}

// pg_lz
/* ----------
 * pg_lzcompress.c -
 *
 *      This is an implementation of LZ compression for PostgreSQL.
 *      It uses a simple history table and generates 2-3 byte tags
 *      capable of backward copy information for 3-273 bytes with
 *      a max offset of 4095.
 *
 *      Entry routines:
 *
 *          bool
 *          pglz_compress(const char *source, int32 slen, PGLZ_Header *dest,
 *                        const PGLZ_Strategy *strategy);
 *
 *              source is the input data to be compressed.
 *
 *              slen is the length of the input data.
 *
 *              dest is the output area for the compressed result.
 *                  It must be at least as big as PGLZ_MAX_OUTPUT(slen).
 *
 *              strategy is a pointer to some information controlling
 *                  the compression algorithm. If NULL, the compiled-in
 *                  default strategy is used.
 *
 *              The return value is TRUE if compression succeeded,
 *              FALSE if not; in the latter case the contents of dest
 *              are undefined.
 *
 *          void
 *          pglz_decompress(const PGLZ_Header *source, char *dest)
 *
 *              source is the compressed input.
 *
 *              dest is the area where the uncompressed data will be
 *                  written to. It is the caller's responsibility to
 *                  provide enough space. The required amount can be
 *                  obtained with the macro PGLZ_RAW_SIZE(source).
 *
 *              The data is written to dest exactly as it was handed
 *              to pglz_compress(). No terminating zero byte is added.
 *
 *      The decompression algorithm and internal data format:
 *
 *          PGLZ_Header is defined as
 *
 *              typedef struct PGLZ_Header {
 *                  int32 vl_len_;
 *                  int32 rawsize;
 *              }
 *
 *          The header is followed by the compressed data itself.
 *
 *          The data representation is easiest explained by describing
 *          the process of decompression.
 *
 *          If VARSIZE(x) == rawsize + sizeof(PGLZ_Header), then the data
 *          is stored uncompressed as plain bytes. Thus, the decompressor
 *          simply copies rawsize bytes from the location after the
 *          header to the destination.
 *
 *          Otherwise the first byte after the header tells what to do
 *          the next 8 times. We call this the control byte.
 *
 *          An unset bit in the control byte means that one uncompressed
 *          byte follows, which is copied from input to output.
 *
 *          A set bit in the control byte means that a tag of 2-3 bytes
 *          follows. A tag contains information to copy some bytes, that
 *          are already in the output buffer, to the current location in
 *          the output. Let's call the three tag bytes T1, T2 and T3. The
 *          position of the data to copy is coded as an offset from the
 *          actual output position.
 *
 *          The offset is in the upper nibble of T1 and in T2.
 *          The length is in the lower nibble of T1.
 *
 *          So the 16 bits of a 2 byte tag are coded as
 *
 *              7---T1--0  7---T2--0
 *              OOOO LLLL  OOOO OOOO
 *
 *          This limits the offset to 1-4095 (12 bits) and the length
 *          to 3-18 (4 bits) because 3 is always added to it. To emit
 *          a tag of 2 bytes with a length of 2 only saves one control
 *          bit. But we lose one byte in the possible length of a tag.
 *
 *          In the actual implementation, the 2 byte tag's length is
 *          limited to 3-17, because the value 0xF in the length nibble
 *          has special meaning. It means that the next following
 *          byte (T3) has to be added to the length value of 18. That
 *          makes total limits of 1-4095 for offset and 3-273 for length.
 *
 *          Now that we have successfully decoded a tag, we simply copy
 *          the output that occurred <offset> bytes back to the current
 *          output location in the specified <length>. Thus, a
 *          sequence of 200 spaces (think about bpchar fields) could be
 *          coded in 4 bytes. One literal space and a three byte tag to
 *          copy 199 bytes with a -1 offset. Whow - that's a compression
 *          rate of 98%! Well, the implementation needs to save the
 *          original data size too, so we need another 4 bytes for it
 *          and end up with a total compression rate of 96%, what's still
 *          worth a Whow.
 *
 *      The compression algorithm
 *
 *          The following uses numbers used in the default strategy.
 *
 *          The compressor works best for attributes of a size between
 *          1K and 1M. For smaller items there's not that much chance of
 *          redundancy in the character sequence (except for large areas
 *          of identical bytes like trailing spaces) and for bigger ones
 *          our 4K maximum look-back distance is too small.
 *
 *          The compressor creates a table for 8192 lists of positions.
 *          For each input position (except the last 3), a hash key is
 *          built from the 4 next input bytes and the position remembered
 *          in the appropriate list. Thus, the table points to linked
 *          lists of strings that are likely to match in at least their
 *          first 4 characters. This is done on the fly while the input
 *          is compressed into the output area. Table entries are only
 *          kept for the last 4096 input positions, since we cannot use
 *          back-pointers larger than that anyway.
 *
 *          For each byte in the input, its hash key (built from this
 *          byte and the next 3) is used to find the appropriate list
 *          in the table. The lists remember the positions of all bytes
 *          that had the same hash key in the past in increasing backward
 *          offset order. Now for all entries in the used lists, the
 *          match length is computed by comparing the characters from the
 *          entries position with the characters from the actual input
 *          position.
 *
 *          The compressor starts with a so called "good_match" of 128.
 *          It is a "prefer speed against compression ratio" optimizer.
 *          So if the first entry looked at already has 128 or more
 *          matching characters, the lookup stops and that position is
 *          used for the next tag in the output.
 *
 *          For each subsequent entry in the history list, the "good_match"
 *          is lowered by 10%. So the compressor will be more happy with
 *          short matches the farther it has to go back in the history.
 *          Another "speed against ratio" preference characteristic of
 *          the algorithm.
 *
 *          Thus there are 3 stop conditions for the lookup of matches:
 *
 *              - a match >= good_match is found
 *              - there are no more history entries to look at
 *              - the next history entry is already too far back
 *                to be coded into a tag.
 *
 *          Finally the match algorithm checks that at least a match
 *          of 3 or more bytes has been found, because that's the smallest
 *          amount of copy information to code into a tag. If so, a tag
 *          is emitted and all the input bytes covered by that are just
 *          scanned for the history add's, otherwise a literal character
 *          is emitted and only its history entry added.
 *
 *      Acknowledgements:
 *
 *          Many thanks to Adisak Pochanayon, whose article about SLZ
 *          inspired me to write the PostgreSQL compression this way.
 *
 *                  Jan Wieck
 *
 * Copyright (c) 1999-2012, PostgreSQL Global Development Group
 *
 * src/backend/utils/adt/pg_lzcompress.c
 * ----------
 */
#include "postgres.h"
|
|
#include "knl/knl_variable.h"
|
|
|
|
#include <limits.h>
|
|
|
|
#include "utils/pg_lzcompress.h"
|
|
|
|
/* ----------
|
|
* The provided standard strategies
|
|
* ----------
|
|
*/
|
|
static const PGLZ_Strategy strategy_default_data = {
    32,      /* Data chunks less than 32 bytes are not compressed */
    INT_MAX, /* No upper limit on what we'll try to compress */
    25,      /* Require 25% compression rate, or not worth it */
    1024,    /* Give up if no compression in the first 1KB */
    128,     /* Stop history lookup if a match of 128 bytes is found */
    10       /* Lower good match size by 10% at every loop iteration */
};
const PGLZ_Strategy* const PGLZ_strategy_default = &strategy_default_data;

static const PGLZ_Strategy strategy_always_data = {
    0,       /* Chunks of any size are compressed */
    INT_MAX,
    0,       /* It's enough to save one single byte */
    INT_MAX, /* Never give up early */
    128,     /* Stop history lookup if a match of 128 bytes is found */
    6        /* Look harder for a good match */
};
const PGLZ_Strategy* const PGLZ_strategy_always = &strategy_always_data;

/* ----------
 * pglz_hist_idx -
 *
 *      Computes the history table slot for the lookup by the next 4
 *      characters in the input.
 *
 * NB: because we use the next 4 characters, we are not guaranteed to
 * find 3-character matches; they very possibly will be in the wrong
 * hash list. This seems an acceptable tradeoff for spreading out the
 * hash keys more.
 * ----------
 */
#define pglz_hist_idx(_s, _e)                                                                    \
    (((((_e) - (_s)) < 4) ? (int)(_s)[0]                                                         \
                          : (((unsigned char)((_s)[0]) << 9) ^ ((unsigned char)((_s)[1]) << 6) ^ \
                             ((unsigned char)((_s)[2]) << 3) ^ (unsigned char)((_s)[3]))) &      \
     (PGLZ_HISTORY_MASK))

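/*
 * Worked example (illustrative only): for the input bytes 'a' 'b' 'c' 'd'
 * (0x61 0x62 0x63 0x64), the macro computes
 *     (0x61 << 9) ^ (0x62 << 6) ^ (0x63 << 3) ^ 0x64
 *   = 0xC200 ^ 0x1880 ^ 0x0318 ^ 0x0064 = 0xD9FC,
 * then masks it with PGLZ_HISTORY_MASK to select the history list.
 * When fewer than 4 bytes remain, only the first byte is used as the key.
 */
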
/* ----------
 * pglz_hist_add -
 *
 *      Adds a new entry to the history table.
 *
 * If _recycle is true, then we are recycling a previously used entry,
 * and must first delink it from its old hashcode's linked list.
 *
 * NOTE: beware of multiple evaluations of macro's arguments, and note that
 * _hn and _recycle are modified in the macro.
 * ----------
 */
#define pglz_hist_add(_hs, _he, _hn, _recycle, _s, _e) \
    do {                                               \
        int __hindex = pglz_hist_idx((_s), (_e));      \
        PGLZ_HistEntry** __myhsp = &(_hs)[__hindex];   \
        PGLZ_HistEntry* __myhe = &(_he)[_hn];          \
        if (_recycle) {                                \
            if (__myhe->prev == NULL)                  \
                (_hs)[__myhe->hindex] = __myhe->next;  \
            else                                       \
                __myhe->prev->next = __myhe->next;     \
            if (__myhe->next != NULL)                  \
                __myhe->next->prev = __myhe->prev;     \
        }                                              \
        __myhe->next = *__myhsp;                       \
        __myhe->prev = NULL;                           \
        __myhe->hindex = __hindex;                     \
        __myhe->pos = (_s);                            \
        if (*__myhsp != NULL)                          \
            (*__myhsp)->prev = __myhe;                 \
        *__myhsp = __myhe;                             \
        if (++(_hn) >= PGLZ_HISTORY_SIZE) {            \
            (_hn) = 0;                                 \
            (_recycle) = true;                         \
        }                                              \
    } while (0)

/* ----------
 * pglz_out_ctrl -
 *
 *      Outputs the last and allocates a new control byte if needed.
 * ----------
 */
#define pglz_out_ctrl(__ctrlp, __ctrlb, __ctrl, __buf) \
    do {                                               \
        if ((((unsigned char)(__ctrl)) & 0xff) == 0) { \
            *(__ctrlp) = __ctrlb;                      \
            __ctrlp = (__buf)++;                       \
            __ctrlb = 0;                               \
            __ctrl = 1;                                \
        }                                              \
    } while (0)

/* ----------
 * pglz_out_literal -
 *
 *      Outputs a literal byte to the destination buffer including the
 *      appropriate control bit.
 * ----------
 */
#define pglz_out_literal(_ctrlp, _ctrlb, _ctrl, _buf, _byte) \
    do {                                                     \
        pglz_out_ctrl(_ctrlp, _ctrlb, _ctrl, _buf);          \
        *(_buf)++ = (unsigned char)(_byte);                  \
        (_ctrl) <<= 1;                                       \
    } while (0)

/* ----------
 * pglz_out_tag -
 *
 *      Outputs a backward reference tag of 2-4 bytes (depending on
 *      offset and length) to the destination buffer including the
 *      appropriate control bit.
 * ----------
 */
#define pglz_out_tag(_ctrlp, _ctrlb, _ctrl, _buf, _len, _off)                                \
    do {                                                                                     \
        pglz_out_ctrl(_ctrlp, _ctrlb, _ctrl, _buf);                                          \
        (_ctrlb) |= (_ctrl);                                                                 \
        (_ctrl) <<= 1;                                                                       \
        if ((_len) > 17) {                                                                   \
            (_buf)[0] = (unsigned char)((((uint32)(_off)&0xf00) >> 4) | 0x0f);               \
            (_buf)[1] = (unsigned char)(((uint32)(_off)&0xff));                              \
            (_buf)[2] = (unsigned char)((_len)-18);                                          \
            (_buf) += 3;                                                                     \
        } else {                                                                             \
            (_buf)[0] = (unsigned char)((((uint32)(_off)&0xf00) >> 4) | ((uint32)(_len)-3)); \
            (_buf)[1] = (unsigned char)((uint32)(_off)&0xff);                                \
            (_buf) += 2;                                                                     \
        }                                                                                    \
    } while (0)

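/*
 * Worked example (illustrative only): a match of length 5 at offset 18
 * (0x012) fits the short form, so pglz_out_tag emits two bytes:
 *     T1 = ((0x012 & 0xf00) >> 4) | (5 - 3) = 0x02
 *     T2 =  (0x012 & 0xff)                  = 0x12
 * A match of length 20 needs the long form: the length nibble becomes 0x0f
 * and a third byte T3 = 20 - 18 = 2 carries the remainder.
 */
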
#define HIST_START_LEN (sizeof(PGLZ_HistEntry*) * PGLZ_HISTORY_LISTS)
#define HIST_ENTRIES_LEN (sizeof(PGLZ_HistEntry) * PGLZ_HISTORY_SIZE)

#define PGLZ_MAX_HISTORY_LISTS 8192 /* must be power of 2 */
static PGLZ_HistEntry* hist_start[PGLZ_MAX_HISTORY_LISTS];
static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE + 1];

/* ----------
 * pglz_find_match -
 *
 *      Lookup the history table if the actual input stream matches
 *      another sequence of characters, starting somewhere earlier
 *      in the input buffer.
 * ----------
 */
static inline int pglz_find_match(
    PGLZ_HistEntry** hstart, const char* input, const char* end, int* lenp, int* offp, int good_match, int good_drop)
{
    PGLZ_HistEntry* hent = NULL;
    int32 len = 0;
    int32 off = 0;

    /*
     * Traverse the linked history list until a good enough match is found.
     */
    hent = hstart[pglz_hist_idx(input, end)];
    while (hent != NULL) {
        const char* ip = input;
        const char* hp = hent->pos;
        int32 thisoff;
        int32 thislen;

        /*
         * Stop if the offset does not fit into our tag anymore.
         */
        thisoff = ip - hp;
        if (thisoff >= 0x0fff)
            break;

        /*
         * Determine length of match. A better match must be larger than the
         * best so far. And if we already have a match of 16 or more bytes,
         * it's worth the call overhead to use memcmp() to check if this match
         * is equal for the same size. After that we must fall back to
         * character by character comparison to know the exact position where
         * the diff occurred.
         */
        thislen = 0;
        if (len >= 16) {
            if (memcmp(ip, hp, len) == 0) {
                thislen = len;
                ip += len;
                hp += len;
                while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH) {
                    thislen++;
                    ip++;
                    hp++;
                }
            }
        } else {
            while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH) {
                thislen++;
                ip++;
                hp++;
            }
        }

        /*
         * Remember this match as the best (if it is)
         */
        if (thislen > len) {
            len = thislen;
            off = thisoff;
        }

        /*
         * Advance to the next history entry
         */
        hent = hent->next;

        /*
         * Be happy with lesser good matches the more entries we visited. But
         * no point in doing the calculation if we're at the end of the list.
         */
        if (hent != NULL) {
            if (len >= good_match)
                break;
            good_match -= (good_match * good_drop) / 100;
        }
    }

    /*
     * Return match information only if it results in at least one byte of
     * reduction.
     */
    if (len > 2) {
        *lenp = len;
        *offp = off;
        return 1;
    }

    return 0;
}

/* ----------
 * pglz_compress -
 *
 *      Compresses source into dest using strategy.
 * ----------
 */
bool pglz_compress(const char* source, int32 slen, PGLZ_Header* dest, const PGLZ_Strategy* strategy)
{
    unsigned char* bp = ((unsigned char*)dest) + sizeof(PGLZ_Header);
    unsigned char* bstart = bp;
    int hist_next = 0;
    bool hist_recycle = false;
    const char* dp = source;
    const char* dend = source + slen;
    unsigned char ctrl_dummy = 0;
    unsigned char* ctrlp = &ctrl_dummy;
    unsigned char ctrlb = 0;
    unsigned char ctrl = 0;
    bool found_match = false;
    int32 match_len;
    int32 match_off;
    int32 good_match;
    int32 good_drop;
    int32 result_size;
    int32 result_max;
    int32 need_rate;

    /*
     * Our fallback strategy is the default.
     */
    if (strategy == NULL)
        strategy = PGLZ_strategy_default;

    /*
     * If the strategy forbids compression (at all or if source chunk size out
     * of range), fail.
     */
    if (strategy->match_size_good <= 0 || slen < strategy->min_input_size || slen > strategy->max_input_size)
        return false;

    /*
     * Save the original source size in the header.
     */
    dest->rawsize = slen;

    /*
     * Limit the match parameters to the supported range.
     */
    good_match = strategy->match_size_good;
    if (good_match > PGLZ_MAX_MATCH)
        good_match = PGLZ_MAX_MATCH;
    else if (good_match < 17)
        good_match = 17;

    good_drop = strategy->match_size_drop;
    if (good_drop < 0)
        good_drop = 0;
    else if (good_drop > 100)
        good_drop = 100;

    need_rate = strategy->min_comp_rate;
    if (need_rate < 0)
        need_rate = 0;
    else if (need_rate > 99)
        need_rate = 99;

    /*
     * Compute the maximum result size allowed by the strategy, namely the
     * input size minus the minimum wanted compression rate. This had better
     * be <= slen, else we might overrun the provided output buffer.
     */
    if (slen > (INT_MAX / 100)) {
        /* Approximate to avoid overflow */
        result_max = (slen / 100) * (100 - need_rate);
    } else
        result_max = (slen * (100 - need_rate)) / 100;

    /*
     * Initialize the history lists to empty. We do not need to zero the
     * hist_entries[] array; its entries are initialized as they are used.
     */
    errno_t rc = memset_s(hist_start, HIST_START_LEN, 0, HIST_START_LEN);
    securec_check(rc, "\0", "\0");

    /*
     * Compress the source directly into the output buffer.
     */
    while (dp < dend) {
        /*
         * If we already exceeded the maximum result size, fail.
         *
         * We check once per loop; since the loop body could emit as many as 4
         * bytes (a control byte and 3-byte tag), PGLZ_MAX_OUTPUT() had better
         * allow 4 slop bytes.
         */
        if (bp - bstart >= result_max)
            return false;

        /*
         * If we've emitted more than first_success_by bytes without finding
         * anything compressible at all, fail. This lets us fall out
         * reasonably quickly when looking at incompressible input (such as
         * pre-compressed data).
         */
        if (!found_match && bp - bstart >= strategy->first_success_by)
            return false;

        /*
         * Try to find a match in the history
         */
        if (pglz_find_match(hist_start, dp, dend, &match_len, &match_off, good_match, good_drop)) {
            /*
             * Create the tag and add history entries for all matched
             * characters.
             */
            pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off);
            while (match_len--) {
                pglz_hist_add(
                    hist_start, hist_entries, hist_next, hist_recycle, dp, dend);
                dp++; /* Do not do this ++ in the line above! */
                /* The macro would do it four times - Jan. */
            }
            found_match = true;
        } else {
            /*
             * No match found. Copy one literal byte.
             */
            pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
            pglz_hist_add(
                hist_start, hist_entries, hist_next, hist_recycle, dp, dend);
            dp++; /* Do not do this ++ in the line above! */
            /* The macro would do it four times - Jan. */
        }
    }

    /*
     * Write out the last control byte and check that we haven't overrun the
     * output size allowed by the strategy.
     */
    *ctrlp = ctrlb;
    result_size = bp - bstart;
    if (result_size >= result_max)
        return false;

    /*
     * Success - need only fill in the actual length of the compressed datum.
     */
    SET_VARSIZE_COMPRESSED(dest, result_size + sizeof(PGLZ_Header));

    return true;
}

/* ----------
 * lz_compress -
 *
 *      Compresses source into dest using strategy. Returns the number of
 *      bytes written in buffer dest, or -1 if compression fails.
 * ----------
 */
int32 lz_compress(const char* source, int32 slen, char* dest)
{
    unsigned char* bp = (unsigned char*) dest;
    unsigned char* bstart = bp;
    int hist_next = 0;
    bool hist_recycle = false;
    const char* dp = source;
    const char* dend = source + slen;
    unsigned char ctrl_dummy = 0;
    unsigned char* ctrlp = &ctrl_dummy;
    unsigned char ctrlb = 0;
    unsigned char ctrl = 0;
    bool found_match = false;
    int32 match_len;
    int32 match_off;
    int32 good_match;
    int32 good_drop;
    int32 result_size;
    int32 result_max;
    int32 need_rate;
    errno_t rc;

    const PGLZ_Strategy* strategy = PGLZ_strategy_always;
    /*
     * Our fallback strategy is the default.
     */
    if (strategy == NULL) {
        strategy = PGLZ_strategy_default;
    }

    /*
     * If the strategy forbids compression (at all or if source chunk size out
     * of range), fail.
     */
    if (strategy->match_size_good <= 0 || slen < strategy->min_input_size || slen > strategy->max_input_size) {
        return -1;
    }

    /*
     * Limit the match parameters to the supported range.
     */
    good_match = strategy->match_size_good;
    if (good_match > PGLZ_MAX_MATCH) {
        good_match = PGLZ_MAX_MATCH;
    } else if (good_match < 17) {
        good_match = 17;
    }

    good_drop = strategy->match_size_drop;
    if (good_drop < 0) {
        good_drop = 0;
    } else if (good_drop > 100) {
        good_drop = 100;
    }

    need_rate = strategy->min_comp_rate;
    if (need_rate < 0) {
        need_rate = 0;
    } else if (need_rate > 99) {
        need_rate = 99;
    }

    /*
     * Compute the maximum result size allowed by the strategy, namely the
     * input size minus the minimum wanted compression rate. This had better
     * be <= slen, else we might overrun the provided output buffer.
     */
    if (slen > (INT_MAX / 100)) {
        /* Approximate to avoid overflow */
        result_max = (slen / 100) * (100 - need_rate);
    } else {
        result_max = (slen * (100 - need_rate)) / 100;
    }

    /*
     * Initialize the history lists to empty. We do not need to zero the
     * hist_entries[] array; its entries are initialized as they are used.
     */
    rc = memset_s(hist_start, HIST_START_LEN, 0, HIST_START_LEN);
    securec_check(rc, "\0", "\0");

    /*
     * Compress the source directly into the output buffer.
     */
    while (dp < dend) {
        /*
         * If we already exceeded the maximum result size, fail.
         *
         * We check once per loop; since the loop body could emit as many as 4
         * bytes (a control byte and 3-byte tag), PGLZ_MAX_OUTPUT() had better
         * allow 4 slop bytes.
         */
        if (bp - bstart >= result_max) {
            return -1;
        }

        /*
         * If we've emitted more than first_success_by bytes without finding
         * anything compressible at all, fail. This lets us fall out
         * reasonably quickly when looking at incompressible input (such as
         * pre-compressed data).
         */
        if (!found_match && bp - bstart >= strategy->first_success_by) {
            return -1;
        }

        /*
         * Try to find a match in the history
         */
        if (pglz_find_match(hist_start, dp, dend, &match_len, &match_off, good_match, good_drop)) {
            /*
             * Create the tag and add history entries for all matched
             * characters.
             */
            pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off);
            while (match_len--) {
                pglz_hist_add(
                    hist_start, hist_entries, hist_next, hist_recycle, dp,
                    dend);
                dp++; /* Do not do this ++ in the line above! */
                /* The macro would do it four times - Jan. */
            }
            found_match = true;
        } else {
            /*
             * No match found. Copy one literal byte.
             */
            pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
            pglz_hist_add(
                hist_start, hist_entries, hist_next, hist_recycle, dp, dend);
            dp++; /* Do not do this ++ in the line above! */
            /* The macro would do it four times - Jan. */
        }
    }

    /*
     * Write out the last control byte and check that we haven't overrun the
     * output size allowed by the strategy.
     */
    *ctrlp = ctrlb;
    result_size = bp - bstart;
    if (result_size >= result_max) {
        return -1;
    }

    /* success */
    return result_size;
}

/* ----------
 * lz_decompress -
 *
 *      Decompresses source into dest. Returns the number of bytes
 *      decompressed into the destination buffer, and *optionally*
 *      checks that both the source and dest buffers have been
 *      fully read and written to, respectively.
 * ----------
 */
int32 lz_decompress(const char* source, int32 slen, char* dest, int32 rawsize, bool check_complete)
{
    const unsigned char* sp;
    const unsigned char* srcend;
    unsigned char* dp;
    unsigned char* destend;
    errno_t rc = 0;

    sp = (const unsigned char*) source;
    srcend = ((const unsigned char*) source) + slen;
    dp = (unsigned char*) dest;
    destend = dp + rawsize;

    while (sp < srcend && dp < destend) {
        /*
         * Read one control byte and process the next 8 items (or as many as
         * remain in the compressed input).
         */
        unsigned char ctrl = *sp++;
        int ctrlc;

        for (ctrlc = 0; ctrlc < 8 && sp < srcend && dp < destend; ctrlc++) {
            if (ctrl & 1) {
                /*
                 * Set control bit means we must read a match tag. The match
                 * is coded with two bytes. First byte uses lower nibble to
                 * code length - 3. Higher nibble contains upper 4 bits of the
                 * offset. The next following byte contains the lower 8 bits
                 * of the offset. If the length is coded as 18, another
                 * extension tag byte tells how much longer the match really
                 * was (0-255).
                 */
                int32 len;
                int32 off;

                len = (sp[0] & 0x0f) + 3;
                off = ((sp[0] & 0xf0) << 4) | sp[1];
                sp += 2;
                if (len == 18) {
                    len += *sp++;
                }

                /*
                 * Now we copy the bytes specified by the tag from OUTPUT to
                 * OUTPUT (copy len bytes from dp - off to dp). The copied
                 * areas could overlap; to prevent possible uncertainty, we
                 * copy only non-overlapping regions.
                 */
                len = Min(len, destend - dp);
                while (off < len) {
                    /*---------
                     * When offset is smaller than length, source and
                     * destination regions overlap. memmove() resolves
                     * this overlap in a way incompatible with pglz. Thus we
                     * resort to memcpy()-ing non-overlapping regions.
                     *
                     * Consider input: 112341234123412341234
                     * At byte 5 here ^ we have a match with length 16 and
                     * offset 4.       11234M(len=16, off=4)
                     * We decode the first period of the match and rewrite it
                     *                 112341234M(len=12, off=8)
                     *
                     * The same match is now at position 9; it points to the
                     * same start byte of output, but from another position:
                     * the offset is doubled.
                     *
                     * We iterate through this offset growth until we can
                     * proceed to the usual memcpy(). If we tried to decode
                     * the match at byte 5 (len=16, off=4) with memmove() we
                     * would issue memmove(5, 1, 16), which would produce
                     * 112341234XXXXXXXXXXXX, where the series of X is 12
                     * undefined bytes that were at bytes [5:17].
                     *---------
                     */
                    errno_t rc = memcpy_s(dp, off + 1, dp - off, off);
                    securec_check(rc, "", "");
                    len -= off;
                    dp += off;
                    off += off;
                }
                rc = memcpy_s(dp, len + 1, dp - off, len);
                securec_check(rc, "", "");
                dp += len;
            } else {
                /*
                 * An unset control bit means LITERAL BYTE. So we just copy
                 * one from INPUT to OUTPUT.
                 */
                *dp++ = *sp++;
            }

            /*
             * Advance the control bit
             */
            ctrl >>= 1;
        }
    }

    /*
     * Check we decompressed the right amount. If we are slicing, then we
     * won't necessarily be at the end of the source or dest buffers when we
     * hit a stop, so we don't test them.
     */
    if (check_complete && (dp != destend || sp != srcend)) {
        return -1;
    }

    /*
     * That's it.
     */
    return (char*) dp - dest;
}

int CompressPage(const char* src, char* dst, int dst_size, RelFileCompressOption option)
{
    if (PageIs8BXidHeapVersion(src)) {
        return TemplateCompressPage<true>(src, dst, dst_size, option);
    } else {
        return TemplateCompressPage<false>(src, dst, dst_size, option);
    }
}

void CompressConvertRows(char *buf, char *aux_buf, int16 *real_order, uint16 max_row_len, uint16 real_row_cnt) {
    errno_t ret;
    HeapPageHeaderData *page = (HeapPageHeaderData *)buf;
    uint16 row_cnt = real_row_cnt;
    uint32 total_size = page->pd_special - page->pd_upper;
    char *copy_begin = buf + page->pd_upper;
    char *row;
    uint16 i, j, k, cur, up, row_size;

    ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ);
    securec_check(ret, "", "");

    k = 0;
    for (i = 0; i < max_row_len; i++) {
        for (j = 0; j < row_cnt; j++) {
            up = (j == (row_cnt - 1)) ? page->pd_special : GET_ITEMID_BY_IDX(buf, (real_order[j + 1]))->lp_off;
            cur = GET_ITEMID_BY_IDX(buf, (real_order[j]))->lp_off;
            row_size = up - cur;
            row = buf + cur;
            if (i < row_size) {
                aux_buf[k++] = row[i]; // reshape: store the tuple bytes column-major
            }
        }
    }

    if (k != total_size) {
        printf("ERROR!!! convert_rows_2 error...!!!\n");
        ASSERT(0);
        return;
    }

    // copy aux_buf back to the page buffer
    ret = memcpy_sp(copy_begin, total_size, aux_buf, total_size);
    securec_check(ret, "", "");
    return;
}

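/*
 * Illustrative example (hypothetical sizes): two rows A = a0 a1 a2 a3 and
 * B = b0 b1 b2, taken in real_order. Walking column index i and then row
 * index j writes a0 b0 a1 b1 a2 b2 a3 into aux_buf, i.e. the tuple bytes
 * end up column-major. Rows with similar content and size thus place
 * similar bytes next to each other, which helps the downstream compressor.
 * DecompressDeconvertRows() walks the same loop order to undo this.
 */
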
void CompressConvertItemIds(char *buf, char *aux_buf) {
    errno_t ret;
    HeapPageHeaderData *page = (HeapPageHeaderData *)buf;
    uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData);
    uint32 total_size = row_cnt * sizeof(ItemIdData);
    char *copy_begin = buf + GetPageHeaderSize(page);
    uint16 i, j, k;

    // clear aux_buf
    ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ);
    securec_check(ret, "", "");

    k = 0;
    for (i = 0; i < row_cnt; i++) {
        for (j = 0; j < sizeof(ItemIdData); j++) {
            aux_buf[j * row_cnt + i] = copy_begin[k++];
        }
    }

    // copy aux_buf back to the page buffer
    ret = memcpy_sp(copy_begin, total_size, aux_buf, total_size);
    securec_check(ret, "", "");
    return;
}

void cprs_diff_convert_rows(char *buf, uint32 offset, uint16 min_row_len, uint16 real_row_cnt) {
    uint16 row_cnt = real_row_cnt;
    uint32 common_size = min_row_len;
    uint8 *copy_begin = (uint8 *)(buf + offset);
    uint16 i, j;

    for (i = 0; i < common_size; i++) {
        for (j = row_cnt - 1; j > 0; j--) {
            copy_begin[i * row_cnt + j] -= copy_begin[i * row_cnt + (j - 1)];
        }
    }
    return;
}

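/*
 * Illustrative example (hypothetical bytes): with row_cnt = 3, one column
 * holding 5, 7, 9 becomes 5, 2, 2 after the backward differences here
 * (j runs from high to low, so each byte is diffed against its original
 * neighbor). cprs_diff_deconvert_rows() restores it with prefix sums:
 * 5, 5+2, 7+2 = 5, 7, 9.
 */
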
bool CompressConvertOnePage(char *buf, char *aux_buf, bool diff_convert) {
    uint16 max_row_len = 0;
    uint16 min_row_len = 0;
    int16 *real_order = NULL; // itemids are sometimes out of order; we must find the real order
    uint16 real_row_cnt = 0;
    if (!CompressConvertCheck(buf, &real_order, &max_row_len, &min_row_len, &real_row_cnt)) {
        if (real_order != NULL) {
            free(real_order);
        }
        return false;
    }

    CompressConvertRows(buf, aux_buf, real_order, max_row_len, real_row_cnt);
    CompressConvertItemIds(buf, aux_buf);

    if (diff_convert) {
        cprs_diff_convert_rows(buf, ((HeapPageHeaderData *)buf)->pd_upper, min_row_len, real_row_cnt);
        cprs_diff_convert_rows(buf, GetPageHeaderSize(buf), sizeof(ItemIdData),
            (((HeapPageHeaderData *)buf)->pd_lower - GetPageHeaderSize(buf)) / sizeof(ItemIdData));
    }

    if (real_order != NULL) {
        free(real_order);
    }
    return true;
}

void CompressPagePrepareConvert(char *src, bool diff_convert, bool *real_ByteConvert)
{
    char *aux_buf = NULL;
    errno_t rc;

    aux_buf = (char *)malloc(BLCKSZ);
    if (aux_buf == NULL) {
        // add log
        return;
    }
    rc = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ);
    securec_check(rc, "", "");

    // do convert
    *real_ByteConvert = false;
    if (CompressConvertOnePage(src, aux_buf, diff_convert)) {
        *real_ByteConvert = true;
    }

    if (aux_buf != NULL) {
        free(aux_buf);
    }
}

/**
 * CompressPage() -- Compress one page.
 *
 *      Only the parts other than the page header will be compressed. The
 *      compressed data is rounded up to chunk_size; the insufficient part
 *      is filled with zeros. Compression needs to be able to save at least
 *      one chunk of space, otherwise it fails.
 *      This function returns the size of the compressed data, or
 *      -1 for compression failure
 *      COMPRESS_UNSUPPORTED_ERROR for an unrecognized compression algorithm
 */
template<bool heapPageData>
int TemplateCompressPage(const char* src, char* dst, int dst_size, RelFileCompressOption option)
{
    int compressed_size;
    int8 level = option.compressLevelSymbol ? option.compressLevel : -option.compressLevel;
    size_t sizeOfHeaderData = GetSizeOfHeadData(heapPageData);
    char *src_copy = NULL;
    bool real_ByteConvert = false;
    errno_t rc;
    char* data;

    if (option.byteConvert) {
        // copy and maybe change it
        src_copy = (char *)malloc(BLCKSZ);
        if (src_copy == NULL) {
            // add log
            return -1;
        }
        rc = memcpy_s(src_copy, BLCKSZ, src, BLCKSZ);
        securec_check(rc, "", "");
        CompressPagePrepareConvert(src_copy, option.diffConvert, &real_ByteConvert); /* preprocess convert src */
    }

    if (heapPageData) {
        data = ((HeapPageCompressData*)dst)->data;
    } else {
        data = ((PageCompressData*)dst)->data;
    }

    switch (option.compressAlgorithm) {
        case COMPRESS_ALGORITHM_PGLZ:
            if (real_ByteConvert) {
                compressed_size = lz_compress(src_copy + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, data);
            } else {
                compressed_size = lz_compress(src + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, data);
            }
            break;
        case COMPRESS_ALGORITHM_ZSTD: {
            if (level == 0 || level < MIN_ZSTD_COMPRESSION_LEVEL || level > MAX_ZSTD_COMPRESSION_LEVEL) {
                level = DEFAULT_ZSTD_COMPRESSION_LEVEL;
            }

            if (real_ByteConvert) {
                compressed_size = ZSTD_compress(data, dst_size, src_copy + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, level);
            } else {
                compressed_size = ZSTD_compress(data, dst_size, src + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, level);
            }

            if (ZSTD_isError(compressed_size)) {
                if (src_copy != NULL) {
                    free(src_copy);
                }
                return -1;
            }
            break;
        }
        default:
            if (src_copy != NULL) {
                free(src_copy);
            }
            return COMPRESS_UNSUPPORTED_ERROR;
    }

    if (compressed_size < 0) {
        if (src_copy != NULL) {
            free(src_copy);
        }
        return -1;
    }

    if (heapPageData) {
        HeapPageCompressData* pcdptr = ((HeapPageCompressData*)dst);
        rc = memcpy_s(pcdptr->page_header, sizeOfHeaderData, src, sizeOfHeaderData);
        securec_check(rc, "", "");
        pcdptr->size = compressed_size;
        pcdptr->byte_convert = real_ByteConvert;
        pcdptr->diff_convert = option.diffConvert;
    } else {
        PageCompressData* pcdptr = ((PageCompressData*)dst);
        rc = memcpy_s(pcdptr->page_header, sizeOfHeaderData, src, sizeOfHeaderData);
        securec_check(rc, "", "");
        pcdptr->size = compressed_size;
        pcdptr->byte_convert = real_ByteConvert;
        pcdptr->diff_convert = option.diffConvert;
    }

    if (src_copy != NULL) {
        free(src_copy);
    }
    return SIZE_OF_PAGE_COMPRESS_DATA_HEADER_DATA(heapPageData) + compressed_size;
}

/**
 * CompressPageBufferBound()
 * -- Get the destination buffer boundary to compress one page.
 * Returns the destination buffer size needed to compress one page, or
 * -1 for an unrecognized compression algorithm
 */
int CompressPageBufferBound(const char* page, uint8 algorithm)
{
    switch (algorithm) {
        case COMPRESS_ALGORITHM_PGLZ:
            return BLCKSZ + 4;
        case COMPRESS_ALGORITHM_ZSTD:
            return ZSTD_compressBound(BLCKSZ - GetPageHeaderSize(page));
        default:
            return -1;
    }
}

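/*
 * Minimal usage sketch (not part of the original file): round-trip one page
 * through CompressPage()/DecompressPage(). The option values below are
 * illustrative assumptions; the field names are taken from their usages in
 * this file, and real callers obtain RelFileCompressOption from the
 * relation's storage options. Guarded so it does not affect normal builds.
 */
#ifdef PAGE_COMPRESSION_EXAMPLE
static bool RoundTripPageExample(const char* page /* BLCKSZ bytes */)
{
    RelFileCompressOption option = {};
    option.compressAlgorithm = COMPRESS_ALGORITHM_ZSTD;
    option.compressLevelSymbol = true; /* positive level */
    option.compressLevel = DEFAULT_ZSTD_COMPRESSION_LEVEL;
    option.byteConvert = true;
    option.diffConvert = true;

    int bound = CompressPageBufferBound(page, option.compressAlgorithm);
    if (bound < 0) {
        return false; // unrecognized algorithm
    }

    // deliberately generous scratch buffer: compress header plus data
    char* compressed = (char*)malloc(bound + BLCKSZ);
    char* restored = (char*)malloc(BLCKSZ); // dst must be >= BLCKSZ
    if (compressed == NULL || restored == NULL) {
        free(compressed);
        free(restored);
        return false;
    }

    bool ok = false;
    int csize = CompressPage(page, compressed, bound, option);
    if (csize > 0) {
        // a real caller would round csize up to chunk_size before storing it
        ok = (DecompressPage(compressed, restored, option.compressAlgorithm) == BLCKSZ);
    }

    free(compressed);
    free(restored);
    return ok;
}
#endif /* PAGE_COMPRESSION_EXAMPLE */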