From 4b35784ea0c5ac1da7bba649939536f055897ff1 Mon Sep 17 00:00:00 2001 From: wuyuechuan Date: Wed, 8 Dec 2021 21:04:28 +0800 Subject: [PATCH] table/index(btree) support compression --- contrib/pagehack/CMakeLists.txt | 5 +- contrib/pagehack/Makefile | 4 +- contrib/pagehack/compression_algorithm.ini | 1308 +++++++++++++++++ contrib/pagehack/openGaussCompression.cpp | 177 +++ contrib/pagehack/openGaussCompression.h | 40 + contrib/pagehack/pagehack.cpp | 95 +- src/Makefile.global.in | 5 +- src/bin/pg_rewind/Makefile | 2 +- src/bin/pg_rewind/compressed_common.h | 46 + src/bin/pg_rewind/compressed_rewind.cpp | 129 ++ src/bin/pg_rewind/compressed_rewind.h | 21 + src/bin/pg_rewind/fetch.cpp | 280 +++- src/bin/pg_rewind/fetch.h | 4 +- src/bin/pg_rewind/file_ops.cpp | 147 +- src/bin/pg_rewind/file_ops.h | 8 +- src/bin/pg_rewind/filemap.cpp | 40 +- src/bin/pg_rewind/filemap.h | 10 +- src/bin/pg_rewind/parsexlog.cpp | 2 +- src/common/backend/catalog/builtin_funcs.ini | 9 +- src/common/backend/catalog/heap.cpp | 37 +- src/common/backend/catalog/index.cpp | 14 +- src/common/backend/catalog/storage.cpp | 56 +- src/common/backend/utils/adt/dbsize.cpp | 4 +- src/common/backend/utils/adt/genfile.cpp | 126 ++ .../backend/utils/adt/pg_lzcompress.cpp | 278 ++++ src/common/backend/utils/cache/partcache.cpp | 55 +- src/common/backend/utils/cache/relcache.cpp | 60 +- src/common/backend/utils/init/globals.cpp | 2 +- src/common/backend/utils/misc/guc.cpp | 1 + src/common/backend/utils/mmgr/memprot.cpp | 31 + src/gausskernel/Makefile | 1 + src/gausskernel/bootstrap/bootparse.y | 1 + .../cbb/grpc/remote_read_client.cpp | 9 +- .../optimizer/commands/tablecmds.cpp | 130 +- .../process/postmaster/pagewriter.cpp | 2 +- src/gausskernel/process/postmaster/pgstat.cpp | 6 + .../process/postmaster/postmaster.cpp | 2 + .../storage/access/common/reloptions.cpp | 26 + .../storage/access/redo/redo_xlogutils.cpp | 12 +- .../storage/access/transam/double_write.cpp | 29 +- .../access/transam/extreme_rto/batch_redo.cpp | 3 +- .../access/transam/extreme_rto/page_redo.cpp | 1 + .../storage/access/transam/xloginsert.cpp | 28 +- .../storage/access/transam/xlogreader.cpp | 15 + .../storage/access/transam/xlogutils.cpp | 7 +- .../access/ustore/knl_uextremeredo.cpp | 1 + src/gausskernel/storage/buffer/bufmgr.cpp | 9 +- src/gausskernel/storage/file/fd.cpp | 85 ++ .../storage/remote/remote_adapter.cpp | 53 +- .../storage/replication/basebackup.cpp | 325 +++- src/gausskernel/storage/smgr/Makefile | 2 +- src/gausskernel/storage/smgr/md.cpp | 1270 ++++++++++++++-- src/gausskernel/storage/smgr/mmap_shared.cpp | 146 ++ .../storage/smgr/page_compression.cpp | 472 ++++++ src/include/access/double_write.h | 3 +- src/include/access/xloginsert.h | 4 +- src/include/access/xlogproc.h | 4 +- src/include/access/xlogrecord.h | 5 +- src/include/catalog/heap.h | 12 +- src/include/catalog/storage_xlog.h | 12 + .../rollback_catalog_maindb_92_424.sql | 2 + .../rollback_catalog_otherdb_92_424.sql | 2 + .../upgrade_catalog_maindb_92_424.sql | 22 + .../upgrade_catalog_otherdb_92_424.sql | 22 + src/include/knl/knl_instance.h | 1 + src/include/pgstat.h | 2 + src/include/service/remote_read_client.h | 2 +- src/include/storage/buf/buf_internals.h | 7 + src/include/storage/buf/bufpage.h | 4 + src/include/storage/page_compression.h | 336 +++++ src/include/storage/page_compression_impl.h | 715 +++++++++ src/include/storage/remote_adapter.h | 2 +- src/include/storage/smgr/fd.h | 5 + src/include/storage/smgr/relfilenode.h | 22 +- src/include/storage/vfd.h | 3 + 
src/include/utils/aset.h | 6 + src/include/utils/builtins.h | 1 + src/include/utils/partcache.h | 2 +- src/include/utils/pg_lzcompress.h | 5 + src/include/utils/rel.h | 17 + src/include/utils/rel_gs.h | 5 + src/include/utils/relcache.h | 3 +- src/test/regress/expected/hw_package.out | 2 - .../expected/row_compression/normal_test.out | 183 +++ .../row_compression/pg_table_size.out | 79 + .../row_compression/pg_tablespace_size.out | 32 + .../row_compression/unsupported_feature.out | 66 + src/test/regress/expected/rule_test.out | 694 ++++----- .../expected/single_node_opr_sanity.out | 2 + .../row_compression_basebackup.source | 6 + .../row_compression_basebackup.source | 28 + src/test/regress/parallel_schedule0 | 3 +- .../script/gs_basebackup/gs_basebackup.sh | 10 +- .../sql/gs_basebackup/init/compress_data.sql | 4 + .../gs_basebackup/validate/compress_data.sql | 3 + src/test/regress/sql/hw_package.sql | 1 - .../sql/row_compression/normal_test.sql | 69 + .../sql/row_compression/pg_table_size.sql | 30 + .../row_compression/pg_tablespace_size.sql | 14 + .../row_compression/unsupported_feature.sql | 41 + src/test/regress/sql/rule_test.sql | 452 +++--- 101 files changed, 7636 insertions(+), 940 deletions(-) create mode 100644 contrib/pagehack/compression_algorithm.ini create mode 100644 contrib/pagehack/openGaussCompression.cpp create mode 100644 contrib/pagehack/openGaussCompression.h create mode 100644 src/bin/pg_rewind/compressed_common.h create mode 100644 src/bin/pg_rewind/compressed_rewind.cpp create mode 100644 src/bin/pg_rewind/compressed_rewind.h create mode 100644 src/gausskernel/storage/smgr/mmap_shared.cpp create mode 100644 src/gausskernel/storage/smgr/page_compression.cpp create mode 100644 src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback_catalog_maindb_92_424.sql create mode 100644 src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback_catalog_otherdb_92_424.sql create mode 100644 src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade_catalog_maindb_92_424.sql create mode 100644 src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade_catalog_otherdb_92_424.sql create mode 100644 src/include/storage/page_compression.h create mode 100644 src/include/storage/page_compression_impl.h create mode 100644 src/test/regress/expected/row_compression/normal_test.out create mode 100644 src/test/regress/expected/row_compression/pg_table_size.out create mode 100644 src/test/regress/expected/row_compression/pg_tablespace_size.out create mode 100644 src/test/regress/expected/row_compression/unsupported_feature.out create mode 100644 src/test/regress/input/row_compression/row_compression_basebackup.source create mode 100644 src/test/regress/output/row_compression/row_compression_basebackup.source create mode 100644 src/test/regress/sql/gs_basebackup/init/compress_data.sql create mode 100644 src/test/regress/sql/gs_basebackup/validate/compress_data.sql create mode 100644 src/test/regress/sql/row_compression/normal_test.sql create mode 100644 src/test/regress/sql/row_compression/pg_table_size.sql create mode 100644 src/test/regress/sql/row_compression/pg_tablespace_size.sql create mode 100644 src/test/regress/sql/row_compression/unsupported_feature.sql diff --git a/contrib/pagehack/CMakeLists.txt b/contrib/pagehack/CMakeLists.txt index d221bd7ff..cc6a658e2 100644 --- a/contrib/pagehack/CMakeLists.txt +++ b/contrib/pagehack/CMakeLists.txt @@ -2,7 +2,7 @@ # pagehack AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR} TGT_pagehack_SRC) set(TGT_pagehack_INC - 
${TGT_pq_INC} ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SRC_DIR}/lib/gstrace + ${TGT_pq_INC} ${ZSTD_INCLUDE_PATH} ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SRC_DIR}/lib/gstrace ) set(pagehack_DEF_OPTIONS ${MACRO_OPTIONS}) @@ -11,12 +11,13 @@ if(${ENABLE_DEBUG} STREQUAL "ON") endif() set(pagehack_COMPILE_OPTIONS ${OS_OPTIONS} ${PROTECT_OPTIONS} ${WARNING_OPTIONS} ${CHECK_OPTIONS} ${BIN_SECURE_OPTIONS} ${OPTIMIZE_OPTIONS}) set(pagehack_LINK_OPTIONS ${BIN_LINK_OPTIONS}) -set(pagehack_LINK_LIBS -lpgport -lcrypt -ldl -lm -ledit -lssl -lcrypto -lsecurec -lrt -lz -lminiunz) +set(pagehack_LINK_LIBS -lpgport -lcrypt -ldl -lm -ledit -lssl -lcrypto -lsecurec -lrt -lz -lminiunz -lzstd) add_bintarget(pagehack TGT_pagehack_SRC TGT_pagehack_INC "${pagehack_DEF_OPTIONS}" "${pagehack_COMPILE_OPTIONS}" "${pagehack_LINK_OPTIONS}" "${pagehack_LINK_LIBS}") add_dependencies(pagehack pgport_static) target_link_directories(pagehack PUBLIC ${LIBOPENSSL_LIB_PATH} ${PROTOBUF_LIB_PATH} ${LIBPARQUET_LIB_PATH} ${LIBCURL_LIB_PATH} ${SECURE_LIB_PATH} ${ZLIB_LIB_PATH} ${LIBOBS_LIB_PATH} ${LIBEDIT_LIB_PATH} ${LIBCGROUP_LIB_PATH} ${CMAKE_BINARY_DIR}/lib + ${ZSTD_LIB_PATH} ) install(TARGETS pagehack RUNTIME DESTINATION bin) diff --git a/contrib/pagehack/Makefile b/contrib/pagehack/Makefile index 4c1571e4e..fe8eca407 100644 --- a/contrib/pagehack/Makefile +++ b/contrib/pagehack/Makefile @@ -1,6 +1,6 @@ # contrib/pagehack/Makefile MODULE_big = pagehack -OBJS = pagehack.o +OBJS = openGaussCompression.o pagehack.o # executable program, even there is no database server/client PROGRAM = pagehack @@ -13,7 +13,7 @@ else subdir = contrib/pagehack top_builddir = ../.. include $(top_builddir)/src/Makefile.global -enable_shared = false +override CFLAGS += -lzstd ifeq ($(enable_debug), yes) PG_CPPFLAGS += -DDEBUG diff --git a/contrib/pagehack/compression_algorithm.ini b/contrib/pagehack/compression_algorithm.ini new file mode 100644 index 000000000..16761f42b --- /dev/null +++ b/contrib/pagehack/compression_algorithm.ini @@ -0,0 +1,1308 @@ +size_t GetSizeOfHeadData(bool heapPageData) +{ + if (heapPageData) { + return SizeOfHeapPageHeaderData; + } else { + return SizeOfPageHeaderData; + } +} + +// maybe some itemid is not valid +uint16 HeapPageCalcRealRowCnt (char *buf) { + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 cnt = 0; + uint16 i; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + + for (i = 0; i < row_cnt; i++) { + if (ItemIdIsNormal(GET_ITEMID_BY_IDX(buf, i))) { + cnt++; + } + } + return cnt; +} + +void DecompressDeconvertRows(char *buf, char *aux_buf, int16 *real_order, uint16 max_row_len, uint16 real_row_cnt) { + errno_t ret; + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = real_row_cnt; + uint32 total_size = page->pd_special - page->pd_upper; + char *copy_begin = buf + page->pd_upper; + char *row; + uint16 i, j, k, cur, up, row_size; + + ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(ret, "", ""); + + k = 0; + for (i = 0; i < max_row_len; i++) { + for (j = 0; j < row_cnt; j++) { + up = (j == (row_cnt - 1)) ? page->pd_special : GET_ITEMID_BY_IDX(buf, (real_order[j + 1]))->lp_off; + cur = GET_ITEMID_BY_IDX(buf, (real_order[j]))->lp_off; + row_size = up - cur; + row = aux_buf + cur; + if (i < row_size) { + row[i] = copy_begin[k++]; // this part is reshaped + } + } + } + + if (k != total_size) { + printf("ERROR!!! 
pg_deconvert_rows error...!!!\n"); + ASSERT(0); + return; + } + + // cp aux_buf to page_buf + ret = memcpy_sp(copy_begin, total_size, aux_buf + page->pd_upper, total_size); + securec_check(ret, "", ""); + return ; +} + +// 1: as tuple_offset order, that means asc order. +// 2: store all itemid's idx. +// 3:maybe some itemid is not in order. +void CompressConvertItemRealOrder(char *buf, int16 *real_order, uint16 real_row_cnt) { + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + ItemIdData *begin = (ItemIdData *)(buf + GetPageHeaderSize(page)); + int16 *link_order = real_order + real_row_cnt; + + int16 i, head, curr, prev; + int16 end = -1; // invalid index + + head = end; + // very likely to seems that itemids stored by desc order, and ignore invalid itemid + for (i = 0; i < row_cnt; i++) { + if (!ItemIdIsNormal(begin + i)) { + continue; + } + + if (head == end) { // set the head idx, insert the first + link_order[i] = end; + head = i; + continue; + } + + if ((begin + i)->lp_off < (begin + head)->lp_off) { + link_order[i] = head; // update the head idx + head = i; + continue; + } + + prev = head; + curr = link_order[head]; + while ((curr != end) && ((begin + i)->lp_off > (begin + curr)->lp_off)) { + prev = curr; + curr = link_order[curr]; + } + + link_order[prev] = i; + link_order[i] = curr; + } + + // arrange the link to array + curr = head; + for (i = 0; i < real_row_cnt; i++) { + real_order[i] = curr; + curr = link_order[curr]; + } + + if (curr != end) { + printf("ERROR!!! pre_convert_real_order error...!!!\n"); + ASSERT(0); + return; + } + +} + +int DecompressPage(const char* src, char* dst, uint8 algorithm) +{ + if (PageIs8BXidHeapVersion(src)) { + return TemplateDecompressPage(src, dst, algorithm); + } else { + return TemplateDecompressPage(src, dst, algorithm); + } +} + +void cprs_diff_deconvert_rows(char *buf, uint32 offset, uint16 min_row_len, uint16 real_row_cnt) { + uint16 row_cnt = real_row_cnt; + uint32 common_size = min_row_len; + uint8 *copy_begin = (uint8 *)(buf + offset); + uint16 i, j; + + for (i = 0; i < common_size; i++) { + for (j = 1; j < row_cnt; j++) { + copy_begin[i * row_cnt + j] += copy_begin[i * row_cnt + (j - 1)]; + } + } + return ; +} + +// to find all row size are diffs in MIN_DIFF_SIZE byts. +bool CompressConvertCheck(char *buf, int16 **real_order, uint16 *max_row_len, uint16 *min_row_len, uint16 *real_row_cnt) { + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + int16 i, row_size; + ItemIdData *ptr = NULL; + uint16 up = page->pd_special; + uint16 min_size = GS_INVALID_ID16; + uint16 max_size = 0; + errno_t ret; + if (page->pd_lower < GetPageHeaderSize(page) || (page->pd_lower > page->pd_upper)) { + return false; + } + + uint16 normal_row_cnt = HeapPageCalcRealRowCnt(buf); + if (normal_row_cnt < MIN_CONVERT_CNT) { // no need convert + return false; + } + + // to store the real tuple order. + /* + --------------------------|-------------------------- + xxxxxxxxxxxxxxxxxxxxxxxxxx|xxxxxxxxxxxxxxxxxxxxxxxxxx + --------------------------|-------------------------- + */ + // the first part is real array order, and the second part is link. 
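+    // real_order[0 .. real_row_cnt-1] ends up holding the item indexes sorted
+    // by tuple offset; the second half (real_order + real_row_cnt) is scratch
+    // space for the linked-list "next" pointers built in
+    // CompressConvertItemRealOrder.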
+ *real_order = (int16 *)malloc(sizeof(uint16) * row_cnt * 2); + if (*real_order == NULL) { + printf("zfunc compress file"); + return false; + } + ret = memset_sp(*real_order, sizeof(uint16) * row_cnt * 2, 0, sizeof(uint16) * row_cnt * 2); + securec_check(ret, "", ""); + + // order the ItemIds by tuple_offset order. + CompressConvertItemRealOrder(buf, *real_order, normal_row_cnt); + + // do the check, to check all size of tuples. + for (i = normal_row_cnt - 1; i >= 0; i--) { + ptr = GET_ITEMID_BY_IDX(buf, ((*real_order)[i])); + + row_size = up - ptr->lp_off; + if (row_size < MIN_CONVERT_CNT * 2) { + return false; + } + + min_size = (row_size < min_size) ? row_size : min_size; + max_size = (row_size > max_size) ? row_size : max_size; + + if ((max_size - min_size) > MIN_DIFF_SIZE) { // no need convert + return false; + } + up = ptr->lp_off; + } + + // get the min row common size. + *max_row_len = max_size; + *min_row_len = min_size; + *real_row_cnt = normal_row_cnt; + return true; +} + +void DecompressDeconvertItemIds(char *buf, char *aux_buf) { + errno_t ret; + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + uint32 total_size = row_cnt * sizeof(ItemIdData); + char *copy_begin = buf + GetPageHeaderSize(page); + uint16 i, j, k; + + // clear aux_buf + ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(ret, "", ""); + + k = 0; + for (i = 0; i < sizeof(ItemIdData); i++) { + for (j = 0; j < row_cnt; j++) { + aux_buf[j * sizeof(ItemIdData) + i] = copy_begin[k++]; + } + } + + // cp aux_buf to page_buf + ret = memcpy_sp(copy_begin, total_size, aux_buf, total_size); + securec_check(ret, "", ""); + return ; +} + + +void DecompressDeconvertOnePage(char *buf, char *aux_buf, bool diff_convert) { + uint16 max_row_len = 0; + uint16 min_row_len = 0; + int16 *real_order = NULL; // itemids are not in order sometimes. we must find the real + uint16 real_row_cnt = 0; + + if (diff_convert) { + cprs_diff_deconvert_rows(buf, GetPageHeaderSize(buf), sizeof(ItemIdData), + (((HeapPageHeaderData *)buf)->pd_lower - GetPageHeaderSize(buf)) / sizeof(ItemIdData)); + } + + // =======firstly, arrange the itemids. + DecompressDeconvertItemIds(buf, aux_buf); + + if (!CompressConvertCheck(buf, &real_order, &max_row_len, &min_row_len, &real_row_cnt)) { + if (real_order != NULL) { + free(real_order); + } + ASSERT(0); + return ; + } + + // =======and last, the tuples + if (diff_convert) { + cprs_diff_deconvert_rows(buf, ((HeapPageHeaderData *)buf)->pd_upper, min_row_len, real_row_cnt); + } + DecompressDeconvertRows(buf, aux_buf, real_order, max_row_len, real_row_cnt); + + if (real_order != NULL) { + free(real_order); + } + return ; +} + + +void DecompressPageDeconvert(char *src, bool diff_convert) +{ + char *aux_buf = NULL; + errno_t rc; + + aux_buf = (char *)malloc(BLCKSZ); + if (aux_buf == NULL) { + // add log + return; + } + rc = memset_s(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(rc, "", ""); + + // do convert + DecompressDeconvertOnePage(src, aux_buf, diff_convert); + + if (aux_buf != NULL) { + free(aux_buf); + } +} + + +/** + * DecompressPage() -- Decompress one compressed page. + * return size of decompressed page which should be BLCKSZ or + * -1 for decompress error + * -2 for unrecognized compression algorithm + * + * note:The size of dst must be greater than or equal to BLCKSZ. 
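+ * note: The page header is copied into dst unchanged; only the data after
+ * the header is handed to the decompressor (see TemplateDecompressPage).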
+ */ +template +int TemplateDecompressPage(const char* src, char* dst, uint8 algorithm) +{ + int decompressed_size; + char* data; + uint32 size; + bool byte_convert, diff_convert; + size_t sizeOfPageHeaderData = GetSizeOfHeadData(heapPageData); + int rc = memcpy_s(dst, sizeOfPageHeaderData, src, sizeOfPageHeaderData); + securec_check(rc, "", ""); + + if (heapPageData) { + data = ((HeapPageCompressData*) src)->data; + size = ((HeapPageCompressData*) src)->size; + byte_convert = ((HeapPageCompressData*) src)->byte_convert; + diff_convert = ((HeapPageCompressData*) src)->diff_convert; + } else { + data = ((PageCompressData*) src)->data; + size = ((PageCompressData*) src)->size; + byte_convert = ((PageCompressData*) src)->byte_convert; + diff_convert = ((PageCompressData*) src)->diff_convert; + } + + switch (algorithm) { + case COMPRESS_ALGORITHM_PGLZ: + decompressed_size = lz_decompress( + data, size, dst + sizeOfPageHeaderData, BLCKSZ - sizeOfPageHeaderData, false); + break; + case COMPRESS_ALGORITHM_ZSTD: + decompressed_size = + ZSTD_decompress(dst + sizeOfPageHeaderData, BLCKSZ - sizeOfPageHeaderData, data, size); + + if (ZSTD_isError(decompressed_size)) { + return -1; + } + + break; + + default: + return COMPRESS_UNSUPPORTED_ERROR; + break; + } + + if (byte_convert) { + // deconvert dst + DecompressPageDeconvert(dst, diff_convert); + } + + return sizeOfPageHeaderData + decompressed_size; +} + +// pg_lz +/* ---------- + * pg_lzcompress.c - + * + * This is an implementation of LZ compression for PostgreSQL. + * It uses a simple history table and generates 2-3 byte tags + * capable of backward copy information for 3-273 bytes with + * a max offset of 4095. + * + * Entry routines: + * + * bool + * pglz_compress(const char *source, int32 slen, PGLZ_Header *dest, + * const PGLZ_Strategy *strategy); + * + * source is the input data to be compressed. + * + * slen is the length of the input data. + * + * dest is the output area for the compressed result. + * It must be at least as big as PGLZ_MAX_OUTPUT(slen). + * + * strategy is a pointer to some information controlling + * the compression algorithm. If NULL, the compiled + * in default strategy is used. + * + * The return value is TRUE if compression succeeded, + * FALSE if not; in the latter case the contents of dest + * are undefined. + * + * void + * pglz_decompress(const PGLZ_Header *source, char *dest) + * + * source is the compressed input. + * + * dest is the area where the uncompressed data will be + * written to. It is the callers responsibility to + * provide enough space. The required amount can be + * obtained with the macro PGLZ_RAW_SIZE(source). + * + * The data is written to buff exactly as it was handed + * to pglz_compress(). No terminating zero byte is added. + * + * The decompression algorithm and internal data format: + * + * PGLZ_Header is defined as + * + * typedef struct PGLZ_Header { + * int32 vl_len_; + * int32 rawsize; + * } + * + * The header is followed by the compressed data itself. + * + * The data representation is easiest explained by describing + * the process of decompression. + * + * If VARSIZE(x) == rawsize + sizeof(PGLZ_Header), then the data + * is stored uncompressed as plain bytes. Thus, the decompressor + * simply copies rawsize bytes from the location after the + * header to the destination. + * + * Otherwise the first byte after the header tells what to do + * the next 8 times. We call this the control byte. 
+ * + * An unset bit in the control byte means, that one uncompressed + * byte follows, which is copied from input to output. + * + * A set bit in the control byte means, that a tag of 2-3 bytes + * follows. A tag contains information to copy some bytes, that + * are already in the output buffer, to the current location in + * the output. Let's call the three tag bytes T1, T2 and T3. The + * position of the data to copy is coded as an offset from the + * actual output position. + * + * The offset is in the upper nibble of T1 and in T2. + * The length is in the lower nibble of T1. + * + * So the 16 bits of a 2 byte tag are coded as + * + * 7---T1--0 7---T2--0 + * OOOO LLLL OOOO OOOO + * + * This limits the offset to 1-4095 (12 bits) and the length + * to 3-18 (4 bits) because 3 is always added to it. To emit + * a tag of 2 bytes with a length of 2 only saves one control + * bit. But we lose one byte in the possible length of a tag. + * + * In the actual implementation, the 2 byte tag's length is + * limited to 3-17, because the value 0xF in the length nibble + * has special meaning. It means, that the next following + * byte (T3) has to be added to the length value of 18. That + * makes total limits of 1-4095 for offset and 3-273 for length. + * + * Now that we have successfully decoded a tag. We simply copy + * the output that occurred bytes back to the current + * output location in the specified . Thus, a + * sequence of 200 spaces (think about bpchar fields) could be + * coded in 4 bytes. One literal space and a three byte tag to + * copy 199 bytes with a -1 offset. Whow - that's a compression + * rate of 98%! Well, the implementation needs to save the + * original data size too, so we need another 4 bytes for it + * and end up with a total compression rate of 96%, what's still + * worth a Whow. + * + * The compression algorithm + * + * The following uses numbers used in the default strategy. + * + * The compressor works best for attributes of a size between + * 1K and 1M. For smaller items there's not that much chance of + * redundancy in the character sequence (except for large areas + * of identical bytes like trailing spaces) and for bigger ones + * our 4K maximum look-back distance is too small. + * + * The compressor creates a table for 8192 lists of positions. + * For each input position (except the last 3), a hash key is + * built from the 4 next input bytes and the position remembered + * in the appropriate list. Thus, the table points to linked + * lists of likely to be at least in the first 4 characters + * matching strings. This is done on the fly while the input + * is compressed into the output area. Table entries are only + * kept for the last 4096 input positions, since we cannot use + * back-pointers larger than that anyway. + * + * For each byte in the input, it's hash key (built from this + * byte and the next 3) is used to find the appropriate list + * in the table. The lists remember the positions of all bytes + * that had the same hash key in the past in increasing backward + * offset order. Now for all entries in the used lists, the + * match length is computed by comparing the characters from the + * entries position with the characters from the actual input + * position. + * + * The compressor starts with a so called "good_match" of 128. + * It is a "prefer speed against compression ratio" optimizer. + * So if the first entry looked at already has 128 or more + * matching characters, the lookup stops and that position is + * used for the next tag in the output. 
+ * + * For each subsequent entry in the history list, the "good_match" + * is lowered by 10%. So the compressor will be more happy with + * short matches the farer it has to go back in the history. + * Another "speed against ratio" preference characteristic of + * the algorithm. + * + * Thus there are 3 stop conditions for the lookup of matches: + * + * - a match >= good_match is found + * - there are no more history entries to look at + * - the next history entry is already too far back + * to be coded into a tag. + * + * Finally the match algorithm checks that at least a match + * of 3 or more bytes has been found, because thats the smallest + * amount of copy information to code into a tag. If so, a tag + * is omitted and all the input bytes covered by that are just + * scanned for the history add's, otherwise a literal character + * is omitted and only his history entry added. + * + * Acknowledgements: + * + * Many thanks to Adisak Pochanayon, who's article about SLZ + * inspired me to write the PostgreSQL compression this way. + * + * Jan Wieck + * + * Copyright (c) 1999-2012, PostgreSQL Global Development Group + * + * src/backend/utils/adt/pg_lzcompress.c + * ---------- + */ +#include "postgres.h" +#include "knl/knl_variable.h" + +#include + +#include "utils/pg_lzcompress.h" + +/* ---------- + * The provided standard strategies + * ---------- + */ +static const PGLZ_Strategy strategy_default_data = { + 32, /* Data chunks less than 32 bytes are not + * compressed */ + INT_MAX, /* No upper limit on what we'll try to + * compress */ + 25, /* Require 25% compression rate, or not worth + * it */ + 1024, /* Give up if no compression in the first 1KB */ + 128, /* Stop history lookup if a match of 128 bytes + * is found */ + 10 /* Lower good match size by 10% at every loop + * iteration */ +}; +const PGLZ_Strategy* const PGLZ_strategy_default = &strategy_default_data; + +static const PGLZ_Strategy strategy_always_data = { + 0, /* Chunks of any size are compressed */ + INT_MAX, + 0, /* It's enough to save one single byte */ + INT_MAX, /* Never give up early */ + 128, /* Stop history lookup if a match of 128 bytes + * is found */ + 6 /* Look harder for a good match */ +}; +const PGLZ_Strategy* const PGLZ_strategy_always = &strategy_always_data; + +/* ---------- + * pglz_hist_idx - + * + * Computes the history table slot for the lookup by the next 4 + * characters in the input. + * + * NB: because we use the next 4 characters, we are not guaranteed to + * find 3-character matches; they very possibly will be in the wrong + * hash list. This seems an acceptable tradeoff for spreading out the + * hash keys more. + * ---------- + */ +#define pglz_hist_idx(_s, _e) \ + (((((_e) - (_s)) < 4) ? (int)(_s)[0] \ + : (((unsigned char)((_s)[0]) << 9) ^ ((unsigned char)((_s)[1]) << 6) ^ \ + ((unsigned char)((_s)[2]) << 3) ^ (unsigned char)((_s)[3]))) & \ + (PGLZ_HISTORY_MASK)) + +/* ---------- + * pglz_hist_add - + * + * Adds a new entry to the history table. + * + * If _recycle is true, then we are recycling a previously used entry, + * and must first delink it from its old hashcode's linked list. + * + * NOTE: beware of multiple evaluations of macro's arguments, and note that + * _hn and _recycle are modified in the macro. 
+ * ---------- + */ +#define pglz_hist_add(_hs, _he, _hn, _recycle, _s, _e) \ + do { \ + int __hindex = pglz_hist_idx((_s), (_e)); \ + PGLZ_HistEntry** __myhsp = &(_hs)[__hindex]; \ + PGLZ_HistEntry* __myhe = &(_he)[_hn]; \ + if (_recycle) { \ + if (__myhe->prev == NULL) \ + (_hs)[__myhe->hindex] = __myhe->next; \ + else \ + __myhe->prev->next = __myhe->next; \ + if (__myhe->next != NULL) \ + __myhe->next->prev = __myhe->prev; \ + } \ + __myhe->next = *__myhsp; \ + __myhe->prev = NULL; \ + __myhe->hindex = __hindex; \ + __myhe->pos = (_s); \ + if (*__myhsp != NULL) \ + (*__myhsp)->prev = __myhe; \ + *__myhsp = __myhe; \ + if (++(_hn) >= PGLZ_HISTORY_SIZE) { \ + (_hn) = 0; \ + (_recycle) = true; \ + } \ + } while (0) + +/* ---------- + * pglz_out_ctrl - + * + * Outputs the last and allocates a new control byte if needed. + * ---------- + */ +#define pglz_out_ctrl(__ctrlp, __ctrlb, __ctrl, __buf) \ + do { \ + if ((((unsigned char)(__ctrl)) & 0xff) == 0) { \ + *(__ctrlp) = __ctrlb; \ + __ctrlp = (__buf)++; \ + __ctrlb = 0; \ + __ctrl = 1; \ + } \ + } while (0) + +/* ---------- + * pglz_out_literal - + * + * Outputs a literal byte to the destination buffer including the + * appropriate control bit. + * ---------- + */ +#define pglz_out_literal(_ctrlp, _ctrlb, _ctrl, _buf, _byte) \ + do { \ + pglz_out_ctrl(_ctrlp, _ctrlb, _ctrl, _buf); \ + *(_buf)++ = (unsigned char)(_byte); \ + (_ctrl) <<= 1; \ + } while (0) + +/* ---------- + * pglz_out_tag - + * + * Outputs a backward reference tag of 2-4 bytes (depending on + * offset and length) to the destination buffer including the + * appropriate control bit. + * ---------- + */ +#define pglz_out_tag(_ctrlp, _ctrlb, _ctrl, _buf, _len, _off) \ + do { \ + pglz_out_ctrl(_ctrlp, _ctrlb, _ctrl, _buf); \ + (_ctrlb) |= (_ctrl); \ + (_ctrl) <<= 1; \ + if ((_len) > 17) { \ + (_buf)[0] = (unsigned char)((((uint32)(_off)&0xf00) >> 4) | 0x0f); \ + (_buf)[1] = (unsigned char)(((uint32)(_off)&0xff)); \ + (_buf)[2] = (unsigned char)((_len)-18); \ + (_buf) += 3; \ + } else { \ + (_buf)[0] = (unsigned char)((((uint32)(_off)&0xf00) >> 4) | ((uint32)(_len)-3)); \ + (_buf)[1] = (unsigned char)((uint32)(_off)&0xff); \ + (_buf) += 2; \ + } \ + } while (0) + +#define HIST_START_LEN (sizeof(PGLZ_HistEntry*) * PGLZ_HISTORY_LISTS) +#define HIST_ENTRIES_LEN (sizeof(PGLZ_HistEntry) * PGLZ_HISTORY_SIZE) + +#define PGLZ_MAX_HISTORY_LISTS 8192 /* must be power of 2 */ +static PGLZ_HistEntry* hist_start[PGLZ_MAX_HISTORY_LISTS]; +static PGLZ_HistEntry hist_entries[PGLZ_HISTORY_SIZE + 1]; + +/* ---------- + * pglz_find_match - + * + * Lookup the history table if the actual input stream matches + * another sequence of characters, starting somewhere earlier + * in the input buffer. + * ---------- + */ +static inline int pglz_find_match( + PGLZ_HistEntry** hstart, const char* input, const char* end, int* lenp, int* offp, int good_match, int good_drop) +{ + PGLZ_HistEntry* hent = NULL; + int32 len = 0; + int32 off = 0; + + /* + * Traverse the linked history list until a good enough match is found. + */ + hent = hstart[pglz_hist_idx(input, end)]; + while (hent != NULL) { + const char* ip = input; + const char* hp = hent->pos; + int32 thisoff; + int32 thislen; + + /* + * Stop if the offset does not fit into our tag anymore. + */ + thisoff = ip - hp; + if (thisoff >= 0x0fff) + break; + + /* + * Determine length of match. A better match must be larger than the + * best so far. 
And if we already have a match of 16 or more bytes, + * it's worth the call overhead to use memcmp() to check if this match + * is equal for the same size. After that we must fallback to + * character by character comparison to know the exact position where + * the diff occurred. + */ + thislen = 0; + if (len >= 16) { + if (memcmp(ip, hp, len) == 0) { + thislen = len; + ip += len; + hp += len; + while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH) { + thislen++; + ip++; + hp++; + } + } + } else { + while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH) { + thislen++; + ip++; + hp++; + } + } + + /* + * Remember this match as the best (if it is) + */ + if (thislen > len) { + len = thislen; + off = thisoff; + } + + /* + * Advance to the next history entry + */ + hent = hent->next; + + /* + * Be happy with lesser good matches the more entries we visited. But + * no point in doing calculation if we're at end of list. + */ + if (hent != NULL) { + if (len >= good_match) + break; + good_match -= (good_match * good_drop) / 100; + } + } + + /* + * Return match information only if it results at least in one byte + * reduction. + */ + if (len > 2) { + *lenp = len; + *offp = off; + return 1; + } + + return 0; +} + +/* ---------- + * lz_compress - + * + * Compresses source into dest using strategy. Returns the number of + * bytes written in buffer dest, or -1 if compression fails. + * ---------- + */ +int32 lz_compress(const char* source, int32 slen, char* dest) +{ + unsigned char* bp = (unsigned char*) dest; + unsigned char* bstart = bp; + int hist_next = 0; + bool hist_recycle = false; + const char* dp = source; + const char* dend = source + slen; + unsigned char ctrl_dummy = 0; + unsigned char* ctrlp = &ctrl_dummy; + unsigned char ctrlb = 0; + unsigned char ctrl = 0; + bool found_match = false; + int32 match_len; + int32 match_off; + int32 good_match; + int32 good_drop; + int32 result_size; + int32 result_max; + int32 need_rate; + errno_t rc; + + const PGLZ_Strategy* strategy = PGLZ_strategy_always; + /* + * Our fallback strategy is the default. + */ + if (strategy == NULL) { + strategy = PGLZ_strategy_default; + } + + /* + * If the strategy forbids compression (at all or if source chunk size out + * of range), fail. + */ + if (strategy->match_size_good <= 0 || slen < strategy->min_input_size || slen > strategy->max_input_size) { + return -1; + } + + /* + * Limit the match parameters to the supported range. + */ + good_match = strategy->match_size_good; + if (good_match > PGLZ_MAX_MATCH) { + good_match = PGLZ_MAX_MATCH; + } else if (good_match < 17) { + good_match = 17; + } + + good_drop = strategy->match_size_drop; + if (good_drop < 0) { + good_drop = 0; + } else if (good_drop > 100) { + good_drop = 100; + } + + need_rate = strategy->min_comp_rate; + if (need_rate < 0) { + need_rate = 0; + } else if (need_rate > 99) { + need_rate = 99; + } + + /* + * Compute the maximum result size allowed by the strategy, namely the + * input size minus the minimum wanted compression rate. This had better + * be <= slen, else we might overrun the provided output buffer. + */ + if (slen > (INT_MAX / 100)) { + /* Approximate to avoid overflow */ + result_max = (slen / 100) * (100 - need_rate); + } else { + result_max = (slen * (100 - need_rate)) / 100; + } + + /* + * Initialize the history lists to empty. We do not need to zero the + * hist_entries[] array; its entries are initialized as they are used. 
+ */ + rc = memset_s(hist_start, HIST_START_LEN, 0, HIST_START_LEN); + securec_check(rc, "\0", "\0"); + + /* + * Compress the source directly into the output buffer. + */ + while (dp < dend) { + /* + * If we already exceeded the maximum result size, fail. + * + * We check once per loop; since the loop body could emit as many as 4 + * bytes (a control byte and 3-byte tag), PGLZ_MAX_OUTPUT() had better + * allow 4 slop bytes. + */ + if (bp - bstart >= result_max) { + return -1; + } + + /* + * If we've emitted more than first_success_by bytes without finding + * anything compressible at all, fail. This lets us fall out + * reasonably quickly when looking at incompressible input (such as + * pre-compressed data). + */ + if (!found_match && bp - bstart >= strategy->first_success_by) { + return -1; + } + + /* + * Try to find a match in the history + */ + if (pglz_find_match(hist_start, dp, dend, &match_len, &match_off, good_match, good_drop)) { + /* + * Create the tag and add history entries for all matched + * characters. + */ + pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off); + while (match_len--) { + pglz_hist_add( + hist_start, hist_entries, hist_next, hist_recycle, dp, + dend); + dp++; /* Do not do this ++ in the line above! */ + /* The macro would do it four times - Jan. */ + } + found_match = true; + } else { + /* + * No match found. Copy one literal byte. + */ + pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp); + pglz_hist_add( + hist_start, hist_entries, hist_next, hist_recycle, dp, dend); + dp++; /* Do not do this ++ in the line above! */ + /* The macro would do it four times - Jan. */ + } + } + + /* + * Write out the last control byte and check that we haven't overrun the + * output size allowed by the strategy. + */ + *ctrlp = ctrlb; + result_size = bp - bstart; + if (result_size >= result_max) { + return -1; + } + + /* success */ + return result_size; +} + +/* ---------- + * pglz_decompress - + * + * Decompresses source into dest. Returns the number of bytes + * decompressed in the destination buffer, and *optionally* + * checks that both the source and dest buffers have been + * fully read and written to, respectively. + * ---------- + */ +int32 lz_decompress(const char* source, int32 slen, char* dest, int32 rawsize, bool check_complete) +{ + const unsigned char* sp; + const unsigned char* srcend; + unsigned char* dp; + unsigned char* destend; + errno_t rc = 0; + + sp = (const unsigned char*) source; + srcend = ((const unsigned char*) source) + slen; + dp = (unsigned char*) dest; + destend = dp + rawsize; + + while (sp < srcend && dp < destend) { + /* + * Read one control byte and process the next 8 items (or as many as + * remain in the compressed input). + */ + unsigned char ctrl = *sp++; + int ctrlc; + + for (ctrlc = 0; ctrlc < 8 && sp < srcend && dp < destend; ctrlc++) { + + if (ctrl & 1) { + /* + * Set control bit means we must read a match tag. The match + * is coded with two bytes. First byte uses lower nibble to + * code length - 3. Higher nibble contains upper 4 bits of the + * offset. The next following byte contains the lower 8 bits + * of the offset. If the length is coded as 18, another + * extension tag byte tells how much longer the match really + * was (0-255). + */ + int32 len; + int32 off; + + len = (sp[0] & 0x0f) + 3; + off = ((sp[0] & 0xf0) << 4) | sp[1]; + sp += 2; + if (len == 18) { + len += *sp++; + } + + /* + * Now we copy the bytes specified by the tag from OUTPUT to + * OUTPUT (copy len bytes from dp - off to dp). 
The copied + * areas could overlap, to preven possible uncertainty, we + * copy only non-overlapping regions. + */ + len = Min(len, destend - dp); + while (off < len) { + /*--------- + * When offset is smaller than length - source and + * destination regions overlap. memmove() is resolving + * this overlap in an incompatible way with pglz. Thus we + * resort to memcpy()-ing non-overlapping regions. + * + * Consider input: 112341234123412341234 + * At byte 5 here ^ we have match with length 16 and + * offset 4. 11234M(len=16, off=4) + * We are decoding first period of match and rewrite match + * 112341234M(len=12, off=8) + * + * The same match is now at position 9, it points to the + * same start byte of output, but from another position: + * the offset is doubled. + * + * We iterate through this offset growth until we can + * proceed to usual memcpy(). If we would try to decode + * the match at byte 5 (len=16, off=4) by memmove() we + * would issue memmove(5, 1, 16) which would produce + * 112341234XXXXXXXXXXXX, where series of X is 12 + * undefined bytes, that were at bytes [5:17]. + * --------- + */ + errno_t rc = memcpy_s(dp, off + 1, dp - off, off); + securec_check(rc, "", ""); + len -= off; + dp += off; + off += off; + } + rc = memcpy_s(dp, len + 1, dp - off, len); + securec_check(rc, "", ""); + dp += len; + } else { + /* + * An unset control bit means LITERAL BYTE. So we just copy + * one from INPUT to OUTPUT. + */ + *dp++ = *sp++; + } + + /* + * Advance the control bit + */ + ctrl >>= 1; + } + } + + /* + * Check we decompressed the right amount. If we are slicing, then we + * won't necessarily be at the end of the source or dest buffers when we + * hit a stop, so we don't test them. + */ + if (check_complete && (dp != destend || sp != srcend)) { + return -1; + } + + /* + * That's it. + */ + return (char*) dp - dest; +} + + +int CompressPage(const char* src, char* dst, int dst_size, RelFileCompressOption option) +{ + if (PageIs8BXidHeapVersion(src)) { + return TemplateCompressPage(src, dst, dst_size, option); + } else { + return TemplateCompressPage(src, dst, dst_size, option); + } +} + +void CompressConvertRows(char *buf, char *aux_buf, int16 *real_order, uint16 max_row_len, uint16 real_row_cnt) { + errno_t ret; + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = real_row_cnt; + uint32 total_size = page->pd_special - page->pd_upper; + char *copy_begin = buf + page->pd_upper; + char *row; + uint16 i, j, k, cur, up, row_size; + + ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(ret, "", ""); + + k = 0; + for (i = 0; i < max_row_len; i++) { + for (j = 0; j < row_cnt; j++) { + up = (j == (row_cnt - 1)) ? page->pd_special : GET_ITEMID_BY_IDX(buf, (real_order[j + 1]))->lp_off; + cur = GET_ITEMID_BY_IDX(buf, (real_order[j]))->lp_off; + row_size = up - cur; + row = buf + cur; + if (i < row_size) { + aux_buf[k++] = row[i]; // this part is reshaped + } + } + } + + if (k != total_size) { + printf("ERROR!!! 
convert_rows_2 error...!!!\n"); + ASSERT(0); + return; + } + + // cp aux_buf to page_buf + ret = memcpy_sp(copy_begin, total_size, aux_buf, total_size); + securec_check(ret, "", ""); + return ; +} + +void CompressConvertItemIds(char *buf, char *aux_buf) { + errno_t ret; + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + uint32 total_size = row_cnt * sizeof(ItemIdData); + char *copy_begin = buf + GetPageHeaderSize(page); + uint16 i, j, k; + + // clear aux_buf + ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(ret, "", ""); + + k = 0; + for (i = 0; i < row_cnt; i++) { + for (j = 0; j < sizeof(ItemIdData); j++) { + aux_buf[j * row_cnt + i] = copy_begin[k++]; + } + } + + // cp aux_buf to page_buf + ret = memcpy_sp(copy_begin, total_size, aux_buf, total_size); + securec_check(ret, "", ""); + return ; +} + +void cprs_diff_convert_rows(char *buf, uint32 offset,uint16 min_row_len, uint16 real_row_cnt) { + uint16 row_cnt = real_row_cnt; + uint32 common_size = min_row_len; + uint8 *copy_begin = (uint8 *)(buf + offset); + uint16 i, j; + + for (i = 0; i < common_size; i++) { + for (j = row_cnt - 1; j > 0; j--) { + copy_begin[i * row_cnt + j] -= copy_begin[i * row_cnt + (j - 1)]; + } + } + return ; +} + +bool CompressConvertOnePage(char *buf, char *aux_buf, bool diff_convert) { + uint16 max_row_len = 0; + uint16 min_row_len = 0; + int16 *real_order = NULL; // itemids are not in order sometimes. we must find the real + uint16 real_row_cnt = 0; + if (!CompressConvertCheck(buf, &real_order, &max_row_len, &min_row_len, &real_row_cnt)) { + if (real_order != NULL) { + free(real_order); + } + return false; + } + + CompressConvertRows(buf, aux_buf, real_order, max_row_len, real_row_cnt); + CompressConvertItemIds(buf, aux_buf); + + if (diff_convert) { + cprs_diff_convert_rows(buf, ((HeapPageHeaderData *)buf)->pd_upper, min_row_len, real_row_cnt); + cprs_diff_convert_rows(buf, GetPageHeaderSize(buf), sizeof(ItemIdData), + (((HeapPageHeaderData *)buf)->pd_lower - GetPageHeaderSize(buf)) / sizeof(ItemIdData)); + } + + if (real_order != NULL) { + free(real_order); + } + return true; +} + +void CompressPagePrepareConvert(char *src, bool diff_convert, bool *real_ByteConvert) +{ + char *aux_buf = NULL; + errno_t rc; + + aux_buf = (char *)malloc(BLCKSZ); + if (aux_buf == NULL) { + // add log + return; + } + rc = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(rc, "", ""); + + // do convert + *real_ByteConvert = false; + if (CompressConvertOnePage(src, aux_buf, diff_convert)) { + *real_ByteConvert = true; + } + + if (aux_buf != NULL) { + free(aux_buf); + } +} + + +/** + * CompressPage() -- Compress one page. + * + * Only the parts other than the page header will be compressed. The + * compressed data is rounded by chunck_size, The insufficient part is + * filled with zero. Compression needs to be able to save at least one + * chunk of space, otherwise it fail. + * This function returen the size of compressed data or + * -1 for compression fail + * COMPRESS_UNSUPPORTED_ERROR for unrecognized compression algorithm + */ +template +int TemplateCompressPage(const char* src, char* dst, int dst_size, RelFileCompressOption option) +{ + int compressed_size; + int8 level = option.compressLevelSymbol ? 
option.compressLevel : -option.compressLevel; + size_t sizeOfHeaderData = GetSizeOfHeadData(heapPageData); + char *src_copy = NULL; + bool real_ByteConvert = false; + errno_t rc; + char* data; + + if (option.byteConvert) { + // copy and maybe change it + src_copy = (char *)malloc(BLCKSZ); + if (src_copy == NULL) { + // add log + return -1; + } + rc = memcpy_s(src_copy, BLCKSZ, src, BLCKSZ); + securec_check(rc, "", ""); + CompressPagePrepareConvert(src_copy, option.diffConvert, &real_ByteConvert); /* preprocess convert src */ + } + + if (heapPageData) { + data = ((HeapPageCompressData*)dst)->data; + } else { + data = ((PageCompressData*)dst)->data; + } + + switch (option.compressAlgorithm) { + case COMPRESS_ALGORITHM_PGLZ: + if (real_ByteConvert) { + compressed_size = lz_compress(src_copy + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, data); + } else { + compressed_size = lz_compress(src + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, data); + } + break; + case COMPRESS_ALGORITHM_ZSTD: { + if (level == 0 || level < MIN_ZSTD_COMPRESSION_LEVEL || level > MAX_ZSTD_COMPRESSION_LEVEL) { + level = DEFAULT_ZSTD_COMPRESSION_LEVEL; + } + + if (real_ByteConvert) { + compressed_size = ZSTD_compress(data, dst_size, src_copy + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, level); + } else { + compressed_size = ZSTD_compress(data, dst_size, src + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, level); + } + + if (ZSTD_isError(compressed_size)) { + if (src_copy != NULL) { + free(src_copy); + } + return -1; + } + break; + } + default: + if (src_copy != NULL) { + free(src_copy); + } + return COMPRESS_UNSUPPORTED_ERROR; + } + + if (compressed_size < 0) { + if (src_copy != NULL) { + free(src_copy); + } + return -1; + } + + if (heapPageData) { + HeapPageCompressData* pcdptr = ((HeapPageCompressData*)dst); + rc = memcpy_s(pcdptr->page_header, sizeOfHeaderData, src, sizeOfHeaderData); + securec_check(rc, "", ""); + pcdptr->size = compressed_size; + pcdptr->byte_convert = real_ByteConvert; + pcdptr->diff_convert = option.diffConvert; + } else { + PageCompressData* pcdptr = ((PageCompressData*)dst); + rc = memcpy_s(pcdptr->page_header, sizeOfHeaderData, src, sizeOfHeaderData); + securec_check(rc, "", ""); + pcdptr->size = compressed_size; + pcdptr->byte_convert = real_ByteConvert; + pcdptr->diff_convert = option.diffConvert; + } + + if (src_copy != NULL) { + free(src_copy); + } + return SIZE_OF_PAGE_COMPRESS_DATA_HEADER_DATA(heapPageData) + compressed_size; +} + +/** + * CompressPageBufferBound() + * -- Get the destination buffer boundary to compress one page. + * Return needed destination buffer size for compress one page or + * -1 for unrecognized compression algorithm + */ +int CompressPageBufferBound(const char* page, uint8 algorithm) +{ + switch (algorithm) { + case COMPRESS_ALGORITHM_PGLZ: + return BLCKSZ + 4; + case COMPRESS_ALGORITHM_ZSTD: + return ZSTD_compressBound(BLCKSZ - GetPageHeaderSize(page)); + default: + return -1; + } +} + + diff --git a/contrib/pagehack/openGaussCompression.cpp b/contrib/pagehack/openGaussCompression.cpp new file mode 100644 index 000000000..64eb02ba7 --- /dev/null +++ b/contrib/pagehack/openGaussCompression.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved. 
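+ *
+ * openGaussCompression.cpp
+ *      pagehack support for reading, decompressing and writing back pages of
+ *      openGauss compressed relations via the page compression address (pca)
+ *      and chunk data (pcd) files.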
+ */ + +#include "openGaussCompression.h" +#include "storage/checksum_impl.h" +#include "storage/page_compression_impl.h" + +void OpenGaussCompression::SetFilePath(const char *filePath, int segNo) +{ + int rc = snprintf_s(pcaFilePath, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, filePath); + securec_check_ss_c(rc, "\0", "\0"); + rc = snprintf_s(pcdFilePath, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, filePath); + securec_check_ss_c(rc, "\0", "\0"); + + this->segmentNo = segNo; +} + +OpenGaussCompression::~OpenGaussCompression() +{ + if (pcaFd != nullptr) { + fclose(pcaFd); + } + if (pcdFd != nullptr) { + fclose(pcdFd); + } + if (header != nullptr) { + pc_munmap(header); + } +} + +bool OpenGaussCompression::TryOpen() +{ + if ((pcaFd = fopen(this->pcaFilePath, "rb+")) == nullptr) { + return false; + } + if ((pcdFd = fopen(this->pcdFilePath, "rb+")) == nullptr) { + return false; + } + if (fseeko(pcaFd, (off_t)offsetof(PageCompressHeader, chunk_size), SEEK_SET) != 0) { + return false; + } + if (fread(&chunkSize, sizeof(chunkSize), 1, this->pcaFd) <= 0) { + return false; + } + header = pc_mmap(fileno(pcaFd), chunkSize, false); + return true; +} +bool OpenGaussCompression::ReadChunkOfBlock(char *dst, size_t *dstLen, BlockNumber blockNumber) +{ + auto currentAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber); + do { + auto chunkNum = currentAddr->nchunks; + for (uint8 i = 0; i < chunkNum; i++) { + off_t seekPos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, currentAddr->chunknos[i]); + uint8 start = i; + while (i < chunkNum - 1 && currentAddr->chunknos[i + 1] == currentAddr->chunknos[i] + 1) { + i++; + } + if (fseeko(this->pcdFd, seekPos, SEEK_SET) != 0) { + return false; + } + size_t readAmount = chunkSize * (i - start + 1); + if (fread(dst + start * chunkSize, 1, readAmount, this->pcdFd) != readAmount && ferror(this->pcdFd)) { + return false; + } + *dstLen += readAmount; + } + if (chunkNum == 0 || DecompressPage(dst, decompressedBuffer, header->algorithm) == BLCKSZ) { + break; + } + } while (true); + if (PageIs8BXidHeapVersion(dst)) { + byteConvert = ((HeapPageCompressData *)dst)->byte_convert; + diffConvert = ((HeapPageCompressData *)dst)->diff_convert; + } else { + byteConvert = ((PageCompressData *)dst)->byte_convert; + diffConvert = ((PageCompressData *)dst)->diff_convert; + } + this->blockNumber = blockNumber; + return true; +} + +bool OpenGaussCompression::WriteBackCompressedData(char *source, size_t sourceLen, BlockNumber blockNumber) +{ + auto currentAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber); + for (size_t i = 0; i < currentAddr->nchunks; ++i) { + off_t seekPos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, currentAddr->chunknos[i]); + if (fseeko(this->pcdFd, seekPos, SEEK_SET) != 0) { + return false; + } + Assert(sourceLen >= i * chunkSize); + auto writeCount = fwrite(source + i * chunkSize, 1, chunkSize, this->pcdFd); + bool success = chunkSize == writeCount; + if (!success) { + return false; + } + } + fflush(this->pcdFd); + return true; +} + +void OpenGaussCompression::MarkUncompressedDirty() +{ + constexpr int writeLen = BLCKSZ / 2; + unsigned char fill_byte[writeLen] = {0xFF}; + for (int i = 0; i < writeLen; i++) + fill_byte[i] = 0xFF; + auto rc = memcpy_s(decompressedBuffer + writeLen, BLCKSZ - writeLen, fill_byte, writeLen); + securec_check(rc, "", ""); +} + +BlockNumber OpenGaussCompression::GetMaxBlockNumber() +{ + return (BlockNumber)pg_atomic_read_u32(&header->nblocks); +} + +char *OpenGaussCompression::GetPcdFilePath() +{ + return this->pcdFilePath; 
+} + +char *OpenGaussCompression::GetDecompressedPage() +{ + return this->decompressedBuffer; +} + +bool OpenGaussCompression::WriteBackUncompressedData() +{ + auto algorithm = header->algorithm; + auto workBufferSize = CompressPageBufferBound(decompressedBuffer, algorithm); + if (workBufferSize < 0) { + return false; + } + char *work_buffer = (char *)malloc(workBufferSize); + RelFileCompressOption relFileCompressOption; + relFileCompressOption.compressPreallocChunks = 0; + relFileCompressOption.compressLevelSymbol = true; + relFileCompressOption.compressLevel = 1; + relFileCompressOption.compressAlgorithm = algorithm; + relFileCompressOption.byteConvert = byteConvert; + relFileCompressOption.diffConvert = diffConvert; + + auto compress_buffer_size = CompressPage(decompressedBuffer, work_buffer, workBufferSize, relFileCompressOption); + if (compress_buffer_size < 0) { + return false; + } + uint8 nchunks = (compress_buffer_size - 1) / chunkSize + 1; + auto bufferSize = chunkSize * nchunks; + if (bufferSize >= BLCKSZ) { + /* store original page if can not save space? */ + free(work_buffer); + work_buffer = (char *)decompressedBuffer; + nchunks = BLCKSZ / chunkSize; + } else { + /* fill zero in the last chunk */ + if (compress_buffer_size < bufferSize) { + auto leftSize = bufferSize - compress_buffer_size; + errno_t rc = memset_s(work_buffer + compress_buffer_size, leftSize, 0, leftSize); + securec_check(rc, "", ""); + } + } + uint8 need_chunks = nchunks; + PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber); + if (pcAddr->allocated_chunks < need_chunks) { + auto chunkno = pg_atomic_fetch_add_u32(&header->allocated_chunks, need_chunks - pcAddr->allocated_chunks); + for (uint8 i = pcAddr->allocated_chunks; i < need_chunks; ++i) { + pcAddr->chunknos[i] = ++chunkno; + } + pcAddr->allocated_chunks = need_chunks; + pcAddr->nchunks = need_chunks; + } + return this->WriteBackCompressedData(work_buffer, compress_buffer_size, blockNumber); +} + + +#include "compression_algorithm.ini" \ No newline at end of file diff --git a/contrib/pagehack/openGaussCompression.h b/contrib/pagehack/openGaussCompression.h new file mode 100644 index 000000000..016c04faf --- /dev/null +++ b/contrib/pagehack/openGaussCompression.h @@ -0,0 +1,40 @@ +#ifndef OPENGAUSS_SERVER_OPENGAUSSCOMPRESSION_H +#define OPENGAUSS_SERVER_OPENGAUSSCOMPRESSION_H +#define FRONTEND 1 + + +#include +#include "c.h" +#include "storage/buf/block.h" +#include "storage/page_compression.h" + +class OpenGaussCompression { +private: + FILE* pcaFd = nullptr; + FILE* pcdFd = nullptr; + char pcaFilePath[MAXPGPATH]; + char pcdFilePath[MAXPGPATH]; + PageCompressHeader* header = nullptr; + +private: + int segmentNo; + BlockNumber blockNumber; + decltype(PageCompressHeader::chunk_size) chunkSize; + char decompressedBuffer[BLCKSZ]; + bool byteConvert; + bool diffConvert; + +public: + void SetFilePath(const char* filePath, int segNo); + virtual ~OpenGaussCompression(); + bool TryOpen(); + bool ReadChunkOfBlock(char* dst, size_t* dstLen, BlockNumber blockNumber); + bool WriteBackCompressedData(char* source, size_t sourceLen, BlockNumber blockNumber); + bool WriteBackUncompressedData(); + void MarkUncompressedDirty(); + BlockNumber GetMaxBlockNumber(); + char* GetPcdFilePath(); + char* GetDecompressedPage(); +}; + +#endif // OPENGAUSS_SERVER_OPENGAUSSCOMPRESSION_H diff --git a/contrib/pagehack/pagehack.cpp b/contrib/pagehack/pagehack.cpp index d7e69bab2..77013434d 100644 --- a/contrib/pagehack/pagehack.cpp +++ 
b/contrib/pagehack/pagehack.cpp @@ -90,6 +90,9 @@ #include "tsdb/utils/constant_def.h" #endif +#include "openGaussCompression.h" + + /* Max number of pg_class oid, currently about 4000 */ #define MAX_PG_CLASS_ID 10000 /* Number of pg_class types */ @@ -129,6 +132,7 @@ static const char* PgHeapRelName[] = {"pg_class", "pg_am", "pg_statistic", "pg_toast"}; +typedef enum SegmentType { SEG_HEAP, SEG_FSM, SEG_UHEAP, SEG_INDEX_BTREE, SEG_UNDO, SEG_UNKNOWN } SegmentType; static void ParsePgClassTupleData(binary tupdata, int len, binary nullBitmap, int natrrs); static void ParsePgIndexTupleData(binary tupdata, int len, binary nullBitmap, int nattrs); @@ -146,6 +150,8 @@ static void ParseToastTupleData(binary tupdata, int len, binary nullBitmap, int static void ParseTDSlot(const char *page); static void ParseToastIndexTupleData(binary tupdata, int len, binary nullBitmap, int nattrs); +static int parse_uncompressed_page_file(const char *filename, SegmentType type, const uint32 start_point, + const uint32 number_read); static ParseHeapTupleData PgHeapRelTupleParser[] = { ParsePgClassTupleData, // pg_class @@ -894,8 +900,6 @@ static const char* HACKINGTYPE[] = {"heap", "segment" }; -typedef enum SegmentType { SEG_HEAP, SEG_FSM, SEG_UHEAP, SEG_INDEX_BTREE, SEG_UNDO, SEG_UNKNOWN } SegmentType; - const char* PageTypeNames[] = {"DATA", "FSM", "VM"}; #define GETHEAPSTRUCT(TUP) ((unsigned char*)(TUP) + (TUP)->t_hoff) @@ -3093,7 +3097,78 @@ static int parse_a_page(const char* buffer, int blkno, int blknum, SegmentType t return true; } +static BlockNumber CalculateMaxBlockNumber(BlockNumber blknum, BlockNumber start, BlockNumber number) +{ + /* parse */ + if (start >= blknum) { + fprintf(stderr, "start point exceeds the total block number of relation.\n"); + return InvalidBlockNumber; + } else if ((start + number) > blknum) { + fprintf(stderr, "don't have %d blocks from block %d in the relation, only %d blocks\n", number, start, + (blknum - start)); + number = blknum; + } else if (number == 0) { + number = blknum; + } else { + number += start; + } + return number; +} + static int parse_page_file(const char* filename, SegmentType type, const uint32 start_point, const uint32 number_read) +{ + if (type != SEG_HEAP && type != SEG_INDEX_BTREE) { + return parse_uncompressed_page_file(filename, type, start_point, number_read); + } + + auto openGaussCompression = new OpenGaussCompression(); + openGaussCompression->SetFilePath(filename, SegNo); + bool success = openGaussCompression->TryOpen(); + if (!success) { + delete openGaussCompression; + return parse_uncompressed_page_file(filename, type, start_point, number_read); + } + + BlockNumber start = start_point; + BlockNumber blknum = openGaussCompression->GetMaxBlockNumber(); + BlockNumber number = CalculateMaxBlockNumber(blknum, start, number_read); + if (number == InvalidBlockNumber) { + delete openGaussCompression; + return false; + } + char compressed[BLCKSZ]; + size_t compressedLen; + while (start < number) { + if (!openGaussCompression->ReadChunkOfBlock(compressed, &compressedLen, start)) { + fprintf(stderr, "read block %d failed, filename: %s: %s\n", start, openGaussCompression->GetPcdFilePath(), + strerror(errno)); + delete openGaussCompression; + return false; + } + if (!parse_a_page(openGaussCompression->GetDecompressedPage(), start, blknum, type)) { + fprintf(stderr, "Error during parsing block %d/%d\n", start, blknum); + delete openGaussCompression; + return false; + } + if ((write_back && num_item) || dirty_page) { + if (dirty_page) { + 
openGaussCompression->MarkUncompressedDirty(); + } + if (!openGaussCompression->WriteBackUncompressedData()) { + fprintf(stderr, "write back failed, filename: %s: %s\n", openGaussCompression->GetPcdFilePath(), + strerror(errno)); + delete openGaussCompression; + return false; + } + } + start++; + } + delete openGaussCompression; + return true; +} + +static int parse_uncompressed_page_file(const char *filename, SegmentType type, const uint32 start_point, + const uint32 number_read) { char buffer[BLCKSZ]; FILE* fd = NULL; @@ -3121,21 +3196,9 @@ static int parse_page_file(const char* filename, SegmentType type, const uint32 blknum = size / BLCKSZ; /* parse */ - if (start >= blknum) { - fprintf(stderr, "start point exceeds the total block number of relation.\n"); - fclose(fd); + number = CalculateMaxBlockNumber(blknum, start, number); + if (number == InvalidBlockNumber) { return false; - } else if ((start + number) > blknum) { - fprintf(stderr, - "don't have %d blocks from block %d in the relation, only %d blocks\n", - number, - start, - (blknum - start)); - number = blknum; - } else if (number == 0) { - number = blknum; - } else { - number += start; } Assert((start * BLCKSZ) < size); diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 3cfe30121..f833d44d0 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -717,7 +717,7 @@ else # not PGXS endif endif -override CPPFLAGS := $(CPPFLAGS) -I$(LIBODBC_INCLUDE_PATH) -I$(LIBOBS_INCLUDE_PATH) -I$(LIBCGROUP_INCLUDE_PATH) -I$(LIBOPENSSL_INCLUDE_PATH) -I${LIBORC_INCLUDE_PATH} -I${LIBPARQUET_INCLUDE_PATH} -I${PROTOBUF_INCLUDE_PATH} -I${BOOST_INCLUDE_PATH} -I$(LIBLLVM_INCLUDE_PATH) -I$(KERBEROS_INCLUDE_PATH) -I$(CJSON_INCLUDE_PATH) -I$(NUMA_INCLUDE_PATH) -I$(ZLIB_INCLUDE_PATH) -I$(LZ4_INCLUDE_PATH) -I$(LIBCURL_INCLUDE_PATH) -I$(DCF_INCLUDE_PATH) +override CPPFLAGS := $(CPPFLAGS) -I$(LIBODBC_INCLUDE_PATH) -I$(LIBOBS_INCLUDE_PATH) -I$(LIBCGROUP_INCLUDE_PATH) -I$(LIBOPENSSL_INCLUDE_PATH) -I${LIBORC_INCLUDE_PATH} -I${LIBPARQUET_INCLUDE_PATH} -I${PROTOBUF_INCLUDE_PATH} -I${BOOST_INCLUDE_PATH} -I$(LIBLLVM_INCLUDE_PATH) -I$(KERBEROS_INCLUDE_PATH) -I$(CJSON_INCLUDE_PATH) -I$(NUMA_INCLUDE_PATH) -I$(ZLIB_INCLUDE_PATH) -I$(LZ4_INCLUDE_PATH) -I$(LIBCURL_INCLUDE_PATH) -I$(DCF_INCLUDE_PATH) -I$(ZSTD_INCLUDE_PATH) # GDS links to libevent ifeq ($(enable_multiple_nodes), yes) @@ -852,6 +852,9 @@ endif # append zlib for compression: zlib LDFLAGS += -L$(ZLIB_LIB_PATH) -I$(ZLIB_INCLUDE_PATH) +#append zstd for compression: zstd +LDFLAGS += -L$(ZSTD_LIB_PATH) -I$(ZSTD_INCLUDE_PATH) + LDFLAGS += -L$(SECURE_LIB_PATH) LDFLAGS += -L$(LIBOPENSSL_LIB_PATH) LDFLAGS += -L$(LIBSTD_LIB_PATH) diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile index 29f927f57..3d0bcdd99 100644 --- a/src/bin/pg_rewind/Makefile +++ b/src/bin/pg_rewind/Makefile @@ -26,7 +26,7 @@ ifneq "$(MAKECMDGOALS)" "clean" endif endif endif -OBJS = file_ops.o datapagemap.o fetch.o filemap.o logging.o parsexlog.o pg_rewind.o +OBJS = file_ops.o datapagemap.o fetch.o filemap.o logging.o parsexlog.o pg_rewind.o compressed_rewind.o #all:gs_rewind.a diff --git a/src/bin/pg_rewind/compressed_common.h b/src/bin/pg_rewind/compressed_common.h new file mode 100644 index 000000000..cbb7c421e --- /dev/null +++ b/src/bin/pg_rewind/compressed_common.h @@ -0,0 +1,46 @@ +/* ------------------------------------------------------------------------- + * + * compressed_common.h + * + * Copyright (c) 2021 Huawei Technologies Co.,Ltd. 
+ * + * ------------------------------------------------------------------------- + */ +#ifndef OPENGAUSS_SERVER_COMPRESS_COMPRESSED_COMMON_H +#define OPENGAUSS_SERVER_COMPRESS_COMPRESSED_COMMON_H + +#include "utils/atomic.h" + + + +struct RewindCompressInfo { + bool compressed = false; /* compressed table or not */ + uint32 oldBlockNumber = 0; + uint32 newBlockNumber = 0; + uint8 algorithm = 0; /* compressed algorithm */ + uint16 chunkSize = 0; /* compressed chunk size */ +}; + +struct CompressedPcaInfo { + char *pcaMap = NULL; + int pcaFd = -1; + char path[MAXPGPATH]; + int32 chunkSize = 0; + int32 algorithm = 0; +}; + +#define COPY_REWIND_COMPRESS_INFO(entry, infoPointer, oldBlock, newBlock) \ + (entry)->rewindCompressInfo.oldBlockNumber = 0; \ + (entry)->rewindCompressInfo.newBlockNumber = 0; \ + (entry)->rewindCompressInfo.compressed = false; \ + (entry)->rewindCompressInfo.algorithm = 0; \ + (entry)->rewindCompressInfo.chunkSize = 0; \ + if ((infoPointer) != NULL && (infoPointer)->compressed) { \ + (entry)->rewindCompressInfo.oldBlockNumber = (oldBlock); \ + (entry)->rewindCompressInfo.newBlockNumber = (newBlock); \ + (entry)->rewindCompressInfo.compressed = true; \ + (entry)->rewindCompressInfo.algorithm = (infoPointer)->algorithm; \ + (entry)->rewindCompressInfo.chunkSize = (infoPointer)->chunkSize; \ + } + +#endif // OPENGAUSS_SERVER_COMPRESS_COMPRESSED_COMMON_H diff --git a/src/bin/pg_rewind/compressed_rewind.cpp b/src/bin/pg_rewind/compressed_rewind.cpp new file mode 100644 index 000000000..c73e4d29c --- /dev/null +++ b/src/bin/pg_rewind/compressed_rewind.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * ------------------------------------------------------------------------- + * + * compressed_rewind.cpp + * Functions for fetching compressed table. 
+ * + * + * IDENTIFICATION + * ./src/bin/pg_rewind/compressed_rewind.cpp + * + * ------------------------------------------------------------------------- + */ +#include "compressed_rewind.h" +#include "libpq/libpq-fe.h" +#include "lib/string.h" +#include "logging.h" +#include "filemap.h" +#include "utils/elog.h" +#include "file_ops.h" + +void FormatPathToPca(const char* path, char* dst, size_t len, bool withPrefix) +{ + errno_t rc; + if (withPrefix) { + rc = snprintf_s(dst, len, len - 1, "%s/" PCA_SUFFIX, pg_data, path); + } else { + rc = snprintf_s(dst, len, len - 1, PCA_SUFFIX, path); + } + securec_check_ss_c(rc, "\0", "\0"); +} + +void FormatPathToPcd(const char* path, char* dst, size_t len, bool withPrefix) +{ + errno_t rc; + if (withPrefix) { + rc = snprintf_s(dst, len, len - 1, "%s/" PCD_SUFFIX, pg_data, path); + } else { + rc = snprintf_s(dst, len, len - 1, PCD_SUFFIX, path); + } + securec_check_ss_c(rc, "\0", "\0"); +} + +template <typename T> +bool ReadCompressedInfo(T& t, off_t offset, FILE* file, char* pcaFilePath, size_t len) +{ + if (fseeko(file, offset, SEEK_SET) != 0) { + pg_fatal("could not seek in file \"%s\": \"%lu\": %s\n", pcaFilePath, len, strerror(errno)); + return false; + } + if (fread(&t, sizeof(t), 1, file) <= 0) { + pg_fatal("could not read file \"%s\": \"%lu\": %s\n", pcaFilePath, len, strerror(errno)); + return false; + } + return true; +} + +/** + * read RewindCompressInfo from the pca file header + * @param file pca file pointer + * @param pcaFilePath file path, for error reporting + * @param rewindCompressInfo out parameter that receives the result + * @return success or not + */ +static bool ReadRewindCompressedInfo(FILE* file, char* pcaFilePath, size_t len, RewindCompressInfo* rewindCompressInfo) +{ + off_t offset = (off_t)offsetof(PageCompressHeader, chunk_size); + if (!ReadCompressedInfo(rewindCompressInfo->chunkSize, offset, file, pcaFilePath, len)) { + return false; + } + offset = (off_t)offsetof(PageCompressHeader, algorithm); + if (!ReadCompressedInfo(rewindCompressInfo->algorithm, offset, file, pcaFilePath, len)) { + return false; + } + offset = (off_t)offsetof(PageCompressHeader, nblocks); + if (!ReadCompressedInfo(rewindCompressInfo->oldBlockNumber, offset, file, pcaFilePath, len)) { + return false; + } + rewindCompressInfo->compressed = true; + return true; +} + +bool FetchSourcePca(const char* strValue, RewindCompressInfo* rewindCompressInfo) +{ + size_t length = 0; + PageCompressHeader* ptr = (PageCompressHeader*)PQunescapeBytea((const unsigned char*)strValue, &length); + rewindCompressInfo->compressed = false; + if (length == sizeof(PageCompressHeader)) { + rewindCompressInfo->compressed = true; + rewindCompressInfo->algorithm = ptr->algorithm; + rewindCompressInfo->newBlockNumber = ptr->nblocks; + rewindCompressInfo->oldBlockNumber = 0; + rewindCompressInfo->chunkSize = ptr->chunk_size; + } + PQfreemem(ptr); + return rewindCompressInfo->compressed; +} + +bool ProcessLocalPca(const char* tablePath, RewindCompressInfo* rewindCompressInfo) +{ + rewindCompressInfo->compressed = false; + if (!isRelDataFile(tablePath)) { + return false; + } + char pcaFilePath[MAXPGPATH]; + FormatPathToPca(tablePath, pcaFilePath, MAXPGPATH, true); + FILE* file = fopen(pcaFilePath, "rb"); + if (file == NULL) { + if (errno == ENOENT) { + return false; + } + pg_fatal("could not open file \"%s\": %s\n", pcaFilePath, strerror(errno)); + return false; + } + bool success = ReadRewindCompressedInfo(file, pcaFilePath, MAXPGPATH, rewindCompressInfo); + fclose(file); + return success; +} \ No newline at end of file diff --git 
a/src/bin/pg_rewind/compressed_rewind.h b/src/bin/pg_rewind/compressed_rewind.h new file mode 100644 index 000000000..967c0b76f --- /dev/null +++ b/src/bin/pg_rewind/compressed_rewind.h @@ -0,0 +1,21 @@ +/* ------------------------------------------------------------------------- + * + * compressed_rewind.h + * + * Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * + * ------------------------------------------------------------------------- + */ +#ifndef OPENGAUSS_SERVER_COMPRESS_COMPRESSED_REWIND_H +#define OPENGAUSS_SERVER_COMPRESS_COMPRESSED_REWIND_H + +#include "compressed_common.h" +#include "storage/page_compression.h" +#include "storage/smgr/relfilenode.h" + +extern bool FetchSourcePca(const char* strValue, RewindCompressInfo* rewindCompressInfo); +extern bool ProcessLocalPca(const char* tablePath, RewindCompressInfo* rewindCompressInfo); +extern void FormatPathToPca(const char* path, char* dst, size_t len, bool withPrefix = false); +extern void FormatPathToPcd(const char* path, char* dst, size_t len, bool withPrefix = false); + +#endif // OPENGAUSS_SERVER_COMPRESS_COMPRESSED_REWIND_H diff --git a/src/bin/pg_rewind/fetch.cpp b/src/bin/pg_rewind/fetch.cpp index 6d2cfe12e..4ad5aa894 100755 --- a/src/bin/pg_rewind/fetch.cpp +++ b/src/bin/pg_rewind/fetch.cpp @@ -23,6 +23,7 @@ #include "libpq/libpq-fe.h" #include "libpq/libpq-int.h" #include "common/fe_memutils.h" +#include "compressed_rewind.h" #include "catalog/catalog.h" #include "catalog/pg_type.h" @@ -47,11 +48,11 @@ const uint64 MAX_FILE_SIZE = 0xFFFFFFFF; #define MAX_PARAM_LEN 1024 static BuildErrorCode receiveFileChunks(const char* sql, FILE* file); -static BuildErrorCode execute_pagemap(datapagemap_t* pagemap, const char* path, FILE* file); +static BuildErrorCode execute_pagemap(file_entry_t* entry, FILE* file); static char* run_simple_query(const char* sql); static BuildErrorCode recurse_dir(const char* datadir, const char* path, process_file_callback_t callback); static void get_slot_name_by_app_name(void); - +static BuildErrorCode CheckResultSet(PGresult* pgResult); BuildErrorCode libpqConnect(const char* connstr) { PGresult* res = NULL; @@ -246,10 +247,22 @@ BuildErrorCode fetchSourceFileList() * general, so if the admin has put any custom symbolic links in the data * directory, they won't be copied correctly. 
*/ - sql = "SELECT path, size, isdir, pg_tablespace_location(pg_tablespace.oid) AS link_target \n" + /* skip pca/pcd files and concat pca with table file */ + sql = "WITH tmp_table AS (\n" + "SELECT path, size, isdir, pg_tablespace_location(pg_tablespace.oid) AS link_target \n" "FROM (SELECT * FROM pg_stat_file_recursive('.')) AS files \n" - "LEFT OUTER JOIN pg_tablespace ON files.path like 'pg_tblspc/%' AND oid::text = files.filename\n"; - res = PQexec(conn, sql); + "LEFT OUTER JOIN pg_tablespace ON files.path ~ '^pg_tblspc/' AND oid :: text = files.filename\n" + "),compressed_address AS (SELECT path pca_path, substr(path, 0, length(path) - 4) AS table_path\n" + "FROM pg_stat_file_recursive('.') WHERE path ~ '_pca$' AND length(path) > 4)\n" + "SELECT path, size, isdir, link_target,\n" + "CASE WHEN pca_path IS NOT NULL THEN pg_read_binary_file(pca_path, 0, %d, true)\n" + "ELSE NULL END AS pchdr\n" + "FROM tmp_table LEFT JOIN compressed_address\n" + "ON tmp_table.path = compressed_address.table_path\nWHERE path !~ '_pca$' AND path !~ '_pcd$'\n"; + char sqlbuf[1024]; + int rc = snprintf_s(sqlbuf, sizeof(sqlbuf), sizeof(sqlbuf) - 1, sql, SIZE_OF_PAGE_COMPRESS_HEADER_DATA); + securec_check_ss_c(rc, "\0", "\0"); + res = PQexec(conn, (const char*)sqlbuf); if (PQresultStatus(res) != PGRES_TUPLES_OK) { pg_log(PG_ERROR, "could not fetch file list: %s", PQresultErrorMessage(res)); @@ -257,7 +270,7 @@ BuildErrorCode fetchSourceFileList() } /* sanity check the result set */ - if (PQnfields(res) != 4) { + if (PQnfields(res) != 5) { pg_fatal("unexpected result set while fetching file list\n"); PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); } @@ -300,7 +313,13 @@ BuildErrorCode fetchSourceFileList() } } } - process_source_file(path, type, filesize, link_target); + RewindCompressInfo rewindCompressInfo; + RewindCompressInfo *pointer = NULL; + if (!PQgetisnull(res, i, 4) && FetchSourcePca(PQgetvalue(res, i, 4), &rewindCompressInfo)) { + filesize = rewindCompressInfo.newBlockNumber * BLCKSZ; + pointer = &rewindCompressInfo; + } + process_source_file(path, type, filesize, link_target, pointer); PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); } PQclear(res); @@ -356,7 +375,7 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file) } /* sanity check the result set */ - if (PQnfields(res) != 4 || PQntuples(res) != 1) { + if (PQnfields(res) != 7 || PQntuples(res) != 1) { pg_fatal("unexpected result set size while fetching remote files\n"); PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); } @@ -385,6 +404,8 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file) pg_fatal("unexpected result length while fetching remote files\n"); PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); } + /* check compressed result set */ + CheckResultSet(res); /* Read result set to local variables */ errorno = memcpy_s(&chunkoff, sizeof(int32), PQgetvalue(res, 0, 1), sizeof(int32)); @@ -420,15 +441,37 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file) continue; } - pg_log(PG_DEBUG, "received chunk for file \"%s\", offset %d, size %d\n", - filename, chunkoff, chunksize); - fprintf(file, "received chunk for file \"%s\", offset %d, size %d\n", - filename, chunkoff, chunksize); + int32 algorithm; + errorno = memcpy_s(&algorithm, sizeof(int32), PQgetvalue(res, 0, 4), sizeof(int32)); + securec_check_c(errorno, "\0", "\0"); + algorithm = ntohl(algorithm); + if (algorithm == 0) { + pg_log(PG_DEBUG, "received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize); + fprintf(file, 
"received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize); + open_target_file(filename, false); + pg_free(filename); + PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); + write_target_range(chunk, chunkoff, chunksize, chunkspace); + } else { + int32 chunkSize; + int errorno = memcpy_s(&chunkSize, sizeof(int32), PQgetvalue(res, 0, 5), sizeof(int32)); + securec_check_c(errorno, "\0", "\0"); + chunkSize = ntohl(chunkSize); + bool rebuild = *PQgetvalue(res, 0, 6) != 0; + char dst[MAXPGPATH]; + /* open pca */ + FormatPathToPca(filename, dst, MAXPGPATH, false); + OpenCompressedPcaFile(dst, chunkSize, algorithm, rebuild); - open_target_file(filename, false); - pg_free(filename); - PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); - write_target_range(chunk, chunkoff, chunksize, chunkspace); + /* open pcd */ + FormatPathToPcd(filename, dst, MAXPGPATH, false); + open_target_file(dst, false); + BlockNumber blockNumber = chunkoff; + size_t blockSize = chunkspace; + + /* fetch result */ + FetchCompressedFile(chunk, blockNumber, blockSize); + } PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); PQclear(res); res = NULL; @@ -436,6 +479,32 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file) return BUILD_SUCCESS; } +/** + * check result set of compressed tables + * @param pgResult result + * @return success or not + */ +static BuildErrorCode CheckResultSet(PGresult* res) +{ +#define PQ_TYPE(index, type) (PQftype(res, (index)) != (type)) + if (PQ_TYPE(4, INT4OID) || PQ_TYPE(5, INT4OID) || PQ_TYPE(6, BOOLOID)) { + pg_fatal( + "FetchCompressedFile:unexpected data types: %u %u %u\n", PQftype(res, 4), PQftype(res, 5), PQftype(res, 6)); + PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); + } +#define PQ_FORMAT(index) (PQfformat(res, 0) != 1) + if (PQ_FORMAT(4) && PQ_FORMAT(5) && PQ_FORMAT(6)) { + pg_fatal("unexpected result format while fetching remote files\n"); + PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); + } +#define PQ_ISNULL(index) (PQgetisnull(res, 0, (index))) + if (PQ_ISNULL(4) || PQ_ISNULL(5) || PQ_ISNULL(6)) { + pg_fatal("unexpected null values in result while fetching remote files\n"); + PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); + } + return BUILD_SUCCESS; +} + /* * Receive a single file as a malloc'd buffer. */ @@ -489,6 +558,43 @@ error: return result; } +static void CompressedFileCopy(const file_entry_t* entry, bool rebuild) +{ + Assert(!rebuild || entry->rewindCompressInfo.oldBlockNumber == 0); + if (dry_run) { + return; + } + + char linebuf[MAXPGPATH + 47]; + int ret = snprintf_s(linebuf, + sizeof(linebuf), + sizeof(linebuf) - 1, + "%s\t%u\t%u\t%u\t%u\t%u\n", + entry->path, + entry->rewindCompressInfo.oldBlockNumber, + entry->rewindCompressInfo.newBlockNumber - entry->rewindCompressInfo.oldBlockNumber, + entry->rewindCompressInfo.algorithm, + entry->rewindCompressInfo.chunkSize, + rebuild); + securec_check_ss_c(ret, "\0", "\0"); + if (PQputCopyData(conn, linebuf, strlen(linebuf)) != 1) { + pg_fatal("could not send COPY data: %s", PQerrorMessage(conn)); + } + pg_log(PG_PROGRESS, "CompressedFileCopy:%s", linebuf); +} + +static void CompressedFileRemove(const file_entry_t* entry) +{ + remove_target((file_entry_t*) entry); + char* path = entry->path; + char dst[MAXPGPATH]; + FormatPathToPca(path, dst, MAXPGPATH); + remove_target_file(dst, false); + FormatPathToPcd(path, dst, MAXPGPATH); + remove_target_file(dst, false); + pg_log(PG_PROGRESS, "CompressedFileRemove: %s\n", path); +} + /* * Write a file range to a temporary table in the server. 
* @@ -498,7 +604,7 @@ error: */ static void fetch_file_range(const char* path, unsigned int begin, unsigned int end) { - char linebuf[MAXPGPATH + 23]; + char linebuf[MAXPGPATH + 47]; int ss_c = 0; /* Split the range into CHUNKSIZE chunks */ @@ -510,12 +616,12 @@ static void fetch_file_range(const char* path, unsigned int begin, unsigned int } else { len = end - begin; } - ss_c = snprintf_s(linebuf, sizeof(linebuf), sizeof(linebuf) - 1, "%s\t%u\t%u\n", path, begin, len); + ss_c = snprintf_s( + linebuf, sizeof(linebuf), sizeof(linebuf) - 1, "%s\t%u\t%u\t%u\t%u\t%u\n", path, begin, len, 0, 0, 0); securec_check_ss_c(ss_c, "\0", "\0"); if (PQputCopyData(conn, linebuf, strlen(linebuf)) != 1) pg_fatal("could not send COPY data: %s", PQerrorMessage(conn)); - begin += len; } } @@ -534,7 +640,8 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file) * First create a temporary table, and load it with the blocks that we * need to fetch. */ - sql = "CREATE TEMPORARY TABLE fetchchunks(path text, begin int4, len int4);"; + sql = "CREATE TEMPORARY TABLE fetchchunks(path text, begin int4, len int4, " + "algorithm int4, chunksize int4, rebuild bool);"; res = PQexec(conn, sql); if (PQresultStatus(res) != PGRES_COMMAND_OK) { pg_fatal("could not create temporary table: %s", PQresultErrorMessage(res)); @@ -558,11 +665,16 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file) entry = map->array[i]; /* report all the path to check whether it's correct */ + if (entry->rewindCompressInfo.compressed) { + pg_log(PG_PROGRESS, "path: %s, type: %d, action: %d\n", entry->path, entry->type, entry->action); + + } pg_log(PG_DEBUG, "path: %s, type: %d, action: %d\n", entry->path, entry->type, entry->action); fprintf(file, "path: %s, type: %d, action: %d\n", entry->path, entry->type, entry->action); /* If this is a relation file, copy the modified blocks */ - execute_pagemap(&entry->pagemap, entry->path, file); + bool compressed = entry->rewindCompressInfo.compressed; + execute_pagemap(entry, file); PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); switch (entry->action) { @@ -571,29 +683,47 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file) break; case FILE_ACTION_COPY: - /* Truncate the old file out of the way, if any */ - open_target_file(entry->path, true); - PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); - fetch_file_range(entry->path, 0, entry->newsize); - PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); + if (compressed) { + CompressedFileCopy(entry, true); + PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); + } else { + /* Truncate the old file out of the way, if any */ + open_target_file(entry->path, true); + PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); + fetch_file_range(entry->path, 0, entry->newsize); + PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); + } break; case FILE_ACTION_TRUNCATE: - truncate_target_file(entry->path, entry->newsize); + if (compressed) { + CompressedFileTruncate(entry->path, &entry->rewindCompressInfo); + } else { + truncate_target_file(entry->path, entry->newsize); + } PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); break; case FILE_ACTION_COPY_TAIL: - fetch_file_range(entry->path, entry->oldsize, entry->newsize); + if (compressed) { + CompressedFileCopy(entry, false); + } else { + fetch_file_range(entry->path, entry->oldsize, entry->newsize); + } PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); break; case FILE_ACTION_REMOVE: - remove_target(entry); + if (compressed) { + CompressedFileRemove(entry); + } else { + remove_target(entry); + } PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); break; case 
FILE_ACTION_CREATE: + Assert(!compressed); create_target(entry); PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res); break; @@ -625,9 +755,14 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file) * temporary table. Now, actually fetch all of those ranges. */ sql = "SELECT path, begin, \n" - " pg_read_binary_file(path, begin, len, true) AS chunk,\n" - " len \n" - "FROM fetchchunks\n"; + " pg_read_binary_file(path, begin, len, true) AS chunk, len, algorithm, chunksize,rebuild \n" + "FROM fetchchunks where algorithm =0 \n" + "union all \n" + "select (json->>'path')::text as path, (json->>'blocknum')::int4 as begin, (json->>'data')::bytea as chunk,\n" + "(json->>'len')::int4 as len, algorithm, chunksize,rebuild \n" + "from (select row_to_json(pg_read_binary_file_blocks(path,begin,len)) json, algorithm, chunksize,rebuild \n" + "from fetchchunks where algorithm !=0) \n" + "order by path, begin;"; fprintf(file, "fetch and write file based on temporary table fetchchunks.\n"); return receiveFileChunks(sql, file); @@ -687,7 +822,7 @@ BuildErrorCode backupFileMap(filemap_t* map) /* to be supported later */ break; - case FILE_ACTION_COPY: + case FILE_ACTION_COPY: { /* create fake file for restore when file not exist, otherwise, backup file */ file_entry_t statbuf; if (targetFilemapSearch(entry->path, &statbuf) < 0) { @@ -696,6 +831,7 @@ BuildErrorCode backupFileMap(filemap_t* map) backup_target_file(entry->path, divergeXlogFileName); } break; + } case FILE_ACTION_COPY_TAIL: case FILE_ACTION_TRUNCATE: @@ -719,17 +855,60 @@ BuildErrorCode backupFileMap(filemap_t* map) return BUILD_SUCCESS; } -static BuildErrorCode execute_pagemap(datapagemap_t* pagemap, const char* path, FILE* file) +/** + * combine contiguous block numbers and copy the blocks as ranges + * @param entry file entry + * @param file file for logging + */ +static void CompressedFileCopy(file_entry_t* entry, FILE* file) +{ + datapagemap_t* pagemap = &entry->pagemap; + datapagemap_iterator_t* iter = datapagemap_iterate(pagemap); + + BlockNumber blkno; + file_entry_t fileEntry; + fileEntry.path = entry->path; + fileEntry.rewindCompressInfo = entry->rewindCompressInfo; + int invalidNumber = -1; + long int before = invalidNumber; + while (datapagemap_next(iter, &blkno)) { + fprintf(file, " block %u\n", blkno); + if (before == -1) { + fileEntry.rewindCompressInfo.oldBlockNumber = blkno; + before = blkno; + } else { + if (before == blkno - 1) { + before = blkno; + } else { + fileEntry.rewindCompressInfo.newBlockNumber = before + 1; + CompressedFileCopy(&fileEntry, false); + fileEntry.rewindCompressInfo.oldBlockNumber = blkno; + before = blkno; + } + } + } + if (before != invalidNumber) { + fileEntry.rewindCompressInfo.newBlockNumber = before + 1; + CompressedFileCopy(&fileEntry, false); + } +} +static BuildErrorCode execute_pagemap(file_entry_t* entry, FILE* file) { datapagemap_iterator_t* iter = NULL; BlockNumber blkno; off_t offset; + datapagemap_t* pagemap = &entry->pagemap; + char* path = entry->path; iter = datapagemap_iterate(pagemap); - while (datapagemap_next(iter, &blkno)) { - fprintf(file, " block %u\n", blkno); - offset = blkno * BLCKSZ; - fetch_file_range(path, offset, offset + BLCKSZ); + if (entry->rewindCompressInfo.compressed) { + CompressedFileCopy(entry, file); + } else { + while (datapagemap_next(iter, &blkno)) { + fprintf(file, " block %u\n", blkno); + offset = blkno * BLCKSZ; + fetch_file_range(path, offset, offset + BLCKSZ); + } } pg_free(iter); return BUILD_SUCCESS; @@ -775,9 +954,19 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* 
parentpath, p struct stat fst; char fullpath[MAXPGPATH]; char path[MAXPGPATH]; + const size_t MINPCANAMESIZE = 4; if (strcmp(xlde->d_name, ".") == 0 || strcmp(xlde->d_name, "..") == 0) continue; + /* Skip compressed page files */ + size_t dirNamePath = strlen(xlde->d_name); + if (dirNamePath >= MINPCANAMESIZE) { + const char* suffix = xlde->d_name + dirNamePath - MINPCANAMESIZE; + if (strncmp(suffix, "_pca", MINPCANAMESIZE) == 0 || strncmp(suffix, "_pcd", MINPCANAMESIZE) == 0) { + continue; + } + } + ss_c = snprintf_s(fullpath, MAXPGPATH, MAXPGPATH - 1, "%s/%s", fullparentpath, xlde->d_name); securec_check_ss_c(ss_c, "\0", "\0"); @@ -808,8 +997,15 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p continue; if (S_ISREG(fst.st_mode)) { - if ((uint64)fst.st_size <= MAX_FILE_SIZE) { - callback(path, FILE_TYPE_REGULAR, fst.st_size, NULL); + uint64 fileSize = (uint64)fst.st_size; + RewindCompressInfo rewindCompressInfo; + RewindCompressInfo *pointer = NULL; + if (ProcessLocalPca(path, &rewindCompressInfo)) { + fileSize = rewindCompressInfo.oldBlockNumber * BLCKSZ; + pointer = &rewindCompressInfo; + } + if (fileSize <= MAX_FILE_SIZE) { + callback(path, FILE_TYPE_REGULAR, fileSize, NULL, pointer); if (increment_return_code != BUILD_SUCCESS) { (void)closedir(xldir); } @@ -818,7 +1014,7 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p pg_log(PG_WARNING, "file size of \"%s\" is over %ld\n", fullpath, MAX_FILE_SIZE); } } else if (S_ISDIR(fst.st_mode)) { - callback(path, FILE_TYPE_DIRECTORY, 0, NULL); + callback(path, FILE_TYPE_DIRECTORY, 0, NULL, NULL); if (increment_return_code != BUILD_SUCCESS) { (void)closedir(xldir); } @@ -843,7 +1039,7 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p } link_target[len] = '\0'; - callback(path, FILE_TYPE_SYMLINK, 0, link_target); + callback(path, FILE_TYPE_SYMLINK, 0, link_target, NULL); /* * If it's a symlink within pg_tblspc, we need to recurse into it, diff --git a/src/bin/pg_rewind/fetch.h b/src/bin/pg_rewind/fetch.h index 713d78c16..c383e2087 100755 --- a/src/bin/pg_rewind/fetch.h +++ b/src/bin/pg_rewind/fetch.h @@ -42,7 +42,9 @@ extern XLogRecPtr libpqGetCurrentXlogInsertLocation(void); extern void libpqRequestCheckpoint(void); -typedef void (*process_file_callback_t)(const char* path, file_type_t type, size_t size, const char* link_target); +typedef void (*process_file_callback_t)(const char* path, file_type_t type, size_t oldsize, const char* link_target, + const RewindCompressInfo* rewindCompressInfo); + extern BuildErrorCode traverse_datadir(const char* datadir, process_file_callback_t callback); extern void get_source_slotname(void); diff --git a/src/bin/pg_rewind/file_ops.cpp b/src/bin/pg_rewind/file_ops.cpp index 2f7f6e4b0..50ef7dae5 100644 --- a/src/bin/pg_rewind/file_ops.cpp +++ b/src/bin/pg_rewind/file_ops.cpp @@ -25,6 +25,8 @@ #include "common/fe_memutils.h" #include "common/build_query/build_query.h" +#include "compressed_rewind.h" +#include "storage/page_compression_impl.h" #include "replication/replicainternal.h" #define BLOCKSIZE (8 * 1024) @@ -36,6 +38,8 @@ static int dstfd = -1; static char dstpath[MAXPGPATH] = ""; static bool g_isRelDataFile = false; +static CompressedPcaInfo g_compressedPcaInfo; + static void create_target_dir(const char* path); static void remove_target_dir(const char* path); static void create_target_symlink(const char* path, const char* slink); @@ -101,7 +105,7 @@ void close_target_file(void) dstfd = -1; } -void 
write_target_range(char* buf, off_t begin, size_t size, int space) +void write_target_range(char* buf, off_t begin, size_t size, int space, bool compressed) { int writeleft; char* p = NULL; @@ -112,7 +116,7 @@ void write_target_range(char* buf, off_t begin, size_t size, int space) if (dry_run) return; - if (begin % BLOCKSIZE != 0) { + if (!compressed && begin % BLOCKSIZE != 0) { (void)close(dstfd); dstfd = -1; pg_fatal("seek position %ld in target file \"%s\" is not in BLOCKSIZEs\n", size, dstpath); @@ -1221,3 +1225,142 @@ bool tablespaceDataIsValid(const char* path) return true; } + +void CompressedFileTruncate(const char *path, const RewindCompressInfo *rewindCompressInfo) +{ + if (dry_run) { + return; + } + + uint16 chunkSize = rewindCompressInfo->chunkSize; + + BlockNumber oldBlockNumber = rewindCompressInfo->oldBlockNumber; + BlockNumber newBlockNumber = rewindCompressInfo->newBlockNumber; + + Assert(oldBlockNumber > newBlockNumber); + char pcaPath[MAXPGPATH]; + FormatPathToPca(path, pcaPath, MAXPGPATH, true); + + int pcaFd = open(pcaPath, O_RDWR | PG_BINARY, 0600); + if (pcaFd < 0) { + pg_fatal("CompressedFileTruncate: could not open file \"%s\": %s\n", pcaPath, strerror(errno)); + return; + } + + PageCompressHeader* map = pc_mmap(pcaFd, chunkSize, false); + if (map == MAP_FAILED) { + pg_fatal("CompressedFileTruncate: Failed to mmap file \"%s\": %s\n", pcaPath, strerror(errno)); + return; + } + /* zero out the address entries of the truncated blocks */ + for (BlockNumber blockNumber = newBlockNumber; blockNumber < oldBlockNumber; ++blockNumber) { + PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(map, chunkSize, blockNumber); + for (size_t i = 0; i < addr->allocated_chunks; ++i) { + addr->chunknos[i] = 0; + } + addr->nchunks = 0; + addr->allocated_chunks = 0; + addr->checksum = 0; + } + map->last_synced_nblocks = map->nblocks = newBlockNumber; + + /* find the max used chunk number */ + pc_chunk_number_t beforeUsedChunks = map->allocated_chunks; + pc_chunk_number_t max_used_chunkno = 0; + for (BlockNumber blockNumber = 0; blockNumber < newBlockNumber; ++blockNumber) { + PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(map, chunkSize, blockNumber); + for (uint8 i = 0; i < addr->allocated_chunks; i++) { + if (addr->chunknos[i] > max_used_chunkno) { + max_used_chunkno = addr->chunknos[i]; + } + } + } + map->allocated_chunks = map->last_synced_allocated_chunks = max_used_chunkno; + + /* truncate the pcd file */ + if (beforeUsedChunks > max_used_chunkno) { + char pcdPath[MAXPGPATH]; + FormatPathToPcd(path, pcdPath, MAXPGPATH, false); + truncate_target_file(pcdPath, max_used_chunkno * chunkSize); + } + pc_munmap(map); + pg_log(PG_PROGRESS, "CompressedFileTruncate: %s\n", path); +} + +void OpenCompressedPcaFile(const char* fileName, int32 chunkSize, int32 algorithm, bool rebuild) +{ + if (dry_run) { + return; + } + if (g_compressedPcaInfo.pcaFd != -1 && strcmp(fileName, &g_compressedPcaInfo.path[strlen(pg_data) + 1]) == 0) { + /* already open */ + return; + } + CloseCompressedPcaFile(); + int rc = snprintf_s(g_compressedPcaInfo.path, sizeof(g_compressedPcaInfo.path), + sizeof(g_compressedPcaInfo.path) - 1, + "%s/%s", pg_data, fileName); + securec_check_ss_c(rc, "\0", "\0"); + + int mode = O_RDWR | PG_BINARY; + mode = rebuild ? 
(mode | O_TRUNC | O_CREAT) : mode; + + g_compressedPcaInfo.pcaFd = open(g_compressedPcaInfo.path, mode, S_IRUSR | S_IWUSR); + if (g_compressedPcaInfo.pcaFd < 0) { + pg_fatal("could not open compressed pca file \"%s\": %s\n", g_compressedPcaInfo.path, strerror(errno)); + return; + } + g_compressedPcaInfo.algorithm = algorithm; + g_compressedPcaInfo.chunkSize = chunkSize; + g_compressedPcaInfo.pcaMap = (char*) pc_mmap(g_compressedPcaInfo.pcaFd, chunkSize, false); + if ((void*)g_compressedPcaInfo.pcaMap == MAP_FAILED) { + pg_fatal("OpenCompressedPcaFile: Failed to mmap file \"%s\": %s\n", g_compressedPcaInfo.path, strerror(errno)); + return; + } +} + +void CloseCompressedPcaFile() +{ + if (g_compressedPcaInfo.pcaFd == -1) { + return; + } + pc_munmap((PageCompressHeader*)g_compressedPcaInfo.pcaMap); + if (close(g_compressedPcaInfo.pcaFd) != 0) { + pg_fatal("could not close target file \"%s\": %s\n", g_compressedPcaInfo.path, gs_strerror(errno)); + } + g_compressedPcaInfo.pcaFd = -1; + g_compressedPcaInfo.pcaMap = NULL; + g_compressedPcaInfo.chunkSize = 0; + g_compressedPcaInfo.algorithm = 0; +} + +void FetchCompressedFile(char* buf, BlockNumber blockNumber, int32 size) +{ + int32 chunkSize = g_compressedPcaInfo.chunkSize; + int needChunks = size / chunkSize; + + PageCompressHeader* pcMap = (PageCompressHeader*) g_compressedPcaInfo.pcaMap; + PageCompressAddr* pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunkSize, blockNumber); + + // 2. allocate chunks + if (pcAddr->allocated_chunks < needChunks) { + auto chunkno = pg_atomic_fetch_add_u32(&pcMap->allocated_chunks, needChunks - pcAddr->allocated_chunks); + for (int i = pcAddr->allocated_chunks; i < needChunks; i++) { + pcAddr->chunknos[i] = ++chunkno; + } + pcAddr->allocated_chunks = needChunks; + } + for (int32 i = 0; i < needChunks; ++i) { + auto buffer_pos = buf + chunkSize * i; + off_t seekpos = (off_t) OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, pcAddr->chunknos[i]); + int32 start = i; + while (i < needChunks - 1 && pcAddr->chunknos[i + 1] == pcAddr->chunknos[i] + 1) { + i++; + } + int write_amount = chunkSize * (i - start + 1); + // open file dstfd + write_target_range(buffer_pos, seekpos, write_amount, 0, true); + } + pcAddr->nchunks = pcAddr->allocated_chunks; + pcAddr->checksum = AddrChecksum32(blockNumber, pcAddr); +} diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h index e1a54c767..f28bb79af 100644 --- a/src/bin/pg_rewind/file_ops.h +++ b/src/bin/pg_rewind/file_ops.h @@ -11,10 +11,11 @@ #define FILE_OPS_H #include "filemap.h" +#include "compressed_common.h" extern char* pg_data; extern void open_target_file(const char* path, bool trunc); -extern void write_target_range(char* buf, off_t begin, size_t size, int space); +extern void write_target_range(char* buf, off_t begin, size_t size, int space, bool compressed = false); extern void close_target_file(void); extern void truncate_target_file(const char* path, off_t newsize); extern void create_target(file_entry_t* t); @@ -40,6 +41,9 @@ extern bool restore_target_dir(const char* datadir_target, bool remove_from); extern void delete_target_file(const char* file); extern bool isPathInFilemap(const char* path); extern bool tablespaceDataIsValid(const char* path); - +extern void CompressedFileTruncate(const char* path, const RewindCompressInfo* rewindCompressInfo); +void FetchCompressedFile(char* buf, BlockNumber begin, int32 size); +void OpenCompressedPcaFile(const char* fileName, int32 chunkSize, int32 algorithm, bool rebuild); +void CloseCompressedPcaFile(); #endif /* 
FILE_OPS_H */ diff --git a/src/bin/pg_rewind/filemap.cpp b/src/bin/pg_rewind/filemap.cpp index 4223f25c0..8c9fcfff1 100755 --- a/src/bin/pg_rewind/filemap.cpp +++ b/src/bin/pg_rewind/filemap.cpp @@ -19,6 +19,7 @@ #include "catalog/catalog.h" #include "catalog/pg_tablespace.h" #include "common/fe_memutils.h" +#include "compressed_rewind.h" #include "storage/cu.h" #include "storage/smgr/fd.h" @@ -127,7 +128,8 @@ void filemapInit(void) filemaptarget = filemap_create(); } -void processTargetFileMap(const char* path, file_type_t type, size_t oldsize, const char* link_target) +void processTargetFileMap(const char* path, file_type_t type, size_t oldsize, const char* link_target, + const RewindCompressInfo* info) { file_entry_t* entry = NULL; filemap_t* map = filemaptarget; @@ -143,6 +145,8 @@ void processTargetFileMap(const char* path, file_type_t type, size_t oldsize, co entry->pagemap.bitmap = NULL; entry->pagemap.bitmapsize = 0; + COPY_REWIND_COMPRESS_INFO(entry, info, info == NULL ? 0 : info->oldBlockNumber, 0) + if (map->last != NULL) { map->last->next = entry; map->last = entry; @@ -211,7 +215,7 @@ BuildErrorCode targetFilemapProcess(void) filemap_t* map = filemaptarget; for (i = 0; i < map->narray; i++) { entry = map->array[i]; - process_target_file(entry->path, entry->type, entry->oldsize, entry->link_target); + process_target_file(entry->path, entry->type, entry->oldsize, entry->link_target, &entry->rewindCompressInfo); } return BUILD_SUCCESS; } @@ -322,7 +326,8 @@ static bool process_source_file_sanity_check(const char* path, file_type_t type) * action needs to be taken for the file, depending on whether the file * exists in the target and whether the size matches. */ -void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target) +void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target, + RewindCompressInfo* info) { bool exists = false; char localpath[MAXPGPATH]; @@ -330,6 +335,7 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con filemap_t* map = filemap; file_action_t action = FILE_ACTION_NONE; size_t oldsize = 0; + BlockNumber oldBlockNumber = 0; file_entry_t* entry = NULL; int ss_c = 0; bool isreldatafile = false; @@ -480,7 +486,21 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con * replayed. */ /* mod blocksize 8k to avoid half page write */ - oldsize = statbuf.oldsize; + RewindCompressInfo oldRewindCompressInfo; + bool sourceCompressed = info != NULL; + bool targetCompressed = ProcessLocalPca(path, &oldRewindCompressInfo); + if (sourceCompressed && !targetCompressed) { + info->compressed = false; + action = FILE_ACTION_REMOVE; + break; + } else if (!sourceCompressed && targetCompressed) { + info = &oldRewindCompressInfo; + action = FILE_ACTION_REMOVE; + break; + } else if (sourceCompressed && targetCompressed) { + oldBlockNumber = oldRewindCompressInfo.oldBlockNumber; + oldsize = oldBlockNumber * BLCKSZ; + } if (oldsize % BLOCKSIZE != 0) { oldsize = oldsize - (oldsize % BLOCKSIZE); pg_log(PG_PROGRESS, "target file size mod BLOCKSIZE not equal 0 %s %ld \n", path, statbuf.oldsize); @@ -511,6 +531,8 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con entry->pagemap.bitmapsize = 0; entry->isrelfile = isreldatafile; + COPY_REWIND_COMPRESS_INFO(entry, info, oldBlockNumber, info == NULL ? 
0 : info->newBlockNumber) + if (map->last != NULL) { map->last->next = entry; map->last = entry; @@ -526,7 +548,8 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con * marks target data directory's files that didn't exist in the source for * deletion. */ -void process_target_file(const char* path, file_type_t type, size_t oldsize, const char* link_target) +void process_target_file(const char* path, file_type_t type, size_t oldsize, const char* link_target, + const RewindCompressInfo* info) { bool exists = false; file_entry_t key; @@ -555,7 +578,7 @@ void process_target_file(const char* path, file_type_t type, size_t oldsize, con */ for (int excludeIdx = 0; excludeFiles[excludeIdx] != NULL; excludeIdx++) { if (strstr(path, excludeFiles[excludeIdx]) != NULL) { - pg_log(PG_DEBUG, "entry \"%s\" excluded from target file list", path); + pg_log(PG_DEBUG, "entry \"%s\" excluded from target file list\n", path); return; } } @@ -607,6 +630,8 @@ void process_target_file(const char* path, file_type_t type, size_t oldsize, con entry->pagemap.bitmapsize = 0; entry->isrelfile = isRelDataFile(path); + COPY_REWIND_COMPRESS_INFO(entry, info, info == NULL ? 0 : info->oldBlockNumber, 0) + if (map->last == NULL) map->first = entry; else @@ -769,7 +794,8 @@ void process_waldata_change( entry->pagemap.bitmap = NULL; entry->pagemap.bitmapsize = 0; entry->isrelfile = isRelDataFile(path); - + RewindCompressInfo *rewindCompressInfo = NULL; + COPY_REWIND_COMPRESS_INFO(entry, rewindCompressInfo, 0, 0) if (map->last != NULL) { map->last->next = entry; map->last = entry; diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h index e5c566f1d..f4e0c9ac5 100644 --- a/src/bin/pg_rewind/filemap.h +++ b/src/bin/pg_rewind/filemap.h @@ -8,6 +8,7 @@ #ifndef FILEMAP_H #define FILEMAP_H +#include "compressed_common.h" #include "storage/smgr/relfilenode.h" #include "storage/buf/block.h" @@ -42,6 +43,9 @@ typedef struct file_entry_t { file_action_t action; + /* for compressed table */ + RewindCompressInfo rewindCompressInfo; + /* for a regular file */ size_t oldsize; size_t newsize; @@ -96,8 +100,10 @@ extern void print_filemap(void); extern void print_filemap_to_file(FILE* file); /* Functions for populating the filemap */ -extern void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target); -extern void process_target_file(const char* path, file_type_t type, size_t newsize, const char* link_target); +extern void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target, + RewindCompressInfo* rewindCompressInfo = nullptr); +extern void process_target_file(const char* path, file_type_t type, size_t newsize, const char* link_target, + const RewindCompressInfo* rewindCompressInfo = nullptr); extern void process_block_change(ForkNumber forknum, RelFileNode rnode, BlockNumber blkno); extern void process_waldata_change( ForkNumber forknum, RelFileNode rnode, StorageEngine store, off_t file_offset, size_t data_size); diff --git a/src/bin/pg_rewind/parsexlog.cpp b/src/bin/pg_rewind/parsexlog.cpp index f918300f3..cf6757959 100644 --- a/src/bin/pg_rewind/parsexlog.cpp +++ b/src/bin/pg_rewind/parsexlog.cpp @@ -161,7 +161,7 @@ BuildErrorCode findCommonCheckpoint(const char* datadir, TimeLineID tli, XLogRec pg_fatal("find max lsn fail, errmsg:%s\n", returnmsg); return BUILD_FATAL; } - pg_log(PG_PROGRESS, "find max lsn success, %s\n", returnmsg); + pg_log(PG_PROGRESS, "find max lsn success, %s", returnmsg); 
readprivate.datadir = datadir; readprivate.tli = tli; diff --git a/src/common/backend/catalog/builtin_funcs.ini b/src/common/backend/catalog/builtin_funcs.ini index 1e224889c..1d973063c 100755 --- a/src/common/backend/catalog/builtin_funcs.ini +++ b/src/common/backend/catalog/builtin_funcs.ini @@ -3416,8 +3416,9 @@ AddBuiltinFunc(_0(3470), _1("gs_password_notifytime"), _2(0), _3(true), _4(false), _5(gs_password_notifytime), _6(23), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('i'), _19(0), _20(0), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_password_notifytime"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false)) ), AddFuncGroup( - "gs_read_block_from_remote", 1, - AddBuiltinFunc(_0(4767), _1("gs_read_block_from_remote"), _2(9), _3(true), _4(false), _5(gs_read_block_from_remote), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(9, 23, 23, 23, 21, 23, 28, 23, 28, 16), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_read_block_from_remote"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false)) + "gs_read_block_from_remote", 2, + AddBuiltinFunc(_0(4767), _1("gs_read_block_from_remote"), _2(9), _3(true), _4(false), _5(gs_read_block_from_remote), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(9, 23, 23, 23, 21, 23, 28, 23, 28, 16), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_read_block_from_remote"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false)), + AddBuiltinFunc(_0(4768), _1("gs_read_block_from_remote"), _2(10), _3(true), _4(false), _5(gs_read_block_from_remote_compress), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(10, 23, 23, 23, 21, 21, 23, 28, 23, 28, 16), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_read_block_from_remote_compress"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false)) ), AddFuncGroup( "gs_respool_exception_info", 1, @@ -7685,6 +7686,10 @@ AddBuiltinFunc(_0(3827), _1("pg_read_binary_file"), _2(4), _3(true), _4(false), _5(pg_read_binary_file), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(4, 25, 20, 20, 16), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_read_binary_file"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false)), AddBuiltinFunc(_0(3828), _1("pg_read_binary_file"), _2(1), _3(true), _4(false), _5(pg_read_binary_file_all), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(1, 25), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_read_binary_file_all"), 
_26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false)) ), + AddFuncGroup( + "pg_read_binary_file_blocks", 1, + AddBuiltinFunc(_0(8413), _1("pg_read_binary_file_blocks"), _2(3), _3(true), _4(true), _5(pg_read_binary_file_blocks), _6(2249), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(100), _11(20), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(3, 25, 20, 20), _21(7, 25, 20, 20, 25, 23, 23, 17), _22(7, 'i', 'i', 'i', 'o', 'o', 'o', 'o'), _23(7, "input", "blocknum", "blockcount", "path", "blocknum", "len", "data"), _24(NULL), _25("pg_read_binary_file_blocks"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f')) + ), AddFuncGroup( "pg_read_file", 2, AddBuiltinFunc(_0(2624), _1("pg_read_file"), _2(3), _3(true), _4(false), _5(pg_read_file), _6(25), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(3, 25, 20, 20), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_read_file"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false)), diff --git a/src/common/backend/catalog/heap.cpp b/src/common/backend/catalog/heap.cpp index c9faf0d0c..066ceaed6 100644 --- a/src/common/backend/catalog/heap.cpp +++ b/src/common/backend/catalog/heap.cpp @@ -83,6 +83,7 @@ #include "pgxc/groupmgr.h" #include "storage/buf/buf.h" #include "storage/predicate.h" +#include "storage/page_compression.h" #include "storage/buf/bufmgr.h" #include "storage/lmgr.h" #include "storage/smgr/smgr.h" @@ -453,8 +454,9 @@ static void InitPartitionDef(Partition newPartition, Oid partOid, char strategy) */ Relation heap_create(const char* relname, Oid relnamespace, Oid reltablespace, Oid relid, Oid relfilenode, Oid bucketOid, TupleDesc tupDesc, char relkind, char relpersistence, bool partitioned_relation, bool rowMovement, - bool shared_relation, bool mapped_relation, bool allow_system_table_mods, int8 row_compress, Oid ownerid, - bool skip_create_storage, TableAmType tam_type, int8 relindexsplit, StorageType storage_type, bool newcbi) + bool shared_relation, bool mapped_relation, bool allow_system_table_mods, int8 row_compress, Datum reloptions, + Oid ownerid, bool skip_create_storage, TableAmType tam_type, int8 relindexsplit, StorageType storage_type, + bool newcbi, Oid accessMethodObjectId) { bool create_storage = false; Relation rel; @@ -564,9 +566,11 @@ Relation heap_create(const char* relname, Oid relnamespace, Oid reltablespace, O relpersistence, relkind, row_compress, + reloptions, tam_type, relindexsplit, - storage_type + storage_type, + accessMethodObjectId ); if (partitioned_relation) { @@ -2640,6 +2644,7 @@ Oid heap_create_with_catalog(const char *relname, Oid relnamespace, Oid reltable mapped_relation, allow_system_table_mods, row_compress, + reloptions, ownerid, false, tam, @@ -5167,7 +5172,7 @@ void dropDeltaTableOnPartition(Oid partId) * */ Partition heapCreatePartition(const char* part_name, bool for_partitioned_table, Oid part_tablespace, Oid part_id, - Oid partFileNode, Oid bucketOid, Oid ownerid, StorageType storage_type, bool newcbi) + Oid partFileNode, Oid bucketOid, Oid ownerid, StorageType storage_type, bool newcbi, Datum reloptions) { Partition new_part_desc = NULL; bool createStorage = false; @@ -5220,7 +5225,8 @@ Partition 
heapCreatePartition(const char* part_name, bool for_partitioned_table, part_id, /* partition oid */ partFileNode, /* partition's file node, same as partition oid*/ part_tablespace, - for_partitioned_table ? HEAP_DISK : storage_type); + for_partitioned_table ? HEAP_DISK : storage_type, + reloptions); /* * Save newcbi as a context indicator to @@ -5619,7 +5625,9 @@ Oid heapAddRangePartition(Relation pgPartRel, Oid partTableOid, Oid partTablespa newPartrelfileOid, bucketOid, ownerid, - storage_type); + storage_type, + false, + reloptions); Assert(newPartitionOid == PartitionGetPartid(newPartition)); InitPartitionDef(newPartition, partTableOid, PART_STRATEGY_RANGE); @@ -5812,7 +5820,9 @@ Oid HeapAddIntervalPartition(Relation pgPartRel, Relation rel, Oid partTableOid, partrelfileOid, bucketOid, ownerid, - storage_type); + storage_type, + false, + reloptions); pfree(partName); Assert(newPartitionOid == PartitionGetPartid(newPartition)); @@ -5904,7 +5914,10 @@ Oid HeapAddListPartition(Relation pgPartRel, Oid partTableOid, Oid partTablespac partrelfileOid, bucketOid, ownerid, - storage_type); + storage_type, + false, + reloptions); + Assert(newListPartitionOid == PartitionGetPartid(newListPartition)); InitPartitionDef(newListPartition, partTableOid, PART_STRATEGY_LIST); @@ -6167,7 +6180,9 @@ Oid HeapAddHashPartition(Relation pgPartRel, Oid partTableOid, Oid partTablespac partrelfileOid, bucketOid, ownerid, - storage_type); + storage_type, + false, + reloptions); Assert(newHashPartitionOid == PartitionGetPartid(newHashPartition)); InitPartitionDef(newHashPartition, partTableOid, PART_STRATEGY_HASH); @@ -6328,7 +6343,9 @@ static void addNewPartitionTupleForTable(Relation pg_partition_rel, const char* new_partition_rfoid, InvalidOid, ownerid, - HEAP_DISK); + HEAP_DISK, + false, + reloptions); Assert(new_partition_oid == PartitionGetPartid(new_partition)); new_partition->pd_part->parttype = PART_OBJ_TYPE_PARTED_TABLE; diff --git a/src/common/backend/catalog/index.cpp b/src/common/backend/catalog/index.cpp index 5d223e13b..5c5a4cf8a 100644 --- a/src/common/backend/catalog/index.cpp +++ b/src/common/backend/catalog/index.cpp @@ -913,9 +913,9 @@ Oid index_create(Relation heapRelation, const char *indexRelationName, Oid index indexRelation = heap_create(indexRelationName, namespaceId, tableSpaceId, indexRelationId, relFileNode, RELATION_CREATE_BUCKET(heapRelation) ? heapRelation->rd_bucketoid : InvalidOid, indexTupDesc, relKind, relpersistence, isLocalPart, false, shared_relation, mapped_relation, allow_system_table_mods, - REL_CMPRS_NOT_SUPPORT, heapRelation->rd_rel->relowner, skip_create_storage, + REL_CMPRS_NOT_SUPPORT, (Datum)reloptions, heapRelation->rd_rel->relowner, skip_create_storage, isUstore ? 
TAM_USTORE : TAM_HEAP, /* XXX: Index tables are by default HEAP Table Type */ - relindexsplit, storage_type, extra->crossBucket); + relindexsplit, storage_type, extra->crossBucket, accessMethodObjectId); Assert(indexRelationId == RelationGetRelid(indexRelation)); @@ -933,7 +933,6 @@ Oid index_create(Relation heapRelation, const char *indexRelationName, Oid index * XXX should have a cleaner way to create cataloged indexes */ indexRelation->rd_rel->relowner = heapRelation->rd_rel->relowner; - indexRelation->rd_rel->relam = accessMethodObjectId; indexRelation->rd_rel->relhasoids = false; if (accessMethodObjectId == PSORT_AM_OID) { @@ -1245,7 +1244,8 @@ Oid partition_index_create(const char* partIndexName, /* the name of partition i parentIndex->rd_bucketoid, parentIndex->rd_rel->relowner, RelationGetStorageType(parentIndex), - extra->crossbucket); + extra->crossbucket, + indexRelOptions); partitionIndex->pd_part->parttype = PART_OBJ_TYPE_INDEX_PARTITION; partitionIndex->pd_part->rangenum = 0; partitionIndex->pd_part->parentid = parentIndexId; @@ -1283,9 +1283,13 @@ Oid partition_index_create(const char* partIndexName, /* the name of partition i partitionIndex->pd_part->relfrozenxid = (ShortTransactionId)InvalidTransactionId; /* insert into pg_partition */ +#ifndef ENABLE_MULTIPLE_NODES + insertPartitionEntry(pg_partition_rel, partitionIndex, partitionIndex->pd_id, NULL, NULL, 0, 0, 0, indexRelOptions, + PART_OBJ_TYPE_INDEX_PARTITION); +#else insertPartitionEntry( pg_partition_rel, partitionIndex, partitionIndex->pd_id, NULL, NULL, 0, 0, 0, 0, PART_OBJ_TYPE_INDEX_PARTITION); - +#endif /* Make the above change visible */ CommandCounterIncrement(); diff --git a/src/common/backend/catalog/storage.cpp b/src/common/backend/catalog/storage.cpp index 7c2513830..3d8ee8893 100644 --- a/src/common/backend/catalog/storage.cpp +++ b/src/common/backend/catalog/storage.cpp @@ -316,17 +316,30 @@ void log_smgrcreate(RelFileNode* rnode, ForkNumber forkNum) if (IsSegmentFileNode(*rnode)) { return; } - + + xl_smgr_create_compress xlrec; + uint size; + uint8 info = XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE; + /* + * compressOptions Copy + */ + if (rnode->opt != 0) { + xlrec.pageCompressOpts = rnode->opt; + size = sizeof(xl_smgr_create_compress); + info |= XLR_REL_COMPRESS; + } else { + size = sizeof(xl_smgr_create); + } + /* * Make an XLOG entry reporting the file creation. */ - xl_smgr_create xlrec; - xlrec.forkNum = forkNum; - RelFileNodeRelCopy(xlrec.rnode, *rnode); + xlrec.xlrec.forkNum = forkNum; + RelFileNodeRelCopy(xlrec.xlrec.rnode, *rnode); XLogBeginInsert(); - XLogRegisterData((char*)&xlrec, sizeof(xlrec)); - XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE, false, rnode->bucketNode); + XLogRegisterData((char*)&xlrec, size); + XLogInsert(RM_SMGR_ID, info, false, rnode->bucketNode); } static void CStoreRelDropStorage(Relation rel, RelFileNode* rnode, Oid ownerid) @@ -688,15 +701,26 @@ void RelationTruncate(Relation rel, BlockNumber nblocks) * Make an XLOG entry reporting the file truncation. 
*/ XLogRecPtr lsn; - xl_smgr_truncate xlrec; + xl_smgr_truncate_compress xlrec; + uint size; + uint8 info = XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE; - xlrec.blkno = nblocks; - RelFileNodeRelCopy(xlrec.rnode, rel->rd_node); + xlrec.xlrec.blkno = nblocks; + + if (rel->rd_node.opt != 0) { + xlrec.pageCompressOpts = rel->rd_node.opt; + size = sizeof(xl_smgr_truncate_compress); + info |= XLR_REL_COMPRESS; + } else { + size = sizeof(xl_smgr_truncate); + } + + RelFileNodeRelCopy(xlrec.xlrec.rnode, rel->rd_node); XLogBeginInsert(); - XLogRegisterData((char*)&xlrec, sizeof(xlrec)); + XLogRegisterData((char*)&xlrec, size); - lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE, false, rel->rd_node.bucketNode); + lsn = XLogInsert(RM_SMGR_ID, info, false, rel->rd_node.bucketNode); /* * Flush, because otherwise the truncation of the main relation might @@ -1207,7 +1231,7 @@ void smgr_redo(XLogReaderState* record) { XLogRecPtr lsn = record->EndRecPtr; uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - + bool compress = XLogRecGetInfo(record) & XLR_REL_COMPRESS; /* Backup blocks are not used in smgr records */ Assert(!XLogRecHasAnyBlockRefs(record)); @@ -1216,14 +1240,14 @@ void smgr_redo(XLogReaderState* record) RelFileNode rnode; RelFileNodeCopy(rnode, xlrec->rnode, XLogRecGetBucketId(record)); - smgr_redo_create(rnode, xlrec->forkNum, (char *)xlrec); - /* Redo column file, attid is hidden in forkNum */ - + rnode.opt = compress ? ((xl_smgr_create_compress*)XLogRecGetData(record))->pageCompressOpts : 0; + smgr_redo_create(rnode, xlrec->forkNum, (char *)xlrec); + /* Redo column file, attid is hidden in forkNum */ } else if (info == XLOG_SMGR_TRUNCATE) { xl_smgr_truncate* xlrec = (xl_smgr_truncate*)XLogRecGetData(record); RelFileNode rnode; RelFileNodeCopy(rnode, xlrec->rnode, XLogRecGetBucketId(record)); - + rnode.opt = compress ? ((xl_smgr_truncate_compress*)XLogRecGetData(record))->pageCompressOpts : 0; /* * Forcibly create relation if it doesn't exist (which suggests that * it was dropped somewhere later in the WAL sequence). As in diff --git a/src/common/backend/utils/adt/dbsize.cpp b/src/common/backend/utils/adt/dbsize.cpp index 3cdf93eb3..93e5b9f30 100644 --- a/src/common/backend/utils/adt/dbsize.cpp +++ b/src/common/backend/utils/adt/dbsize.cpp @@ -69,6 +69,7 @@ #include "storage/custorage.h" #include "storage/smgr/segment.h" #include "storage/cstore/cstore_compress.h" +#include "storage/page_compression.h" #include "vecexecutor/vecnodes.h" #ifdef PGXC @@ -791,6 +792,7 @@ int64 calculate_relation_size(RelFileNode* rfn, BackendId backend, ForkNumber fo relationpath = relpathbackend(*rfn, backend, forknum); + bool rowCompress = IS_COMPRESSED_RNODE((*rfn), forknum); for (segcount = 0;; segcount++) { struct stat fst; @@ -807,7 +809,7 @@ int64 calculate_relation_size(RelFileNode* rfn, BackendId backend, ForkNumber fo else ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", pathname))); } - totalsize += fst.st_size; + totalsize += rowCompress ? 
CalculateMainForkSize((char*)pathname, rfn, forknum) : fst.st_size; } pfree_ext(relationpath); diff --git a/src/common/backend/utils/adt/genfile.cpp b/src/common/backend/utils/adt/genfile.cpp index 322138937..295850a60 100644 --- a/src/common/backend/utils/adt/genfile.cpp +++ b/src/common/backend/utils/adt/genfile.cpp @@ -316,6 +316,132 @@ Datum pg_read_binary_file_all(PG_FUNCTION_ARGS) PG_RETURN_BYTEA_P(read_binary_file(filename, 0, -1, false)); } +struct CompressAddressItemState { + uint32 blkno; + int segmentNo; + ReadBlockChunksStruct rbStruct; + FILE *pcaFile; +}; + +static void ReadBinaryFileBlocksFirstCall(PG_FUNCTION_ARGS, int32 startBlockNum, int32 blockCount) +{ + char* path = convert_and_check_filename(PG_GETARG_TEXT_PP(0)); + int segmentNo = 0; + UndoFileType undoFileType = UNDO_INVALID; + if (!is_row_data_file(path, &segmentNo, &undoFileType)) { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("%s is not a relation file.", path))); + } + /* create a function context for cross-call persistence */ + FuncCallContext* fctx = SRF_FIRSTCALL_INIT(); + + /* switch to memory context appropriate for multiple function calls */ + MemoryContext mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + /* initialize file scanning code */ + CompressAddressItemState* itemState = (CompressAddressItemState*)palloc(sizeof(CompressAddressItemState)); + + /* save mmap to inter_call_data->pcMap */ + char pcaFilePath[MAXPGPATH]; + errno_t rc = snprintf_s(pcaFilePath, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + FILE* pcaFile = AllocateFile((const char*)pcaFilePath, "rb"); + if (pcaFile == NULL) { + ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", pcaFilePath))); + } + PageCompressHeader* map = pc_mmap(fileno(pcaFile), ReadChunkSize(pcaFile, pcaFilePath, MAXPGPATH), true); + if (map == MAP_FAILED) { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to mmap %s: %m", pcaFilePath))); + } + if ((BlockNumber)startBlockNum + (BlockNumber)blockCount > map->nblocks) { + auto blockNum = map->nblocks; + ReleaseMap(map, pcaFilePath); + ereport(ERROR, + (ERRCODE_INVALID_PARAMETER_VALUE, + errmsg("invalid blocknum \"%d\" and block count \"%d\", the max blocknum is \"%u\"", + startBlockNum, + blockCount, + blockNum))); + } + /* construct ReadBlockChunksStruct */ + char* pcdFilePath = (char*)palloc0(MAXPGPATH); + rc = snprintf_s(pcdFilePath, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + FILE* fp = AllocateFile(pcdFilePath, "rb"); + if (fp == NULL) { + ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", pcdFilePath))); + } + char* pageBuffer = (char*)palloc(BLCKSZ); + itemState->pcaFile = pcaFile; + itemState->rbStruct.header = map; + itemState->rbStruct.pageBuffer = pageBuffer; + itemState->rbStruct.pageBufferLen = BLCKSZ; + itemState->rbStruct.fp = fp; + itemState->rbStruct.segmentNo = segmentNo; + itemState->rbStruct.fileName = pcdFilePath; + + /* + * build tupdesc for result tuples. This must match this function's + * pg_proc entry! 
+ */ + TupleDesc tupdesc = CreateTemplateTupleDesc(4, false, TAM_HEAP); + TupleDescInitEntry(tupdesc, (AttrNumber)1, "path", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)2, "blocknum", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)3, "len", INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)4, "data", BYTEAOID, -1, 0); + fctx->tuple_desc = BlessTupleDesc(tupdesc); + + itemState->blkno = startBlockNum; + fctx->max_calls = blockCount; + fctx->user_fctx = itemState; + + MemoryContextSwitchTo(mctx); +} + +Datum pg_read_binary_file_blocks(PG_FUNCTION_ARGS) +{ + int32 startBlockNum = PG_GETARG_INT32(1); + int32 blockCount = PG_GETARG_INT32(2); + + if (startBlockNum < 0 || blockCount <= 0 || startBlockNum + blockCount > RELSEG_SIZE) { + ereport(ERROR, (ERRCODE_INVALID_PARAMETER_VALUE, + errmsg("invalid blocknum \"%d\" or block count \"%d\"", startBlockNum, blockCount))); + } + + /* stuff done only on the first call of the function */ + if (SRF_IS_FIRSTCALL()) { + ReadBinaryFileBlocksFirstCall(fcinfo, startBlockNum, blockCount); + } + + /* stuff done on every call of the function */ + FuncCallContext *fctx = SRF_PERCALL_SETUP(); + CompressAddressItemState *itemState = (CompressAddressItemState *)fctx->user_fctx; + + if (fctx->call_cntr < fctx->max_calls) { + bytea *buf = (bytea *)palloc(BLCKSZ + VARHDRSZ); + size_t len = ReadAllChunkOfBlock(VARDATA(buf), BLCKSZ, itemState->blkno, itemState->rbStruct); + SET_VARSIZE(buf, len + VARHDRSZ); + Datum values[4]; + values[0] = PG_GETARG_DATUM(0); + values[1] = Int32GetDatum(itemState->blkno); + values[2] = Int32GetDatum(len); + values[3] = PointerGetDatum(buf); + + /* Build and return the result tuple. */ + bool nulls[4]; + securec_check(memset_s(nulls, sizeof(nulls), 0, sizeof(nulls)), "\0", "\0"); + HeapTuple tuple = heap_form_tuple(fctx->tuple_desc, (Datum*)values, (bool*)nulls); + Datum result = HeapTupleGetDatum(tuple); + itemState->blkno++; + SRF_RETURN_NEXT(fctx, result); + } else { + if (itemState->rbStruct.header != NULL) { + pc_munmap(itemState->rbStruct.header); + } + FreeFile(itemState->pcaFile); + FreeFile(itemState->rbStruct.fp); + SRF_RETURN_DONE(fctx); + } +} /* * stat a file diff --git a/src/common/backend/utils/adt/pg_lzcompress.cpp b/src/common/backend/utils/adt/pg_lzcompress.cpp index 6ff680251..256c31704 100644 --- a/src/common/backend/utils/adt/pg_lzcompress.cpp +++ b/src/common/backend/utils/adt/pg_lzcompress.cpp @@ -664,3 +664,281 @@ void pglz_decompress(const PGLZ_Header* source, char* dest) * That's it. */ } + +/* ---------- + * lz_compress - + * + * Compresses source into dest using strategy. Returns the number of + * bytes written in buffer dest, or -1 if compression fails. + * ---------- + */ +int32 lz_compress(const char* source, int32 slen, char* dest) +{ + unsigned char* bp = (unsigned char*) dest; + unsigned char* bstart = bp; + int hist_next = 0; + bool hist_recycle = false; + const char* dp = source; + const char* dend = source + slen; + unsigned char ctrl_dummy = 0; + unsigned char* ctrlp = &ctrl_dummy; + unsigned char ctrlb = 0; + unsigned char ctrl = 0; + bool found_match = false; + int32 match_len; + int32 match_off; + int32 good_match; + int32 good_drop; + int32 result_size; + int32 result_max; + int32 need_rate; + errno_t rc; + + const PGLZ_Strategy* strategy = PGLZ_strategy_always; + /* + * Our fallback strategy is the default. 
+ */ + if (strategy == NULL) { + strategy = PGLZ_strategy_default; + } + + /* + * If the strategy forbids compression (at all or if source chunk size out + * of range), fail. + */ + if (strategy->match_size_good <= 0 || slen < strategy->min_input_size || slen > strategy->max_input_size) { + return -1; + } + + /* + * Limit the match parameters to the supported range. + */ + good_match = strategy->match_size_good; + if (good_match > PGLZ_MAX_MATCH) { + good_match = PGLZ_MAX_MATCH; + } else if (good_match < 17) { + good_match = 17; + } + + good_drop = strategy->match_size_drop; + if (good_drop < 0) { + good_drop = 0; + } else if (good_drop > 100) { + good_drop = 100; + } + + need_rate = strategy->min_comp_rate; + if (need_rate < 0) { + need_rate = 0; + } else if (need_rate > 99) { + need_rate = 99; + } + + /* + * Compute the maximum result size allowed by the strategy, namely the + * input size minus the minimum wanted compression rate. This had better + * be <= slen, else we might overrun the provided output buffer. + */ + if (slen > (INT_MAX / 100)) { + /* Approximate to avoid overflow */ + result_max = (slen / 100) * (100 - need_rate); + } else { + result_max = (slen * (100 - need_rate)) / 100; + } + + /* + * Initialize the history lists to empty. We do not need to zero the + * hist_entries[] array; its entries are initialized as they are used. + */ + rc = memset_s(u_sess->utils_cxt.hist_start, HIST_START_LEN, 0, HIST_START_LEN); + securec_check(rc, "\0", "\0"); + + /* + * Compress the source directly into the output buffer. + */ + while (dp < dend) { + /* + * If we already exceeded the maximum result size, fail. + * + * We check once per loop; since the loop body could emit as many as 4 + * bytes (a control byte and 3-byte tag), PGLZ_MAX_OUTPUT() had better + * allow 4 slop bytes. + */ + if (bp - bstart >= result_max) { + return -1; + } + + /* + * If we've emitted more than first_success_by bytes without finding + * anything compressible at all, fail. This lets us fall out + * reasonably quickly when looking at incompressible input (such as + * pre-compressed data). + */ + if (!found_match && bp - bstart >= strategy->first_success_by) { + return -1; + } + + /* + * Try to find a match in the history + */ + if (pglz_find_match(u_sess->utils_cxt.hist_start, dp, dend, &match_len, &match_off, good_match, good_drop)) { + /* + * Create the tag and add history entries for all matched + * characters. + */ + pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off); + while (match_len--) { + pglz_hist_add( + u_sess->utils_cxt.hist_start, u_sess->utils_cxt.hist_entries, hist_next, hist_recycle, dp, + dend); + dp++; /* Do not do this ++ in the line above! */ + /* The macro would do it four times - Jan. */ + } + found_match = true; + } else { + /* + * No match found. Copy one literal byte. + */ + pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp); + pglz_hist_add( + u_sess->utils_cxt.hist_start, u_sess->utils_cxt.hist_entries, hist_next, hist_recycle, dp, dend); + dp++; /* Do not do this ++ in the line above! */ + /* The macro would do it four times - Jan. */ + } + } + + /* + * Write out the last control byte and check that we haven't overrun the + * output size allowed by the strategy. + */ + *ctrlp = ctrlb; + result_size = bp - bstart; + if (result_size >= result_max) { + return -1; + } + + /* success */ + return result_size; +} + +/* ---------- + * pglz_decompress - + * + * Decompresses source into dest. 
Returns the number of bytes + * decompressed in the destination buffer, and *optionally* + * checks that both the source and dest buffers have been + * fully read and written to, respectively. + * ---------- + */ +int32 lz_decompress(const char* source, int32 slen, char* dest, int32 rawsize, bool check_complete) +{ + const unsigned char* sp; + const unsigned char* srcend; + unsigned char* dp; + unsigned char* destend; + errno_t rc = 0; + + sp = (const unsigned char*) source; + srcend = ((const unsigned char*) source) + slen; + dp = (unsigned char*) dest; + destend = dp + rawsize; + + while (sp < srcend && dp < destend) { + /* + * Read one control byte and process the next 8 items (or as many as + * remain in the compressed input). + */ + unsigned char ctrl = *sp++; + int ctrlc; + + for (ctrlc = 0; ctrlc < 8 && sp < srcend && dp < destend; ctrlc++) { + if (ctrl & 1) { + /* + * Set control bit means we must read a match tag. The match + * is coded with two bytes. First byte uses lower nibble to + * code length - 3. Higher nibble contains upper 4 bits of the + * offset. The next following byte contains the lower 8 bits + * of the offset. If the length is coded as 18, another + * extension tag byte tells how much longer the match really + * was (0-255). + */ + int32 len; + int32 off; + + len = (sp[0] & 0x0f) + 3; + off = ((sp[0] & 0xf0) << 4) | sp[1]; + sp += 2; + if (len == 18) { + len += *sp++; + } + + /* + * Now we copy the bytes specified by the tag from OUTPUT to + * OUTPUT (copy len bytes from dp - off to dp). The copied + * areas could overlap, to preven possible uncertainty, we + * copy only non-overlapping regions. + */ + len = Min(len, destend - dp); + while (off < len) { + /*--------- + * When offset is smaller than length - source and + * destination regions overlap. memmove() is resolving + * this overlap in an incompatible way with pglz. Thus we + * resort to memcpy()-ing non-overlapping regions. + * + * Consider input: 112341234123412341234 + * At byte 5 here ^ we have match with length 16 and + * offset 4. 11234M(len=16, off=4) + * We are decoding first period of match and rewrite match + * 112341234M(len=12, off=8) + * + * The same match is now at position 9, it points to the + * same start byte of output, but from another position: + * the offset is doubled. + * + * We iterate through this offset growth until we can + * proceed to usual memcpy(). If we would try to decode + * the match at byte 5 (len=16, off=4) by memmove() we + * would issue memmove(5, 1, 16) which would produce + * 112341234XXXXXXXXXXXX, where series of X is 12 + * undefined bytes, that were at bytes [5:17]. + * --------- + */ + errno_t rc = memcpy_s(dp, off + 1, dp - off, off); + securec_check(rc, "", ""); + len -= off; + dp += off; + off += off; + } + rc = memcpy_s(dp, len + 1, dp - off, len); + securec_check(rc, "", ""); + dp += len; + } else { + /* + * An unset control bit means LITERAL BYTE. So we just copy + * one from INPUT to OUTPUT. + */ + *dp++ = *sp++; + } + + /* + * Advance the control bit + */ + ctrl >>= 1; + } + } + + /* + * Check we decompressed the right amount. If we are slicing, then we + * won't necessarily be at the end of the source or dest buffers when we + * hit a stop, so we don't test them. + */ + if (check_complete && (dp != destend || sp != srcend)) { + return -1; + } + + /* + * That's it. 
+ */ + return (char*) dp - dest; +} diff --git a/src/common/backend/utils/cache/partcache.cpp b/src/common/backend/utils/cache/partcache.cpp index 28a14e2af..c746d96cc 100644 --- a/src/common/backend/utils/cache/partcache.cpp +++ b/src/common/backend/utils/cache/partcache.cpp @@ -57,6 +57,7 @@ #include "rewrite/rewriteDefine.h" #include "rewrite/rewriteHandler.h" #include "storage/lmgr.h" +#include "storage/page_compression.h" #include "storage/smgr/smgr.h" #include "storage/smgr/segment.h" #include "catalog/storage.h" @@ -233,6 +234,47 @@ static Partition AllocatePartitionDesc(Form_pg_partition partp) return partition; } +void SetupPageCompressForPartition(RelFileNode* node, PageCompressOpts* compress_options, const char* relationName) +{ + uint1 algorithm = compress_options->compressType; + if (algorithm == COMPRESS_TYPE_NONE) { + node->opt = 0; + } else { + if (!SUPPORT_PAGE_COMPRESSION) { + ereport(ERROR, (errmsg("unsupported page compression on this platform"))); + } + + uint1 compressLevel; + bool symbol = false; + if (compress_options->compressLevel >= 0) { + symbol = true; + compressLevel = compress_options->compressLevel; + } else { + symbol = false; + compressLevel = -compress_options->compressLevel; + } + bool success = false; + uint1 chunkSize = ConvertChunkSize(compress_options->compressChunkSize, &success); + if (!success) { + ereport(ERROR, (errmsg("invalid compress_chunk_size %d , must be one of %d, %d, %d or %d for %s", + compress_options->compressChunkSize, BLCKSZ / 16, BLCKSZ / 8, BLCKSZ / 4, BLCKSZ / 2, + relationName))); + + } + + uint1 preallocChunks; + if (compress_options->compressPreallocChunks >= BLCKSZ / compress_options->compressChunkSize) { + preallocChunks = (uint1)(BLCKSZ / compress_options->compressChunkSize - 1); + } else { + preallocChunks = (uint1)(compress_options->compressPreallocChunks); + } + Assert(preallocChunks <= MAX_PREALLOC_CHUNKS); + node->opt = 0; + SET_COMPRESS_OPTION((*node), compress_options->compressByteConvert, compress_options->compressDiffConvert, + preallocChunks, symbol, compressLevel, algorithm, chunkSize); + } +} + StorageType PartitionGetStorageType(Oid parentOid) { HeapTuple pg_class_tuple; @@ -376,6 +418,12 @@ static void PartitionInitPhysicalAddr(Partition partition) partition->pd_id))); } } + + partition->pd_node.opt = 0; + if (partition->rd_options) { + SetupPageCompressForPartition(&partition->pd_node, &((StdRdOptions*)(partition->rd_options))->compress, + PartitionGetPartitionName(partition)); + } } /* @@ -464,7 +512,7 @@ void PartitionClose(Partition partition) } Partition PartitionBuildLocalPartition(const char *relname, Oid partid, Oid partfilenode, Oid parttablespace, - StorageType storage_type) + StorageType storage_type, Datum reloptions) { Partition part; MemoryContext oldcxt; @@ -513,6 +561,11 @@ Partition PartitionBuildLocalPartition(const char *relname, Oid partid, Oid part if (partfilenode != InvalidOid) { PartitionInitPhysicalAddr(part); + /* compressed option was set by PartitionInitPhysicalAddr if part->rd_options != NULL */ + if (part->rd_options == NULL && reloptions) { + StdRdOptions* options = (StdRdOptions*)default_reloptions(reloptions, false, RELOPT_KIND_HEAP); + SetupPageCompressForPartition(&part->pd_node, &options->compress, PartitionGetPartitionName(part)); + } } if (storage_type == SEGMENT_PAGE) { diff --git a/src/common/backend/utils/cache/relcache.cpp b/src/common/backend/utils/cache/relcache.cpp index 9a4e52231..6676246f4 100644 --- a/src/common/backend/utils/cache/relcache.cpp +++ 
b/src/common/backend/utils/cache/relcache.cpp @@ -167,6 +167,7 @@ #include "rewrite/rewriteDefine.h" #include "rewrite/rewriteRlsPolicy.h" #include "storage/lmgr.h" +#include "storage/page_compression.h" #include "storage/smgr/smgr.h" #include "storage/smgr/segment.h" #include "threadpool/threadpool.h" @@ -1232,7 +1233,7 @@ static OpClassCacheEnt* LookupOpclassInfo(Oid operatorClassOid, StrategyNumber n static void RelationCacheInitFileRemoveInDir(const char* tblspcpath); static void unlink_initfile(const char* initfilename); static void SetBackendId(Relation relation); - +static void SetupPageCompressForRelation(Relation relation, PageCompressOpts *compress_options); /* * ScanPgRelation * @@ -2420,6 +2421,12 @@ static void RelationInitPhysicalAddr(Relation relation) if (!RelationIsPartitioned(relation) && relation->storage_type == SEGMENT_PAGE) { relation->rd_node.bucketNode = SegmentBktId; } + + // setup page compression options + relation->rd_node.opt = 0; + if (relation->rd_options && REL_SUPPORT_COMPRESSED(relation)) { + SetupPageCompressForRelation(relation, &((StdRdOptions*)(relation->rd_options))->compress); + } } static void IndexRelationInitKeyNums(Relation relation) @@ -4247,8 +4254,9 @@ void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, SubTrans * and enter it into the relcache. */ Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, TupleDesc tupDesc, Oid relid, - Oid relfilenode, Oid reltablespace, bool shared_relation, bool mapped_relation, char relpersistence, char relkind, - int8 row_compress, TableAmType tam_type, int8 relindexsplit, StorageType storage_type) + Oid relfilenode, Oid reltablespace, bool shared_relation, bool mapped_relation, char relpersistence, + char relkind, int8 row_compress, Datum reloptions, TableAmType tam_type, int8 relindexsplit, + StorageType storage_type, Oid accessMethodObjectId) { Relation rel; MemoryContext oldcxt; @@ -4364,6 +4372,7 @@ Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, Tuple rel->rd_rel->relowner = BOOTSTRAP_SUPERUSERID; rel->rd_rel->parttype = PARTTYPE_NON_PARTITIONED_RELATION; rel->rd_rel->relrowmovement = false; + rel->rd_rel->relam = accessMethodObjectId; /* set up persistence and relcache fields dependent on it */ rel->rd_rel->relpersistence = relpersistence; @@ -4420,6 +4429,13 @@ Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, Tuple RelationInitPhysicalAddr(rel); + /* compressed option was set by RelationInitPhysicalAddr if rel->rd_options != NULL */ + if (rel->rd_options == NULL && reloptions && SUPPORT_COMPRESSED(relkind, rel->rd_rel->relam)) { + StdRdOptions *options = (StdRdOptions *) default_reloptions(reloptions, false, RELOPT_KIND_HEAP); + SetupPageCompressForRelation(rel, &options->compress); + } + + /* materialized view not initially scannable */ if (relkind == RELKIND_MATVIEW) rel->rd_isscannable = false; @@ -7758,3 +7774,41 @@ void GetTdeInfoFromRel(Relation rel, TdeInfo *tde_info) } } +/* setup page compress options for relation */ +static void SetupPageCompressForRelation(Relation relation, PageCompressOpts* compress_options) +{ + relation->rd_node.opt = 0; + uint1 algorithm = compress_options->compressType; + if (algorithm != COMPRESS_TYPE_NONE) { + if (!SUPPORT_PAGE_COMPRESSION) { + elog(ERROR, "unsupported page compression on this platform"); + } + + uint1 compressLevel; + bool symbol = false; + if (compress_options->compressLevel >= 0) { + symbol = true; + compressLevel = compress_options->compressLevel; + } 
else { + symbol = false; + compressLevel = -compress_options->compressLevel; + } + bool success = false; + uint1 chunkSize = ConvertChunkSize(compress_options->compressChunkSize, &success); + if (!success) { + elog(ERROR, "invalid compress_chunk_size %d , must be one of %d, %d, %d or %d for %s", + compress_options->compressChunkSize, BLCKSZ / 16, BLCKSZ / 8, BLCKSZ / 4, BLCKSZ / 2, + RelationGetRelationName(relation)); + } + uint1 preallocChunks; + if (compress_options->compressPreallocChunks >= BLCKSZ / compress_options->compressChunkSize) { + preallocChunks = (uint1)(BLCKSZ / compress_options->compressChunkSize - 1); + } else { + preallocChunks = (uint1)(compress_options->compressPreallocChunks); + } + Assert(preallocChunks <= MAX_PREALLOC_CHUNKS); + SET_COMPRESS_OPTION(relation->rd_node, compress_options->compressByteConvert, + compress_options->compressDiffConvert, preallocChunks, + symbol, compressLevel, algorithm, chunkSize); + } +} \ No newline at end of file diff --git a/src/common/backend/utils/init/globals.cpp b/src/common/backend/utils/init/globals.cpp index 11a4ca4b8..15ecb65bb 100644 --- a/src/common/backend/utils/init/globals.cpp +++ b/src/common/backend/utils/init/globals.cpp @@ -59,7 +59,7 @@ bool open_join_children = true; bool will_shutdown = false; /* hard-wired binary version number */ -const uint32 GRAND_VERSION_NUM = 92423; +const uint32 GRAND_VERSION_NUM = 92424; const uint32 HINT_ENHANCEMENT_VERSION_NUM = 92359; const uint32 MATVIEW_VERSION_NUM = 92213; diff --git a/src/common/backend/utils/misc/guc.cpp b/src/common/backend/utils/misc/guc.cpp index 16d9511fd..792041eb5 100755 --- a/src/common/backend/utils/misc/guc.cpp +++ b/src/common/backend/utils/misc/guc.cpp @@ -961,6 +961,7 @@ const char* const config_group_names[] = { /* INSTRUMENTS_OPTIONS */ gettext_noop("Instruments Options"), gettext_noop("Column Encryption"), + gettext_noop("Compress Options"), #ifdef PGXC /* DATA_NODES */ gettext_noop("Datanodes and Connection Pooling"), diff --git a/src/common/backend/utils/mmgr/memprot.cpp b/src/common/backend/utils/mmgr/memprot.cpp index 75522c1d3..4dbd2013d 100755 --- a/src/common/backend/utils/mmgr/memprot.cpp +++ b/src/common/backend/utils/mmgr/memprot.cpp @@ -114,6 +114,7 @@ bool gs_memory_enjection(void) } #endif + /* * check if the node is on heavy memory status now? * is strict is true, we'll do some pre-judgement. 
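Note for readers of the relcache.cpp and partcache.cpp hunks above: SetupPageCompressForRelation and SetupPageCompressForPartition funnel all table-level compression reloptions (algorithm, chunk-size class, prealloc chunks, signed compress level, byte/diff convert) into the per-relation RelFileNode opt field through SET_COMPRESS_OPTION, presumably defined in storage/page_compression.h, which these files now include. The standalone sketch below only illustrates the idea of that packing; the bit layout is an assumption chosen so the seven values fit what appears to be a 16-bit opt (the patch passes it around as uint2), and it is not the macro's actual encoding.

    /* Illustrative sketch only: an assumed layout, not the real SET_COMPRESS_OPTION encoding. */
    #include <cstdint>
    #include <cstdio>

    /* assumed layout: bits 0-1 chunk-size index into {BLCKSZ/16, BLCKSZ/8, BLCKSZ/4, BLCKSZ/2},
     * bits 2-4 algorithm (none/pglz/zstd), bits 5-7 prealloc chunks (0..MAX_PREALLOC_CHUNKS),
     * bit 8 sign of compress_level, bits 9-13 |compress_level|, bit 14 byte convert, bit 15 diff convert */
    static uint16_t PackCompressOption(unsigned chunkSizeIdx, unsigned algorithm, unsigned preallocChunks,
                                       bool levelNonNegative, unsigned level, bool byteConvert, bool diffConvert)
    {
        return (uint16_t)((chunkSizeIdx & 0x3u) | ((algorithm & 0x7u) << 2) | ((preallocChunks & 0x7u) << 5) |
                          ((levelNonNegative ? 1u : 0u) << 8) | ((level & 0x1fu) << 9) |
                          ((byteConvert ? 1u : 0u) << 14) | ((diffConvert ? 1u : 0u) << 15));
    }

    int main()
    {
        /* e.g. zstd, chunk size BLCKSZ/2 (index 3), 2 prealloced chunks, compress_level +3 */
        uint16_t opt = PackCompressOption(3, 2, 2, true, 3, false, false);
        printf("opt = 0x%04x, algorithm = %u, level = %u\n", (unsigned)opt, (opt >> 2) & 0x7u, (opt >> 9) & 0x1fu);
        return 0;
    }

Under such a scheme, accessors in the style of the GET_COMPRESS_* macros used by fd.cpp later in this patch would simply shift and mask the same field to recover each option.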
@@ -907,6 +908,36 @@ int MemoryProtectFunctions::gs_posix_memalign(void** memptr, Size alignment, Siz return ENOMEM; /* insufficient memory */ } +/** + * reserve memory for mmap of compressed table + * @tparam type MEM_SHRD is supported only + * @param sz reserved size(bytes) + * @param needProtect + * @return success or not + */ +template <MemType type> +bool MemoryProtectFunctions::gs_memprot_reserve(Size sz, bool needProtect) +{ + if (type != MEM_SHRD) { + return false; + } + return memTracker_ReserveMem(sz, needProtect); +} + +/** + * release the memory allocated by gs_memprot_reserve + * @tparam type MEM_SHRD is supported only + * @param sz free size(bytes) + */ +template <MemType type> +void MemoryProtectFunctions::gs_memprot_release(Size sz) +{ + if (type != MEM_SHRD) { + return; + } + memTracker_ReleaseMem(sz); +} + /* thread level initialization */ void gs_memprot_thread_init(void) { diff --git a/src/gausskernel/Makefile b/src/gausskernel/Makefile index 018f287de..46ecac475 100755 --- a/src/gausskernel/Makefile +++ b/src/gausskernel/Makefile @@ -628,6 +628,7 @@ endif cp $(LIBOBS_LIB_PATH)/libpcre* '$(DESTDIR)$(libdir)/../temp/' mv $(DESTDIR)$(libdir)/../temp/* '$(DESTDIR)$(libdir)/' cp $(SECUREDYNAMICLIB_HOME)/libsecurec* '$(DESTDIR)$(libdir)/' + cp $(ZSTD_LIB_PATH)/libzstd* '$(DESTDIR)$(libdir)/' cp $(LIBOBS_LIB_PATH)/liblog4* '$(DESTDIR)$(libdir)/' cp $(LIBOBS_LIB_PATH)/libeSDK* '$(DESTDIR)$(libdir)/' cp $(LIBOBS_LIB_PATH)/libxml2* '$(DESTDIR)$(libdir)/' diff --git a/src/gausskernel/bootstrap/bootparse.y b/src/gausskernel/bootstrap/bootparse.y index ff99c2bb2..c6a5013fa 100755 --- a/src/gausskernel/bootstrap/bootparse.y +++ b/src/gausskernel/bootstrap/bootparse.y @@ -233,6 +233,7 @@ Boot_CreateStmt: mapped_relation, true, REL_CMPRS_NOT_SUPPORT, + (Datum)0, BOOTSTRAP_SUPERUSERID, false, TAM_HEAP, diff --git a/src/gausskernel/cbb/grpc/remote_read_client.cpp b/src/gausskernel/cbb/grpc/remote_read_client.cpp index 461eaf728..75a79b12f 100755 --- a/src/gausskernel/cbb/grpc/remote_read_client.cpp +++ b/src/gausskernel/cbb/grpc/remote_read_client.cpp @@ -183,6 +183,8 @@ int RemoteGetCU(char* remoteAddress, uint32 spcnode, uint32 dbnode, uint32 relno * @IN spcnode: tablespace id * @IN dbnode: database id * @IN relnode: relfilenode + * @IN bucketnode: bucketnode + * @IN opt: compressed table options * @IN/OUT forknum: forknum * @IN/OUT blocknum: block number * @IN/OUT blocksize: block size @@ -190,7 +192,7 @@ int RemoteGetCU(char* remoteAddress, uint32 spcnode, uint32 dbnode, uint32 relno * @IN/OUT page_data: pointer of page data * @Return: remote read error code */ -extern int RemoteGetPage(char* remoteAddress, uint32 spcnode, uint32 dbnode, uint32 relnode, int4 bucketnode, +int RemoteGetPage(char* remoteAddress, uint32 spcnode, uint32 dbnode, uint32 relnode, int2 bucketnode, uint2 opt, int32 forknum, uint32 blocknum, uint32 blocksize, uint64 lsn, char* pageData) { PGconn* conGet = NULL; @@ -244,8 +246,9 @@ extern int RemoteGetPage(char* remoteAddress, uint32 spcnode, uint32 dbnode, uin } tnRet = snprintf_s(sqlCommands, MAX_PATH_LEN, MAX_PATH_LEN - 1, - "SELECT gs_read_block_from_remote(%u, %u, %u, %d, %d, '%lu', %u, '%lu', false);", - spcnode, dbnode, relnode, bucketnode, forknum, blocknum, blocksize, lsn); + "SELECT gs_read_block_from_remote(%u, %u, %u, %d, %d, %d, '%lu', %u, '%lu', false);", spcnode, + dbnode, relnode, bucketnode, opt, forknum, blocknum, blocksize, lsn); + securec_check_ss(tnRet, "", ""); res = PQexecParams(conGet, (const char*)sqlCommands, 0, NULL, NULL, NULL, NULL, 1); diff --git 
a/src/gausskernel/optimizer/commands/tablecmds.cpp b/src/gausskernel/optimizer/commands/tablecmds.cpp index 2ad03f063..efd5b8ab9 100644 --- a/src/gausskernel/optimizer/commands/tablecmds.cpp +++ b/src/gausskernel/optimizer/commands/tablecmds.cpp @@ -120,6 +120,7 @@ #include "storage/freespace.h" #include "storage/lmgr.h" #include "storage/lock/lock.h" +#include "storage/page_compression.h" #include "storage/predicate.h" #include "storage/remote_read.h" #include "storage/smgr/segment.h" @@ -1046,10 +1047,10 @@ static bool isOrientationSet(List* options, bool* isCUFormat, bool isDfsTbl) * @Param [IN] relkind: table's kind(ordinary table or other database object). * @return: option with defalut options. */ -static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 relcmprs, Oid relnamespace) +static List* AddDefaultOptionsIfNeed(List* options, const char relkind, CreateStmt* stmt, Oid relnamespace) { List* res = options; - + int8 relcmprs = stmt->row_compress; ListCell* cell = NULL; bool isCStore = false; bool isTsStore = false; @@ -1058,6 +1059,10 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel bool isUstore = false; bool assignedStorageType = false; + bool hasRowCompressType = false; + bool hasRowCompressChunk = false; + bool hasRowCompressPre = false; + bool hasRowCompressLevel = false; (void)isOrientationSet(options, NULL, false); foreach (cell, options) { DefElem* def = (DefElem*)lfirst(cell); @@ -1087,6 +1092,14 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION), errmsg("It is not allowed to assign version option for non-dfs table."))); + } else if (pg_strcasecmp(def->defname, "compresstype") == 0) { + hasRowCompressType = true; + } else if (pg_strcasecmp(def->defname, "compress_chunk_size") == 0) { + hasRowCompressChunk = true; + } else if (pg_strcasecmp(def->defname, "compress_prealloc_chunks") == 0) { + hasRowCompressPre = true; + } else if (pg_strcasecmp(def->defname, "compress_level") == 0) { + hasRowCompressLevel = true; } if (pg_strcasecmp(def->defname, "orientation") == 0 && pg_strcasecmp(defGetString(def), ORIENTATION_ORC) == 0) { @@ -1112,6 +1125,25 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel res = lappend(options, def); } + + if ((isCStore || isTsStore || relkind != RELKIND_RELATION || + stmt->relation->relpersistence == RELPERSISTENCE_UNLOGGED || + stmt->relation->relpersistence == RELPERSISTENCE_TEMP || + stmt->relation->relpersistence == RELPERSISTENCE_GLOBAL_TEMP) && + (hasRowCompressType || hasRowCompressChunk || hasRowCompressPre || hasRowCompressLevel)) { + ereport(ERROR, + (errcode(ERRCODE_INVALID_OPTION), + errmsg("only row orientation table support " + "compresstype/compress_chunk_size/compress_prealloc_chunks/compress_level."))); + } + if (!hasRowCompressType && (hasRowCompressChunk || hasRowCompressPre || hasRowCompressLevel)) { + ereport(ERROR, + (errcode(ERRCODE_INVALID_OPTION), + errmsg("compress_chunk_size/compress_prealloc_chunks/compress_level " + "should be used with compresstype."))); + + } + if (isUstore && !isCStore && !hasCompression) { DefElem* def = makeDefElem("compression", (Node *)makeString(COMPRESSION_NO)); res = lappend(options, def); @@ -1147,7 +1179,7 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel DefElem *def1 = makeDefElem("orientation", (Node *)makeString(ORIENTATION_ROW)); res = lcons(def1, options); } - if (!hasCompression) { + 
if (!hasCompression && !hasRowCompressType) { DefElem *def2 = makeDefElem("compression", (Node *)rowCmprOpt); res = lappend(options, def2); } @@ -1986,7 +2018,7 @@ Oid DefineRelation(CreateStmt* stmt, char relkind, Oid ownerId, bool isCTAS) /* Add default options for relation if need. */ if (!dfsTablespace) { if (!u_sess->attr.attr_common.IsInplaceUpgrade) { - stmt->options = AddDefaultOptionsIfNeed(stmt->options, relkind, stmt->row_compress, namespaceId); + stmt->options = AddDefaultOptionsIfNeed(stmt->options, relkind, stmt, namespaceId); } } else { checkObjectCreatedinHDFSTblspc(stmt, relkind); @@ -2221,10 +2253,13 @@ Oid DefineRelation(CreateStmt* stmt, char relkind, Oid ownerId, bool isCTAS) ereport(LOG, (errmodule(MOD_TIMESERIES), errmsg("use implicit distribution column method."))); } } else if (pg_strcasecmp(storeChar, TABLE_ACCESS_METHOD_USTORE) == 0) { - if (pg_strcasecmp(COMPRESSION_NO, StdRdOptionsGetStringData(std_opt, compression, COMPRESSION_NO)) != 0 || + auto compression = StdRdOptionsGetStringData(std_opt, compression, COMPRESSION_NO); + auto orientation = StdRdOptionsGetStringData(std_opt, orientation, ORIENTATION_ROW); + if ((pg_strcasecmp(COMPRESSION_NO, compression) != 0 && + pg_strcasecmp(ORIENTATION_COLUMN, orientation) == 0) || IsCompressedByCmprsInPgclass((RelCompressType)stmt->row_compress)) { ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("UStore tables do not support compression."))); + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("UStore tables do not support compression."))); } ForbidToSetOptionsForRowTbl(stmt->options); ForbidToSetOptionsForUstoreTbl(stmt->options); @@ -14160,6 +14195,50 @@ static void ATExecSetRelOptionsToast(Oid toastid, List* defList, AlterTableType heap_close(pgclass, RowExclusiveLock); } +/** + * Do not modify compression parameters. + */ +void static CheckSupportModifyCompression(Relation rel, bytea* relOoption) +{ + if (!relOoption || !REL_SUPPORT_COMPRESSED(rel)) { + return; + } + PageCompressOpts* newCompressOpt = &(((StdRdOptions*)relOoption)->compress); + RelFileCompressOption current; + TransCompressOptions(rel->rd_node, ¤t); + if (newCompressOpt) { + int1 algorithm = newCompressOpt->compressType; + if (algorithm != current.compressAlgorithm) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("change compresstype OPTION is not supported"))); + } + if (current.compressAlgorithm != COMPRESS_TYPE_NONE && + newCompressOpt->compressChunkSize != CHUNK_SIZE_LIST[current.compressChunkSize]) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("change compress_chunk_size OPTION is not supported"))); + } + } else { + if (current.compressAlgorithm != COMPRESS_TYPE_NONE) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("change compresstype OPTION is not supported"))); + } + } + + /* + * forbid modify partition CompressOption + */ + if (HEAP_IS_PARTITIONED(rel)) { + if ((int)current.compressLevel != newCompressOpt->compressLevel) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("change partition compressLevel OPTION is not supported"))); + } + if ((int)current.compressPreallocChunks != newCompressOpt->compressPreallocChunks) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("change partition compress_prealloc_chunks OPTION is not supported"))); + } + } +} + /* * Set, reset, or replace reloptions. 
*/ @@ -14297,6 +14376,7 @@ static void ATExecSetRelOptions(Relation rel, List* defList, AlterTableType oper } /* Validate */ + bytea* relOpt = NULL; switch (rel->rd_rel->relkind) { case RELKIND_RELATION: { /* this options only can be used when define a new relation. @@ -14305,6 +14385,7 @@ static void ATExecSetRelOptions(Relation rel, List* defList, AlterTableType oper ForbidUserToSetDefinedOptions(defList); bytea* heapRelOpt = heap_reloptions(rel->rd_rel->relkind, newOptions, true); + relOpt = heapRelOpt; const char* algo = RelationGetAlgo(rel); if (RelationIsColStore(rel)) { /* un-supported options. dont care its values */ @@ -14338,17 +14419,20 @@ static void ATExecSetRelOptions(Relation rel, List* defList, AlterTableType oper break; } case RELKIND_INDEX: - case RELKIND_GLOBAL_INDEX: + case RELKIND_GLOBAL_INDEX: { ForbidUserToSetDefinedIndexOptions(defList); - (void)index_reloptions(rel->rd_am->amoptions, newOptions, true); + relOpt = index_reloptions(rel->rd_am->amoptions, newOptions, true); break; + } default: ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a table, view, materialized view, index, or TOAST table", RelationGetRelationName(rel)))); break; } - + + CheckSupportModifyCompression(rel, relOpt); + /* * All we need do here is update the pg_class row; the new options will be * propagated into relcaches during post-commit cache inval. @@ -21062,6 +21146,11 @@ static void checkCompressForExchange(Relation partTableRel, Relation ordTableRel (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tables in ALTER TABLE EXCHANGE PARTITION must have the same type of compress"))); } + if (partTableRel->rd_node.opt != ordTableRel->rd_node.opt) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("tables in ALTER TABLE EXCHANGE PARTITION must have the same type of compress"))); + } } // Description : Check number, type of column @@ -22705,9 +22794,16 @@ static char* GenTemporaryPartitionName(Relation partTableRel, int sequence) return pstrdup(tmpName); } +#ifndef ENABLE_MULTIPLE_NODES static Oid GetNewPartitionOid(Relation pgPartRel, Relation partTableRel, Node *partDef, Oid bucketOid, bool *isTimestamptz, StorageType stype, Datum new_reloptions) { +#else +static Oid GetNewPartitionOid(Relation pgPartRel, Relation partTableRel, Node *partDef, + Oid bucketOid, bool *isTimestamptz, StorageType stype) +{ + Datum new_reloptions = (Datum)0; +#endif Oid newPartOid = InvalidOid; switch (nodeTag(partDef)) { case T_RangePartitionDefState: @@ -22729,7 +22825,7 @@ static Oid GetNewPartitionOid(Relation pgPartRel, Relation partTableRel, Node *p bucketOid, (ListPartitionDefState *)partDef, partTableRel->rd_rel->relowner, - (Datum)0, + new_reloptions, isTimestamptz, stype); break; @@ -22740,7 +22836,7 @@ static Oid GetNewPartitionOid(Relation pgPartRel, Relation partTableRel, Node *p bucketOid, (HashPartitionDefState *)partDef, partTableRel->rd_rel->relowner, - (Datum)0, + new_reloptions, isTimestamptz, stype); break; @@ -22793,8 +22889,13 @@ static Oid AddTemporaryPartition(Relation partTableRel, Node* partDef) } /* Temporary tables do not use segment-page */ +#ifndef ENABLE_MULTIPLE_NODES newPartOid = GetNewPartitionOid(pgPartRel, partTableRel, partDef, bucketOid, isTimestamptz, RelationGetStorageType(partTableRel), new_reloptions); +#else + newPartOid = GetNewPartitionOid( + pgPartRel, partTableRel, partDef, bucketOid, isTimestamptz, RelationGetStorageType(partTableRel)); +#endif // We must bump the command counter to make the newly-created // partition tuple 
visible for opening. @@ -23054,7 +23155,7 @@ static void fastAddPartition(Relation partTableRel, List* destPartDefList, List* bucketOid = RelationGetBucketOid(partTableRel); pgPartRel = relation_open(PartitionRelationId, RowExclusiveLock); - +#ifndef ENABLE_MULTIPLE_NODES bool isNull = false; HeapTuple tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(partTableRel->rd_id)); Datum relOptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, &isNull); @@ -23062,6 +23163,7 @@ static void fastAddPartition(Relation partTableRel, List* destPartDefList, List* Datum newRelOptions = transformRelOptions((Datum)0, oldRelOptions, NULL, NULL, false, false); ReleaseSysCache(tuple); list_free_ext(oldRelOptions); +#endif foreach (cell, destPartDefList) { RangePartitionDefState* partDef = (RangePartitionDefState*)lfirst(cell); @@ -23072,7 +23174,11 @@ static void fastAddPartition(Relation partTableRel, List* destPartDefList, List* bucketOid, partDef, partTableRel->rd_rel->relowner, +#ifndef ENABLE_MULTIPLE_NODES (Datum)newRelOptions, +#else + (Datum)0, +#endif isTimestamptz, RelationGetStorageType(partTableRel), AccessExclusiveLock); diff --git a/src/gausskernel/process/postmaster/pagewriter.cpp b/src/gausskernel/process/postmaster/pagewriter.cpp index 75dcd0969..4365635ed 100755 --- a/src/gausskernel/process/postmaster/pagewriter.cpp +++ b/src/gausskernel/process/postmaster/pagewriter.cpp @@ -518,7 +518,7 @@ try_get_buf: item->bucketNode = buf_desc->tag.rnode.bucketNode; item->forkNum = buf_desc->tag.forkNum; item->blockNum = buf_desc->tag.blockNum; - if(IsSegmentFileNode(buf_desc->tag.rnode)) { + if(IsSegmentFileNode(buf_desc->tag.rnode) || buf_desc->tag.rnode.opt != 0) { *is_new_relfilenode = true; } } else { diff --git a/src/gausskernel/process/postmaster/pgstat.cpp b/src/gausskernel/process/postmaster/pgstat.cpp index af9d3e41f..ab9874731 100644 --- a/src/gausskernel/process/postmaster/pgstat.cpp +++ b/src/gausskernel/process/postmaster/pgstat.cpp @@ -4470,6 +4470,12 @@ const char* pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_LOGCTRL_SLEEP: event_name = "LOGCTRL_SLEEP"; break; + case WAIT_EVENT_COMPRESS_ADDRESS_FILE_FLUSH: + event_name = "PCA_FLUSH"; + break; + case WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC: + event_name = "PCA_SYNC"; + break; /* no default case, so that compiler will warn */ case IO_EVENT_NUM: break; diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp index bcfbafe85..08734ad74 100644 --- a/src/gausskernel/process/postmaster/postmaster.cpp +++ b/src/gausskernel/process/postmaster/postmaster.cpp @@ -2068,6 +2068,8 @@ int PostmasterMain(int argc, char* argv[]) ngroup_info_hash_create(); /*init Role id hash table*/ InitRoleIdHashTable(); + /* pcmap */ + RealInitialMMapLockArray(); /* init unique sql */ InitUniqueSQL(); /* init hypo index */ diff --git a/src/gausskernel/storage/access/common/reloptions.cpp b/src/gausskernel/storage/access/common/reloptions.cpp index a1d80c4ef..c699f4753 100644 --- a/src/gausskernel/storage/access/common/reloptions.cpp +++ b/src/gausskernel/storage/access/common/reloptions.cpp @@ -115,6 +115,10 @@ static relopt_bool boolRelOpts[] = { {{ "on_commit_delete_rows", "global temp table on commit options", RELOPT_KIND_HEAP}, true}, {{ "crossbucket", "Enables cross bucket index creation in this index relation", RELOPT_KIND_BTREE}, false }, {{ "enable_tde", "enable table's level transparent data encryption", RELOPT_KIND_HEAP }, false }, + {{ "compress_byte_convert", "Whether do byte convert in 
compression", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, + false }, + {{ "compress_diff_convert", "Whether do diiffer convert in compression", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, + false }, /* list terminator */ {{NULL}} }; @@ -235,6 +239,16 @@ static relopt_int intRelOpts[] = { }, -1, 0, 32 }, + {{ "compress_level", "Level of page compression.", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, 0, -31, 31}, + {{ "compresstype", "compress type (none, pglz or zstd).", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, 0, 0, 2}, + {{ "compress_chunk_size", "Size of chunk to store compressed page.", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, + BLCKSZ / 2, + BLCKSZ / 16, + BLCKSZ / 2}, + {{ "compress_prealloc_chunks", "Number of prealloced chunks for each block.", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, + 0, + 0, + 7}, /* list terminator */ {{NULL}} }; @@ -1934,6 +1948,18 @@ bytea *default_reloptions(Datum reloptions, bool validate, relopt_kind kind) { "cmk_id", RELOPT_TYPE_STRING, offsetof(StdRdOptions, cmk_id)}, { "encrypt_algo", RELOPT_TYPE_STRING, offsetof(StdRdOptions, encrypt_algo)}, { "enable_tde", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, enable_tde)}, + { "compresstype", RELOPT_TYPE_INT, + offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressType)}, + { "compress_level", RELOPT_TYPE_INT, + offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressLevel)}, + { "compress_chunk_size", RELOPT_TYPE_INT, + offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressChunkSize)}, + {"compress_prealloc_chunks", RELOPT_TYPE_INT, + offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressPreallocChunks)}, + { "compress_byte_convert", RELOPT_TYPE_BOOL, + offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressByteConvert)}, + { "compress_diff_convert", RELOPT_TYPE_BOOL, + offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressDiffConvert)}, }; options = parseRelOptions(reloptions, validate, kind, &numoptions); diff --git a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp index bdf415321..6bccee587 100644 --- a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp +++ b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp @@ -177,6 +177,9 @@ bool XLogBlockRefreshRedoBufferInfo(XLogBlockHead *blockhead, RedoBufferInfo *bu if (bufferinfo->blockinfo.rnode.relNode != XLogBlockHeadGetRelNode(blockhead)) { return false; } + if (bufferinfo->blockinfo.rnode.opt != XLogBlockHeadGetCompressOpt(blockhead)) { + return false; + } if (bufferinfo->blockinfo.forknum != XLogBlockHeadGetForkNum(blockhead)) { return false; } @@ -200,6 +203,7 @@ void XLogBlockInitRedoBlockInfo(XLogBlockHead *blockhead, RedoBufferTag *blockin blockinfo->rnode.dbNode = XLogBlockHeadGetDbNode(blockhead); blockinfo->rnode.relNode = XLogBlockHeadGetRelNode(blockhead); blockinfo->rnode.bucketNode = XLogBlockHeadGetBucketId(blockhead); + blockinfo->rnode.opt = XLogBlockHeadGetCompressOpt(blockhead); blockinfo->forknum = XLogBlockHeadGetForkNum(blockhead); blockinfo->blkno = XLogBlockHeadGetBlockNum(blockhead); blockinfo->pblk = XLogBlockHeadGetPhysicalBlock(blockhead); @@ -272,7 +276,7 @@ void XLogRecSetBlockCommonState(XLogReaderState *record, XLogBlockParseEnum bloc blockparse->blockhead.spcNode = filenode.rnode.node.spcNode; blockparse->blockhead.dbNode = filenode.rnode.node.dbNode; blockparse->blockhead.bucketNode = filenode.rnode.node.bucketNode; - + blockparse->blockhead.opt = filenode.rnode.node.opt; 
blockparse->blockhead.blkno = filenode.segno; blockparse->blockhead.forknum = filenode.forknumber; @@ -1361,7 +1365,7 @@ void XLogBlockDdlCommonRedo(XLogBlockHead *blockhead, void *blockrecbody, RedoBu rnode.dbNode = blockhead->dbNode; rnode.relNode = blockhead->relNode; rnode.bucketNode = blockhead->bucketNode; - + rnode.opt = blockhead->opt; switch (blockddlrec->blockddltype) { case BLOCK_DDL_CREATE_RELNODE: smgr_redo_create(rnode, blockhead->forknum, blockddlrec->mainData); @@ -1430,7 +1434,7 @@ void XLogBlockSegDdlDoRealAction(XLogBlockHead* blockhead, void* blockrecbody, R rnode.dbNode = blockhead->dbNode; rnode.relNode = blockhead->relNode; rnode.bucketNode = blockhead->bucketNode; - + rnode.opt = blockhead->opt; switch (segddlrec->blockddlrec.blockddltype) { case BLOCK_DDL_TRUNCATE_RELNODE: xlog_block_segpage_redo_truncate(rnode, blockhead, segddlrec); @@ -1455,7 +1459,7 @@ void XLogBlockDdlDoSmgrAction(XLogBlockHead *blockhead, void *blockrecbody, Redo rnode.dbNode = blockhead->dbNode; rnode.relNode = blockhead->relNode; rnode.bucketNode = blockhead->bucketNode; - + rnode.opt = blockhead->opt; switch (blockddlrec->blockddltype) { case BLOCK_DDL_CREATE_RELNODE: smgr_redo_create(rnode, blockhead->forknum, blockddlrec->mainData); diff --git a/src/gausskernel/storage/access/transam/double_write.cpp b/src/gausskernel/storage/access/transam/double_write.cpp index 1b54d080c..650db59d5 100644 --- a/src/gausskernel/storage/access/transam/double_write.cpp +++ b/src/gausskernel/storage/access/transam/double_write.cpp @@ -281,7 +281,11 @@ inline void dw_prepare_page(dw_batch_t *batch, uint16 page_num, uint16 page_id, if (t_thrd.proc->workingVersionNum < DW_SUPPORT_SINGLE_FLUSH_VERSION) { page_num = page_num | IS_HASH_BKT_SEGPAGE_MASK; } - batch->buftag_ver = HASHBUCKET_TAG; + if (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION) { + batch->buftag_ver = HASHBUCKET_TAG; + } else { + batch->buftag_ver = PAGE_COMPRESS_TAG; + } } else { batch->buftag_ver = ORIGIN_TAG; } @@ -304,7 +308,7 @@ static void dw_prepare_file_head(char *file_head, uint16 start, uint16 dwn) curr_head->head.page_id = 0; curr_head->head.dwn = dwn; curr_head->start = start; - curr_head->buftag_version = HASHBUCKET_TAG; + curr_head->buftag_version = PAGE_COMPRESS_TAG; curr_head->tail.dwn = dwn; dw_calc_file_head_checksum(curr_head); } @@ -430,15 +434,21 @@ static void dw_recover_pages(T1 *batch, T2 *buf_tag, PageHeader data_page, BufTa for (i = 0; i < GET_REL_PGAENUM(batch->page_num); i++) { buf_tag = &batch->buf_tag[i]; + relnode.dbNode = buf_tag->rnode.dbNode; + relnode.spcNode = buf_tag->rnode.spcNode; + relnode.relNode = buf_tag->rnode.relNode; if (tag_ver == HASHBUCKET_TAG) { - relnode.dbNode = buf_tag->rnode.dbNode; - relnode.spcNode = buf_tag->rnode.spcNode; - relnode.relNode = buf_tag->rnode.relNode; + relnode.opt = 0; + // 2 bytes are used for bucketNode. 
+ relnode.bucketNode = (int2)((BufferTagSecondVer *)buf_tag)->rnode.bucketNode; + } else if (tag_ver == PAGE_COMPRESS_TAG) { + relnode.opt = ((BufferTag *)buf_tag)->rnode.opt; relnode.bucketNode = ((BufferTag *)buf_tag)->rnode.bucketNode; } else { relnode.dbNode = buf_tag->rnode.dbNode; relnode.spcNode = buf_tag->rnode.spcNode; relnode.relNode = buf_tag->rnode.relNode; + relnode.opt = 0; relnode.bucketNode = InvalidBktId; } relation = smgropen(relnode, InvalidBackendId, GetColumnNum(buf_tag->forkNum)); @@ -757,7 +767,10 @@ static void dw_recover_partial_write(knl_g_dw_context *cxt) } if (t_thrd.proc->workingVersionNum < DW_SUPPORT_SINGLE_FLUSH_VERSION) { bool is_hashbucket = ((curr_head->page_num & IS_HASH_BKT_SEGPAGE_MASK) != 0); - curr_head->buftag_ver = is_hashbucket ? HASHBUCKET_TAG : ORIGIN_TAG; + curr_head->buftag_ver = is_hashbucket ? + (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION ? HASHBUCKET_TAG + : PAGE_COMPRESS_TAG) + : ORIGIN_TAG; } remain_pages = read_asst.buf_end - read_asst.buf_start; @@ -1988,9 +2001,9 @@ int buftag_compare(const void *pa, const void *pb) static inline void dw_log_recovery_page(int elevel, const char *state, BufferTag buf_tag) { ereport(elevel, (errmodule(MOD_DW), - errmsg("[single flush] recovery, %s: buf_tag[rel %u/%u/%u blk %u fork %d]", + errmsg("[single flush] recovery, %s: buf_tag[rel %u/%u/%u blk %u fork %d], compress: %u", state, buf_tag.rnode.spcNode, buf_tag.rnode.dbNode, buf_tag.rnode.relNode, buf_tag.blockNum, - buf_tag.forkNum))); + buf_tag.forkNum, buf_tag.rnode.opt))); } void dw_recovery_page_single(const dw_single_flush_item *item, uint16 item_num) diff --git a/src/gausskernel/storage/access/transam/extreme_rto/batch_redo.cpp b/src/gausskernel/storage/access/transam/extreme_rto/batch_redo.cpp index 3d211143b..5f4396d7c 100644 --- a/src/gausskernel/storage/access/transam/extreme_rto/batch_redo.cpp +++ b/src/gausskernel/storage/access/transam/extreme_rto/batch_redo.cpp @@ -53,6 +53,7 @@ static inline void PRXLogRecGetBlockTag(XLogRecParseState *recordBlockState, Rel rnode->relNode = blockparse->blockhead.relNode; rnode->spcNode = blockparse->blockhead.spcNode; rnode->bucketNode = blockparse->blockhead.bucketNode; + rnode->opt = blockparse->blockhead.opt; } if (blknum != NULL) { *blknum = blockparse->blockhead.blkno; @@ -223,7 +224,7 @@ void PRTrackRelStorageDrop(XLogRecParseState *recordBlockState, HTAB *redoItemHa rNode.dbNode = blockparse->blockhead.dbNode; rNode.relNode = blockparse->blockhead.relNode; rNode.bucketNode = blockparse->blockhead.bucketNode; - + rNode.opt = blockparse->blockhead.opt; #ifdef USE_ASSERT_CHECKING ereport(LOG, (errmsg("PRTrackRelTruncate:(%X/%X)clear relation %u/%u/%u forknum %u record", (uint32)(blockparse->blockhead.end_ptr >> 32), (uint32)(blockparse->blockhead.end_ptr), diff --git a/src/gausskernel/storage/access/transam/extreme_rto/page_redo.cpp b/src/gausskernel/storage/access/transam/extreme_rto/page_redo.cpp index ceda89105..fd83c6ce9 100755 --- a/src/gausskernel/storage/access/transam/extreme_rto/page_redo.cpp +++ b/src/gausskernel/storage/access/transam/extreme_rto/page_redo.cpp @@ -1364,6 +1364,7 @@ void RedoPageWorkerRedoBcmBlock(XLogRecParseState *procState) node.dbNode = procState->blockparse.blockhead.dbNode; node.relNode = procState->blockparse.blockhead.relNode; node.bucketNode = procState->blockparse.blockhead.bucketNode; + node.opt = procState->blockparse.blockhead.opt; XLogBlockNewCuParse *newCuParse = &(procState->blockparse.extra_rec.blocknewcu); uint8 info = 
XLogBlockHeadGetInfo(&procState->blockparse.blockhead) & ~XLR_INFO_MASK; switch (info & XLOG_HEAP_OPMASK) { diff --git a/src/gausskernel/storage/access/transam/xloginsert.cpp b/src/gausskernel/storage/access/transam/xloginsert.cpp index 6d971d169..a840e42ea 100755 --- a/src/gausskernel/storage/access/transam/xloginsert.cpp +++ b/src/gausskernel/storage/access/transam/xloginsert.cpp @@ -479,7 +479,8 @@ XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, bool isupgrade, int bucket_id, bo * The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are * reserved for use by me. */ - if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE | XLR_BTREE_UPGRADE_FLAG | XLR_IS_TOAST)) != 0) { + if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE | + XLR_BTREE_UPGRADE_FLAG | XLR_REL_COMPRESS | XLR_IS_TOAST)) != 0) { ereport(PANIC, (errmsg("invalid xlog info mask %hhx", info))); } @@ -682,6 +683,12 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogFPWInfo fpw_ bool samerel = false; bool tde = false; + // must be uncompressed table during upgrade + bool isCompressedTable = regbuf->rnode.opt != 0; + if (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION) { + Assert(!isCompressedTable); + } + if (!regbuf->in_use) continue; @@ -829,7 +836,7 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogFPWInfo fpw_ samerel = false; prev_regbuf = regbuf; - if (!samerel && IsSegmentFileNode(regbuf->rnode)) { + if (!samerel && (IsSegmentFileNode(regbuf->rnode) || isCompressedTable)) { Assert(bkpb.id <= XLR_MAX_BLOCK_ID); bkpb.id += BKID_HAS_BUCKET_OR_SEGPAGE; } @@ -845,9 +852,20 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogFPWInfo fpw_ } if (!samerel) { - if (IsSegmentFileNode(regbuf->rnode)) { - XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNode), ®buf->rnode, remained_size); - hashbucket_flag = true; + if (IsSegmentFileNode(regbuf->rnode) || isCompressedTable) { + if (IsSegmentFileNode(regbuf->rnode)) { + XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNode), ®buf->rnode, remained_size); + hashbucket_flag = true; + } + if (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION) { + Assert(!isCompressedTable); + RelFileNodeV2 relFileNodeV2; + RelFileNodeV2Copy(relFileNodeV2, regbuf->rnode); + XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNodeV2), ®buf->rnode, remained_size); + } else { + info |= XLR_REL_COMPRESS; + XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNode), ®buf->rnode, remained_size); + } } else { XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNodeOld), ®buf->rnode, remained_size); no_hashbucket_flag = true; diff --git a/src/gausskernel/storage/access/transam/xlogreader.cpp b/src/gausskernel/storage/access/transam/xlogreader.cpp index 603724ea3..be095a5b7 100644 --- a/src/gausskernel/storage/access/transam/xlogreader.cpp +++ b/src/gausskernel/storage/access/transam/xlogreader.cpp @@ -1214,6 +1214,18 @@ void ResetDecoder(XLogReaderState *state) remaining -= sizeof(type); \ } while (0) +/** + * happens during the upgrade, copy the RelFileNodeV2 to RelFileNode + * support little-endian system + * @param relfileNode relfileNode + */ +static void CompressTableRecord(RelFileNode* relfileNode) +{ + if (relfileNode->bucketNode <= -1 && relfileNode->opt == 0xFFFF) { + relfileNode->opt = 0; + } +} + /* * Decode the previously read record. 
* @@ -1333,8 +1345,11 @@ bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errorms if (remaining < filenodelen) goto shortdata_err; blk->rnode.bucketNode = InvalidBktId; + blk->rnode.opt = 0; errno_t rc = memcpy_s(&blk->rnode, filenodelen, ptr, filenodelen); securec_check(rc, "\0", "\0"); + /* support decode old version of relfileNode */ + CompressTableRecord(&blk->rnode); ptr += filenodelen; remaining -= filenodelen; diff --git a/src/gausskernel/storage/access/transam/xlogutils.cpp b/src/gausskernel/storage/access/transam/xlogutils.cpp index 81b61cfc5..c19ec6dfd 100644 --- a/src/gausskernel/storage/access/transam/xlogutils.cpp +++ b/src/gausskernel/storage/access/transam/xlogutils.cpp @@ -1139,7 +1139,7 @@ void XLogDropBktRowRelation(XLogRecParseState *redoblockstate) rnode.spcNode = redoblockstate->blockparse.blockhead.spcNode; rnode.dbNode = redoblockstate->blockparse.blockhead.dbNode; rnode.relNode = redoblockstate->blockparse.blockhead.relNode; - + rnode.opt = redoblockstate->blockparse.blockhead.opt; uint32 *bktmap = (uint32 *)redoblockstate->blockparse.extra_rec.blockddlrec.mainData; for (uint32 bktNode = 0; bktNode < MAX_BUCKETMAPLEN; bktNode++) { if (!GET_BKT_MAP_BIT(bktmap, bktNode)) { @@ -1163,6 +1163,7 @@ void XLogForgetDDLRedo(XLogRecParseState *redoblockstate) relNode.dbNode = redoblockstate->blockparse.blockhead.dbNode; relNode.relNode = redoblockstate->blockparse.blockhead.relNode; relNode.bucketNode = redoblockstate->blockparse.blockhead.bucketNode; + relNode.opt = redoblockstate->blockparse.blockhead.opt; XlogDropRowReation(relNode); } } else if (ddlrecparse->blockddltype == BLOCK_DDL_TRUNCATE_RELNODE) { @@ -1171,6 +1172,7 @@ void XLogForgetDDLRedo(XLogRecParseState *redoblockstate) relNode.dbNode = redoblockstate->blockparse.blockhead.dbNode; relNode.relNode = redoblockstate->blockparse.blockhead.relNode; relNode.bucketNode = redoblockstate->blockparse.blockhead.bucketNode; + relNode.opt = redoblockstate->blockparse.blockhead.opt; XLogTruncateRelation(relNode, redoblockstate->blockparse.blockhead.forknum, redoblockstate->blockparse.blockhead.blkno); } @@ -1182,7 +1184,8 @@ void XLogDropSpaceShrink(XLogRecParseState *redoblockstate) .spcNode = redoblockstate->blockparse.blockhead.spcNode, .dbNode = redoblockstate->blockparse.blockhead.dbNode, .relNode = redoblockstate->blockparse.blockhead.relNode, - .bucketNode = redoblockstate->blockparse.blockhead.bucketNode + .bucketNode = redoblockstate->blockparse.blockhead.bucketNode, + .opt = redoblockstate->blockparse.blockhead.opt }; ForkNumber forknum = redoblockstate->blockparse.blockhead.forknum; BlockNumber target_size = redoblockstate->blockparse.blockhead.blkno; diff --git a/src/gausskernel/storage/access/ustore/knl_uextremeredo.cpp b/src/gausskernel/storage/access/ustore/knl_uextremeredo.cpp index 0e1bf2f27..2842e4415 100644 --- a/src/gausskernel/storage/access/ustore/knl_uextremeredo.cpp +++ b/src/gausskernel/storage/access/ustore/knl_uextremeredo.cpp @@ -1389,6 +1389,7 @@ static void UHeapXlogUpdateBlock(XLogBlockHead *blockhead, XLogBlockDataParse *b rnode.dbNode = blockhead->dbNode; rnode.relNode = blockhead->relNode; rnode.bucketNode = blockhead->bucketNode; + rnode.opt = blockhead->opt; XLogRecordPageWithFreeSpace(rnode, bufferinfo->blockinfo.blkno, freespace); } } else { diff --git a/src/gausskernel/storage/buffer/bufmgr.cpp b/src/gausskernel/storage/buffer/bufmgr.cpp index 93ced57a5..13beb7601 100644 --- a/src/gausskernel/storage/buffer/bufmgr.cpp +++ 
b/src/gausskernel/storage/buffer/bufmgr.cpp @@ -6026,8 +6026,11 @@ void shared_buffer_write_error_callback(void *arg) /* Buffer is pinned, so we can read the tag without locking the spinlock */ if (buf_desc != NULL) { char *path = relpathperm(((BufferDesc *)buf_desc)->tag.rnode, ((BufferDesc *)buf_desc)->tag.forkNum); - - (void)errcontext("writing block %u of relation %s", buf_desc->tag.blockNum, path); + if (((BufferDesc *)buf_desc)->tag.rnode.opt) { + (void)errcontext("writing block %u of relation %s_pcd", buf_desc->tag.blockNum, path); + } else { + (void)errcontext("writing block %u of relation %s", buf_desc->tag.blockNum, path); + } pfree(path); } } @@ -6382,7 +6385,7 @@ retry: PROFILING_REMOTE_START(); int ret_code = RemoteGetPage(remote_address, rnode.node.spcNode, rnode.node.dbNode, rnode.node.relNode, - rnode.node.bucketNode, fork_num, block_num, BLCKSZ, cur_lsn, buf); + rnode.node.bucketNode, rnode.node.opt, fork_num, block_num, BLCKSZ, cur_lsn, buf); PROFILING_REMOTE_END_READ(BLCKSZ, (ret_code == REMOTE_READ_OK)); diff --git a/src/gausskernel/storage/file/fd.cpp b/src/gausskernel/storage/file/fd.cpp index 45f543246..817a45787 100644 --- a/src/gausskernel/storage/file/fd.cpp +++ b/src/gausskernel/storage/file/fd.cpp @@ -191,6 +191,16 @@ static pthread_mutex_t VFDLockArray[NUM_VFD_PARTITIONS]; #define VFDMappingPartitionLock(hashcode) \ (&VFDLockArray[VFDTableHashPartition(hashcode)]) +/* + * pc_munmap + */ +#define SAFE_MUNMAP(vfdP) \ + do { \ + if ((vfdP)->with_pcmap && (vfdP)->pcmap != NULL) { \ + UnReferenceAddrFile((vfdP)); \ + (vfdP)->pcmap = NULL; \ + } \ + } while (0) /* -------------------- * * Private Routines @@ -344,11 +354,13 @@ RelFileNodeForkNum RelFileNodeForkNumFill(RelFileNode* rnode, filenode.rnode.node.spcNode = rnode->spcNode; filenode.rnode.node.dbNode = rnode->dbNode; filenode.rnode.node.bucketNode = rnode->bucketNode; + filenode.rnode.node.opt = rnode->opt; } else { filenode.rnode.node.relNode = InvalidOid; filenode.rnode.node.spcNode = InvalidOid; filenode.rnode.node.dbNode = InvalidOid; filenode.rnode.node.bucketNode = InvalidBktId; + filenode.rnode.node.opt = 0; } filenode.rnode.backend = backend; @@ -898,6 +910,7 @@ static void LruDelete(File file) vfdP = &u_sess->storage_cxt.VfdCache[file]; + SAFE_MUNMAP(vfdP); /* delete the vfd record from the LRU ring */ Delete(file); @@ -1669,6 +1682,8 @@ void FileCloseWithThief(File file) { Vfd* vfdP = &u_sess->storage_cxt.VfdCache[file]; if (!FileIsNotOpen(file)) { + SAFE_MUNMAP(vfdP); + /* remove the file from the lru ring */ Delete(file); /* the thief has close the real fd */ @@ -1807,6 +1822,8 @@ void FileClose(File file) vfdP = &u_sess->storage_cxt.VfdCache[file]; if (!FileIsNotOpen(file)) { + SAFE_MUNMAP(vfdP); + /* remove the file from the lru ring */ Delete(file); @@ -3917,3 +3934,71 @@ static void UnlinkIfExistsFname(const char *fname, bool isdir, int elevel) } } +/* + * initialize page compress memory map. 
+ * + */ +void SetupPageCompressMemoryMap(File file, RelFileNode node, const RelFileNodeForkNum& relFileNodeForkNum) +{ + Vfd *vfdP = &u_sess->storage_cxt.VfdCache[file]; + auto chunk_size = CHUNK_SIZE_LIST[GET_COMPRESS_CHUNK_SIZE(node.opt)]; + int returnCode = FileAccess(file); + if (returnCode < 0) { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to open file %s: %m", vfdP->fileName))); + } + RelFileNodeForkNum newOne(relFileNodeForkNum); + newOne.forknumber = PCA_FORKNUM; + PageCompressHeader *map = GetPageCompressHeader(vfdP, chunk_size, newOne); + if (map == (void *) (-1)) { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to mmap page compression address file %s: %m", + vfdP->fileName))); + } + if (map->chunk_size == 0 && map->algorithm == 0) { + map->chunk_size = chunk_size; + map->algorithm = GET_COMPRESS_ALGORITHM(node.opt); + if (pc_msync(map) != 0) { + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", vfdP->fileName))); + } + } + + if (RecoveryInProgress()) { + CheckAndRepairCompressAddress(map, chunk_size, map->algorithm, vfdP->fileName); + } + + vfdP->with_pcmap = true; + vfdP->pcmap = map; +} + +/* + * Return the page compress memory map. + * + */ +PageCompressHeader *GetPageCompressMemoryMap(File file, uint32 chunk_size) +{ + int returnCode; + Vfd *vfdP = &u_sess->storage_cxt.VfdCache[file]; + PageCompressHeader *map = NULL; + + Assert(FileIsValid(file)); + + returnCode = FileAccess(file); + if (returnCode < 0) { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to open file %s: %m", vfdP->fileName))); + } + + Assert(vfdP->with_pcmap); + if (vfdP->pcmap == NULL) { + map = GetPageCompressHeader(vfdP, chunk_size, vfdP->fileNode); + if (map == MAP_FAILED) { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg( + "Failed to mmap page compression address file %s: %m", vfdP->fileName))); + } + + vfdP->with_pcmap = true; + vfdP->pcmap = map; + } + + return vfdP->pcmap; +} \ No newline at end of file diff --git a/src/gausskernel/storage/remote/remote_adapter.cpp b/src/gausskernel/storage/remote/remote_adapter.cpp index 19232828b..400764f27 100755 --- a/src/gausskernel/storage/remote/remote_adapter.cpp +++ b/src/gausskernel/storage/remote/remote_adapter.cpp @@ -115,7 +115,7 @@ Datum gs_read_block_from_remote(PG_FUNCTION_ARGS) /* if request to read CU block, we use forkNum column to replace colid. 
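A minimal sketch of the first-use initialization SetupPageCompressMemoryMap applies to the mmap'ed pca header: a freshly created file has chunk_size and algorithm both zero and gets stamped with the relation's options, otherwise the persisted values are expected to match. The struct below is a simplified stand-in; the field names mirror the patch but the widths are assumptions.

#include <stdbool.h>
#include <stdint.h>

typedef struct PcaHeaderSketch {
    uint32_t nblocks;          /* logical blocks tracked by this segment */
    uint32_t allocated_chunks; /* chunks handed out so far */
    uint16_t chunk_size;       /* 0 until the file is first initialized */
    uint8_t  algorithm;        /* 0 until the file is first initialized */
} PcaHeaderSketch;

/* Returns true when the header is usable for (chunk_size, algorithm); a brand-new
 * header is initialized the same way the patch does before msync'ing the mapping. */
static bool InitOrCheckPcaHeader(PcaHeaderSketch *hdr, uint16_t chunk_size, uint8_t algorithm)
{
    if (hdr->chunk_size == 0 && hdr->algorithm == 0) {
        hdr->chunk_size = chunk_size;   /* stamp the relation's compression options */
        hdr->algorithm = algorithm;
        return true;                    /* caller flushes the mapping to disk here */
    }
    return hdr->chunk_size == chunk_size && hdr->algorithm == algorithm;
}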
*/ (void)StandbyReadCUforPrimary(spcNode, dbNode, relNode, forkNum, blockNum, blockSize, lsn, &result); } else { - (void)StandbyReadPageforPrimary(spcNode, dbNode, relNode, bucketNode, forkNum, blockNum, blockSize, + (void)StandbyReadPageforPrimary(spcNode, dbNode, relNode, bucketNode, 0, forkNum, blockNum, blockSize, lsn, &result); } @@ -126,6 +126,53 @@ Datum gs_read_block_from_remote(PG_FUNCTION_ARGS) } } +/* + * Read block from buffer from primary, returning it as bytea + */ +Datum gs_read_block_from_remote_compress(PG_FUNCTION_ARGS) +{ + uint32 spcNode; + uint32 dbNode; + uint32 relNode; + int16 bucketNode; + uint16 opt = 0; + int32 forkNum; + uint64 blockNum; + uint32 blockSize; + uint64 lsn; + bool isForCU = false; + bytea* result = NULL; + + if (GetUserId() != BOOTSTRAP_SUPERUSERID) { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be initial account to read files")))); + } + /* handle optional arguments */ + spcNode = PG_GETARG_UINT32(0); + dbNode = PG_GETARG_UINT32(1); + relNode = PG_GETARG_UINT32(2); + bucketNode = PG_GETARG_INT16(3); + opt = PG_GETARG_UINT16(4); + forkNum = PG_GETARG_INT32(5); + blockNum = (uint64)PG_GETARG_TRANSACTIONID(6); + blockSize = PG_GETARG_UINT32(7); + lsn = (uint64)PG_GETARG_TRANSACTIONID(8); + isForCU = PG_GETARG_BOOL(9); + /* get block from local buffer */ + if (isForCU) { + /* if request to read CU block, we use forkNum column to replace colid. */ + (void)StandbyReadCUforPrimary(spcNode, dbNode, relNode, forkNum, blockNum, blockSize, lsn, &result); + } else { + (void)StandbyReadPageforPrimary(spcNode, dbNode, relNode, bucketNode, opt, forkNum, blockNum, blockSize, + lsn, &result); + } + + if (NULL != result) { + PG_RETURN_BYTEA_P(result); + } else { + PG_RETURN_NULL(); + } +} + /* * @Description: read cu for primary * @IN spcnode: tablespace id @@ -203,7 +250,7 @@ int StandbyReadCUforPrimary(uint32 spcnode, uint32 dbnode, uint32 relnode, int32 * @Return: remote read error code * @See also: */ -int StandbyReadPageforPrimary(uint32 spcnode, uint32 dbnode, uint32 relnode, int16 bucketnode, int32 forknum, +int StandbyReadPageforPrimary(uint32 spcnode, uint32 dbnode, uint32 relnode, int16 bucketnode, uint2 opt, int32 forknum, uint32 blocknum, uint32 blocksize, uint64 lsn, bytea** pagedata) { Assert(pagedata); @@ -220,7 +267,7 @@ int StandbyReadPageforPrimary(uint32 spcnode, uint32 dbnode, uint32 relnode, int return ret_code; } - RelFileNode relfilenode {spcnode, dbnode, relnode, bucketnode}; + RelFileNode relfilenode {spcnode, dbnode, relnode, bucketnode, opt}; { bytea* pageData = (bytea*)palloc(BLCKSZ + VARHDRSZ); diff --git a/src/gausskernel/storage/replication/basebackup.cpp b/src/gausskernel/storage/replication/basebackup.cpp index ef6e3cf9e..9502dbbc5 100755 --- a/src/gausskernel/storage/replication/basebackup.cpp +++ b/src/gausskernel/storage/replication/basebackup.cpp @@ -33,6 +33,7 @@ #include "access/xlog.h" #include "storage/smgr/fd.h" #include "storage/ipc.h" +#include "storage/page_compression.h" #include "storage/pmsignal.h" #include "storage/checksum.h" #ifdef ENABLE_MOT @@ -112,6 +113,9 @@ static void send_xlog_header(const char *linkpath); static void save_xlogloc(const char *xloglocation); static XLogRecPtr GetMinArchiveSlotLSN(void); +/* compressed Function */ +static void SendCompressedFile(char* readFileName, int basePathLen, struct stat& statbuf, bool missingOk, int64* size); + /* * save xlog location */ @@ -1101,6 +1105,35 @@ static bool IsDCFPath(const char *pathname) return false; } +#define 
SEND_DIR_ADD_SIZE(size, statbuf) ((size) = (size) + (((statbuf).st_size + 511) & ~511) + BUILD_PATH_LEN) + +/** + * send file or compressed file + * @param sizeOnly send or not + * @param pathbuf path + * @param pathBufLen pathLen + * @param basepathlen subfix of path + * @param statbuf path stat + */ +static void SendRealFile(bool sizeOnly, char* pathbuf, size_t pathBufLen, int basepathlen, struct stat* statbuf) +{ + int64 size = 0; + // we must ensure the page integrity when in IncrementalCheckpoint + if (!sizeOnly && g_instance.attr.attr_storage.enableIncrementalCheckpoint && + IsCompressedFile(pathbuf, strlen(pathbuf)) != COMPRESSED_TYPE_UNKNOWN) { + SendCompressedFile(pathbuf, basepathlen, (*statbuf), true, &size); + } else { + bool sent = false; + if (!sizeOnly) { + sent = sendFile(pathbuf, pathbuf + basepathlen + 1, statbuf, true); + } + if (sent || sizeOnly) { + /* Add size, rounded up to 512byte block */ + SEND_DIR_ADD_SIZE(size, (*statbuf)); + } + } +} + /* * Include all files from the given directory in the output tar stream. If * 'sizeonly' is true, we just calculate a total length and return it, without @@ -1393,15 +1426,7 @@ static int64 sendDir(const char *path, int basepathlen, bool sizeonly, List *tab if (!skip_this_dir) size += sendDir(pathbuf, basepathlen, sizeonly, tablespaces, sendtblspclinks); } else if (S_ISREG(statbuf.st_mode)) { - bool sent = false; - - if (!sizeonly) - sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf, true); - - if (sent || sizeonly) { - /* Add size, rounded up to 512byte block */ - size = size + ((statbuf.st_size + 511) & ~511) + BUILD_PATH_LEN; - } + SendRealFile(sizeonly, pathbuf, strlen(pathbuf), basepathlen, &statbuf); } else ereport(WARNING, (errmsg("skipping special file \"%s\"", pathbuf))); } @@ -1528,6 +1553,15 @@ bool is_row_data_file(const char *path, int *segNo, UndoFileType *undoFileType) int nmatch; char *fname = NULL; + /* Skip compressed page files */ + size_t pathLen = strlen(path); + if (pathLen >= 4) { + const char* suffix = path + pathLen - 4; + if (strncmp(suffix, "_pca", 4) == 0 || strncmp(suffix, "_pcd", 4) == 0) { + return false; + } + } + if ((fname = strstr((char *)path, "pg_tblspc/")) != NULL) { nmatch = sscanf_s(fname, "pg_tblspc/%u/%*[^/]/%u/%s", &spcNode, &dbNode, buf, sizeof(buf)); if (nmatch == 3) { @@ -1645,6 +1679,245 @@ static void SendTableSpaceForBackup(basebackup_options* opt, List* tablespaces, } } +/** + * init buf_block if not yet; repalloc PqSendBuffer if necessary + */ +static void SendFilePreInit(void) +{ + if (t_thrd.basebackup_cxt.buf_block == NULL) { + MemoryContext oldcxt = MemoryContextSwitchTo(THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE)); + t_thrd.basebackup_cxt.buf_block = (char *)palloc0(TAR_SEND_SIZE); + MemoryContextSwitchTo(oldcxt); + } + + /* + * repalloc to `MaxBuildAllocSize' in one time, to avoid many small step repalloc in `pq_putmessage_noblock' + * and low performance. 
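The rounding inside SEND_DIR_ADD_SIZE is ordinary tar block padding: every member is counted in 512-byte units. A tiny standalone helper with the same arithmetic (the BUILD_PATH_LEN header accounting is left out):

#include <stdint.h>

#define TAR_BLOCK 512u

static uint64_t TarPaddedSize(uint64_t fileSize)
{
    /* same rounding as ((statbuf.st_size + 511) & ~511) */
    return (fileSize + TAR_BLOCK - 1) & ~(uint64_t)(TAR_BLOCK - 1);
}
/* TarPaddedSize(1) == 512, TarPaddedSize(512) == 512, TarPaddedSize(513) == 1024 */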
+ */ + if (INT2SIZET(t_thrd.libpq_cxt.PqSendBufferSize) < MaxBuildAllocSize) { + t_thrd.libpq_cxt.PqSendBuffer = (char *)repalloc(t_thrd.libpq_cxt.PqSendBuffer, MaxBuildAllocSize); + t_thrd.libpq_cxt.PqSendBufferSize = MaxBuildAllocSize; + } +} + +/** + * check file + * @param readFileName + * @param statbuf + * @param supress error if missingOk is false when file is not found + * @return return null if file.size > MAX_TAR_MEMBER_FILELEN or file cant found + */ +static FILE *SizeCheckAndAllocate(char *readFileName, const struct stat &statbuf, bool missingOk) +{ + /* + * Some compilers will throw a warning knowing this test can never be true + * because pgoff_t can't exceed the compared maximum on their platform. + */ + if (statbuf.st_size > MAX_TAR_MEMBER_FILELEN) { + ereport(WARNING, (errcode(ERRCODE_NAME_TOO_LONG), + errmsg("archive member \"%s\" too large for tar format", readFileName))); + return NULL; + } + + FILE *fp = AllocateFile(readFileName, "rb"); + if (fp == NULL) { + if (errno == ENOENT && missingOk) + return NULL; + ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", readFileName))); + } + return fp; + +} + +static void TransferPcaFile(const char *readFileName, int basePathLen, const struct stat &statbuf, + PageCompressHeader *transfer, + size_t len) +{ + const char *tarfilename = readFileName + basePathLen + 1; + _tarWriteHeader(tarfilename, NULL, (struct stat*)(&statbuf)); + char *data = (char *) transfer; + size_t lenBuffer = len; + while (lenBuffer > 0) { + size_t transferLen = Min(TAR_SEND_SIZE, lenBuffer); + if (pq_putmessage_noblock('d', data, transferLen)) { + ereport(ERROR, (errcode_for_file_access(), errmsg("base backup could not send data, aborting backup"))); + } + data = data + transferLen; + lenBuffer -= transferLen; + } + size_t pad = ((len + 511) & ~511) - len; + if (pad > 0) { + securec_check(memset_s(t_thrd.basebackup_cxt.buf_block, pad, 0, pad), "", ""); + (void) pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, pad); + } +} + +static void FileStat(char* path, struct stat* fileStat) +{ + if (stat(path, fileStat) != 0) { + if (errno != ENOENT) { + ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file or directory \"%s\": %m", path))); + } + } +} + +static void SendCompressedFile(char* readFileName, int basePathLen, struct stat& statbuf, bool missingOk, int64* size) +{ + char* tarfilename = readFileName + basePathLen + 1; + SendFilePreInit(); + FILE* fp = SizeCheckAndAllocate(readFileName, statbuf, missingOk); + if (fp == NULL) { + return; + } + + size_t readFileNameLen = strlen(readFileName); + /* dont send pca file */ + if (readFileNameLen < 4 || strncmp(readFileName + readFileNameLen - 4, "_pca", 4) == 0 || + strncmp(readFileName + readFileNameLen - 4, "_pcd", 4) != 0) { + FreeFile(fp); + return; + } + + char tablePath[MAXPGPATH] = {0}; + securec_check_c(memcpy_s(tablePath, MAXPGPATH, readFileName, readFileNameLen - 4), "", ""); + int segmentNo = 0; + UndoFileType undoFileType = UNDO_INVALID; + if (!is_row_data_file(tablePath, &segmentNo, &undoFileType)) { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("%s is not a relation file.", tablePath))); + } + + char pcaFilePath[MAXPGPATH]; + securec_check_c(memcpy_s(pcaFilePath, MAXPGPATH, readFileName, readFileNameLen), "", ""); + pcaFilePath[readFileNameLen - 1] = 'a'; + + FILE* pcaFile = AllocateFile(pcaFilePath, "rb"); + if (pcaFile == NULL) { + if (errno == ENOENT && missingOk) { + FreeFile(fp); + return; + } + ereport(ERROR, 
(errcode_for_file_access(), errmsg("could not open file \"%s\": %m", pcaFilePath))); + } + + uint16 chunkSize = ReadChunkSize(pcaFile, pcaFilePath, MAXPGPATH); + + struct stat pcaStruct; + FileStat((char*)pcaFilePath, &pcaStruct); + + size_t pcaFileLen = SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunkSize); + PageCompressHeader* map = pc_mmap_real_size(fileno(pcaFile), pcaFileLen, true); + if (map == MAP_FAILED) { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("Failed to mmap page compression address file %s: %m", pcaFilePath))); + } + + PageCompressHeader* transfer = (PageCompressHeader*)palloc0(pcaFileLen); + /* decompressed page buffer, avoid frequent allocation */ + BlockNumber blockNum = 0; + size_t chunkIndex = 1; + off_t totalLen = 0; + off_t sendLen = 0; + /* send the pkg header containing msg like file size */ + BlockNumber totalBlockNum = (BlockNumber)pg_atomic_read_u32(&map->nblocks); + + /* some chunks may have been allocated but not used. + * Reserve 0 chunks for avoiding the error when the size of a compressed block extends */ + auto reservedChunks = 0; + securec_check(memcpy_s(transfer, pcaFileLen, map, pcaFileLen), "", ""); + decltype(statbuf.st_size) realSize = (map->allocated_chunks + reservedChunks) * chunkSize; + statbuf.st_size = statbuf.st_size >= realSize ? statbuf.st_size : realSize; + _tarWriteHeader(tarfilename, NULL, (struct stat*)(&statbuf)); + bool* onlyExtend = (bool*)palloc0(totalBlockNum * sizeof(bool)); + + /* allocated in advance to prevent repeated allocated */ + char pageBuffer[BLCKSZ]; + ReadBlockChunksStruct rbStruct{map, pageBuffer, BLCKSZ, fp, segmentNo, readFileName}; + for (blockNum = 0; blockNum < totalBlockNum; blockNum++) { + PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(transfer, chunkSize, blockNum); + /* skip some blocks which only extends. The size of blocks is 0. */ + if (addr->nchunks == 0) { + onlyExtend[blockNum] = true; + continue; + } + /* read block to t_thrd.basebackup_cxt.buf_block */ + size_t bufferSize = TAR_SEND_SIZE - sendLen; + size_t len = ReadAllChunkOfBlock(t_thrd.basebackup_cxt.buf_block + sendLen, bufferSize, blockNum, rbStruct); + /* merge Blocks */ + sendLen += len; + if (totalLen + (off_t)len > statbuf.st_size) { + ReleaseMap(map, readFileName); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("some blocks in %s had been changed. Retry backup please. PostBlocks:%u, currentReadBlocks " + ":%u, transferSize: %lu. 
totalLen: %lu, len: %lu", + readFileName, + totalBlockNum, + blockNum, + statbuf.st_size, + totalLen, + len))); + } + if (sendLen > TAR_SEND_SIZE - BLCKSZ) { + if (pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, sendLen)) { + ReleaseMap(map, readFileName); + ereport(ERROR, (errcode_for_file_access(), errmsg("base backup could not send data, aborting backup"))); + } + sendLen = 0; + } + uint8 nchunks = len / chunkSize; + addr->nchunks = addr->allocated_chunks = nchunks; + for (size_t i = 0; i < nchunks; i++) { + addr->chunknos[i] = chunkIndex++; + } + addr->checksum = AddrChecksum32(blockNum, addr); + totalLen += len; + } + ReleaseMap(map, readFileName); + + if (sendLen != 0) { + if (pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, sendLen)) { + ereport(ERROR, (errcode_for_file_access(), errmsg("base backup could not send data, aborting backup"))); + } + } + + /* If the file was truncated while we were sending it, pad it with zeros */ + if (totalLen < statbuf.st_size) { + securec_check(memset_s(t_thrd.basebackup_cxt.buf_block, TAR_SEND_SIZE, 0, TAR_SEND_SIZE), "", ""); + while (totalLen < statbuf.st_size) { + size_t cnt = Min(TAR_SEND_SIZE, statbuf.st_size - totalLen); + (void)pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, cnt); + totalLen += cnt; + } + } + + size_t pad = ((totalLen + 511) & ~511) - totalLen; + if (pad > 0) { + securec_check(memset_s(t_thrd.basebackup_cxt.buf_block, pad, 0, pad), "", ""); + (void)pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, pad); + } + SEND_DIR_ADD_SIZE(*size, statbuf); + + // allocate chunks of some pages which only extend + for (size_t blockNum = 0; blockNum < totalBlockNum; ++blockNum) { + if (onlyExtend[blockNum]) { + PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(transfer, chunkSize, blockNum); + for (size_t i = 0; i < addr->allocated_chunks; i++) { + addr->chunknos[i] = chunkIndex++; + } + } + } + transfer->nblocks = transfer->last_synced_nblocks = blockNum; + transfer->last_synced_allocated_chunks = transfer->allocated_chunks = chunkIndex; + TransferPcaFile(pcaFilePath, basePathLen, pcaStruct, transfer, pcaFileLen); + + SEND_DIR_ADD_SIZE(*size, pcaStruct); + FreeFile(pcaFile); + FreeFile(fp); + pfree(transfer); + pfree(onlyExtend); +} + /* * Given the member, write the TAR header & send the file. * @@ -1669,38 +1942,10 @@ static bool sendFile(char *readfilename, char *tarfilename, struct stat *statbuf int retryCnt = 0; UndoFileType undoFileType = UNDO_INVALID; - if (t_thrd.basebackup_cxt.buf_block == NULL) { - MemoryContext oldcxt = NULL; - - oldcxt = MemoryContextSwitchTo(THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE)); - t_thrd.basebackup_cxt.buf_block = (char *)palloc0(TAR_SEND_SIZE); - MemoryContextSwitchTo(oldcxt); - } - - /* - * repalloc to `MaxBuildAllocSize' in one time, to avoid many small step repalloc in `pq_putmessage_noblock' - * and low performance. - */ - if (INT2SIZET(t_thrd.libpq_cxt.PqSendBufferSize) < MaxBuildAllocSize) { - t_thrd.libpq_cxt.PqSendBuffer = (char *)repalloc(t_thrd.libpq_cxt.PqSendBuffer, MaxBuildAllocSize); - t_thrd.libpq_cxt.PqSendBufferSize = MaxBuildAllocSize; - } - - /* - * Some compilers will throw a warning knowing this test can never be true - * because pgoff_t can't exceed the compared maximum on their platform. 
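Because SendCompressedFile streams each block's chunks back to back, the pca copy shipped with the backup has to be renumbered with a plain running counter, which is all the chunkIndex++ bookkeeping above does. A standalone sketch of that renumbering, keeping the 1-based chunk numbering:

#include <stddef.h>
#include <stdint.h>

/* chunksPerBlock[i] = chunks actually sent for block i;
 * outFirstChunk[i] receives the 1-based number of that block's first chunk. */
static uint32_t RenumberChunksSequentially(const uint8_t *chunksPerBlock, size_t nblocks,
                                           uint32_t *outFirstChunk)
{
    uint32_t chunkIndex = 1;
    for (size_t blk = 0; blk < nblocks; blk++) {
        outFirstChunk[blk] = chunkIndex;
        chunkIndex += chunksPerBlock[blk];  /* block blk occupies chunks [first, first + n) */
    }
    return chunkIndex - 1;                  /* total chunks laid out in the streamed pcd */
}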
- */ - if (statbuf->st_size > MAX_TAR_MEMBER_FILELEN) { - ereport(WARNING, (errcode(ERRCODE_NAME_TOO_LONG), - errmsg("archive member \"%s\" too large for tar format", tarfilename))); - return false; - } - - fp = AllocateFile(readfilename, "rb"); + SendFilePreInit(); + fp = SizeCheckAndAllocate(readfilename, *statbuf, missing_ok); if (fp == NULL) { - if (errno == ENOENT && missing_ok) - return false; - ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", readfilename))); + return false; } isNeedCheck = is_row_data_file(readfilename, &segNo, &undoFileType); diff --git a/src/gausskernel/storage/smgr/Makefile b/src/gausskernel/storage/smgr/Makefile index caeaba334..46b25e0cf 100644 --- a/src/gausskernel/storage/smgr/Makefile +++ b/src/gausskernel/storage/smgr/Makefile @@ -9,7 +9,7 @@ ifneq "$(MAKECMDGOALS)" "clean" endif endif endif -OBJS = md.o smgr.o smgrtype.o knl_uundofile.o segstore.o +OBJS = md.o smgr.o smgrtype.o knl_uundofile.o segstore.o page_compression.o mmap_shared.o SUBDIRS = segment diff --git a/src/gausskernel/storage/smgr/md.cpp b/src/gausskernel/storage/smgr/md.cpp index eeb864389..31d1eff42 100644 --- a/src/gausskernel/storage/smgr/md.cpp +++ b/src/gausskernel/storage/smgr/md.cpp @@ -33,6 +33,7 @@ #include "storage/buf/bufmgr.h" #include "storage/smgr/relfilenode.h" #include "storage/copydir.h" +#include "storage/page_compression.h" #include "storage/smgr/knl_usync.h" #include "storage/smgr/smgr.h" #include "utils/aiomem.h" @@ -54,6 +55,13 @@ (tag).segno = (segNo); \ } while (false); +constexpr mode_t FILE_RW_PERMISSION = 0600; + +inline static uint4 PageCompressChunkSize(SMgrRelation reln) +{ + return CHUNK_SIZE_LIST[GET_COMPRESS_CHUNK_SIZE((reln)->smgr_rnode.node.opt)]; +} + /* * The magnetic disk storage manager keeps track of open file * descriptors in its own descriptor pool. 
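For orientation, a sketch of what CHUNK_SIZE_LIST[GET_COMPRESS_CHUNK_SIZE(opt)] amounts to. The four candidate sizes match the checks later in this file (BLCKSZ/2 down to BLCKSZ/16); the bit position used to select the entry is an assumption, not the real macro from page_compression.h.

#include <stdint.h>

#define SKETCH_BLCKSZ 8192u

static const uint32_t kChunkSizeList[4] = {
    SKETCH_BLCKSZ / 2, SKETCH_BLCKSZ / 4, SKETCH_BLCKSZ / 8, SKETCH_BLCKSZ / 16
};

static uint32_t ChunkSizeFromOpt(uint16_t opt)
{
    uint16_t idx = opt & 0x3;       /* hypothetical: low bits of opt index the list */
    return kChunkSizeList[idx];
}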
This is done to make it @@ -96,6 +104,8 @@ */ typedef struct _MdfdVec { File mdfd_vfd; /* fd number in fd.c's pool */ + File mdfd_vfd_pca; /* page compression address file 's fd number in fd.cpp's pool */ + File mdfd_vfd_pcd; /* page compression data file 's fd number in fd.cpp's pool */ BlockNumber mdfd_segno; /* segment number, from 0 */ struct _MdfdVec *mdfd_chain; /* next segment, or NULL */ } MdfdVec; @@ -111,6 +121,10 @@ static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, const MdfdV static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, const MdfdVec *seg); static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, BlockNumber segno); +/* function of compressed table */ +static int sync_pcmap(PageCompressHeader *pcMap, uint32 wait_event_info); + + bool check_unlink_rel_hashtbl(RelFileNode rnode) { HTAB* relfilenode_hashtbl = g_instance.bgwriter_cxt.unlink_rel_hashtbl; @@ -122,6 +136,44 @@ bool check_unlink_rel_hashtbl(RelFileNode rnode) return found; } +static int OpenPcaFile(const char *path, const RelFileNodeBackend &node, const ForkNumber &forkNum, const uint32 &segNo, int oflags = 0) +{ + Assert(node.node.opt != 0 && forkNum == MAIN_FORKNUM); + char dst[MAXPGPATH]; + CopyCompressedPath(dst, path, COMPRESSED_TABLE_PCA_FILE); + uint32 flags = O_RDWR | PG_BINARY | oflags; + return DataFileIdOpenFile(dst, RelFileNodeForkNumFill(node, PCA_FORKNUM, segNo), (int)flags, S_IRUSR | S_IWUSR); +} + +static int OpenPcdFile(const char *path, const RelFileNodeBackend &node, const ForkNumber &forkNum, const uint32 &segNo, int oflags = 0) +{ + Assert(node.node.opt != 0 && forkNum == MAIN_FORKNUM); + char dst[MAXPGPATH]; + CopyCompressedPath(dst, path, COMPRESSED_TABLE_PCD_FILE); + uint32 flags = O_RDWR | PG_BINARY | oflags; + return DataFileIdOpenFile(dst, RelFileNodeForkNumFill(node, PCD_FORKNUM, segNo), (int)flags, S_IRUSR | S_IWUSR); +} + +static void RegisterCompressDirtySegment(SMgrRelation reln, ForkNumber forknum, const MdfdVec *seg) +{ + PageCompressHeader *pcMap = GetPageCompressMemoryMap(seg->mdfd_vfd_pca, PageCompressChunkSize(reln)); + if (sync_pcmap(pcMap, WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC) != 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not fsync file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); + return; + } + ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", + FilePathName(seg->mdfd_vfd_pca)))); + } + if (FileSync(seg->mdfd_vfd_pcd, WAIT_EVENT_DATA_FILE_SYNC) < 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not fsync file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); + return; + } + ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", + FilePathName(seg->mdfd_vfd_pcd)))); + } +} /* * register_dirty_segment() -- Mark a relation segment as needing fsync * @@ -142,14 +194,17 @@ static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, const if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */)) { ereport(DEBUG1, (errmsg("could not forward fsync request because request queue is full"))); - - if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) { - if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { - ereport(DEBUG1, (errmsg("could not fsync file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); - return; + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + RegisterCompressDirtySegment(reln, 
forknum, seg); + } else { + if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not fsync file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); + return; + } + ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", + FilePathName(seg->mdfd_vfd)))); } - ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); } } } @@ -180,6 +235,28 @@ void md_register_forget_request(RelFileNode rnode, ForkNumber forknum, BlockNumb RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */); } +static void allocate_chunk_check(PageCompressAddr *pcAddr, uint32 chunk_size, BlockNumber blocknum, MdfdVec *v) +{ + /* check allocated chunk number */ + Assert(chunk_size == BLCKSZ / 2 || chunk_size == BLCKSZ / 4 || chunk_size == BLCKSZ / 8 || + chunk_size == BLCKSZ / 16); + if (pcAddr->allocated_chunks > BLCKSZ / chunk_size) { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid chunks %u of block %u in file \"%s\"", + pcAddr->allocated_chunks, blocknum, + FilePathName(v->mdfd_vfd_pca)))); + } + + auto maxChunkNumbers = MAX_CHUNK_NUMBER(chunk_size); + for (auto i = 0; i < pcAddr->allocated_chunks; i++) { + if (pcAddr->chunknos[i] <= 0 || pcAddr->chunknos[i] > maxChunkNumbers) { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid chunk number %u of block %u in file \"%s\"", + pcAddr->chunknos[i], blocknum, + FilePathName(v->mdfd_vfd_pca)))); + } + } +} + /* * mdinit() -- Initialize private state for magnetic disk storage manager. */ @@ -205,6 +282,44 @@ bool mdexists(SMgrRelation reln, ForkNumber forkNum, BlockNumber blockNum) return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL); } +static int RetryDataFileIdOpenFile(bool isRedo, char* path, const RelFileNodeForkNum &filenode, uint32 flags) +{ + int save_errno = errno; + int fd = -1; + /* + * During bootstrap, there are cases where a system relation will be + * accessed (by internal backend processes) before the bootstrap + * script nominally creates it. Therefore, allow the file to exist + * already, even if isRedo is not set. (See also mdopen) + * + * During inplace upgrade, the physical catalog files may be present + * due to previous failure and rollback. Since the relfilenodes of these + * new catalogs can by no means be used by other relations, we simply + * truncate them. + */ + if (isRedo || IsBootstrapProcessingMode() || + (u_sess->attr.attr_common.IsInplaceUpgrade && filenode.rnode.node.relNode < FirstNormalObjectId)) { + ADIO_RUN() + { + flags = O_RDWR | PG_BINARY | O_DIRECT | (u_sess->attr.attr_common.IsInplaceUpgrade ? O_TRUNC : 0); + } + ADIO_ELSE() + { + flags = O_RDWR | PG_BINARY | (u_sess->attr.attr_common.IsInplaceUpgrade ? O_TRUNC : 0); + } + ADIO_END(); + + fd = DataFileIdOpenFile(path, filenode, flags, FILE_RW_PERMISSION); + } + + if (fd < 0) { + /* be sure to report the error reported by create, not open */ + errno = save_errno; + ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", path))); + } + return fd; +} + /* * mdcreate() -- Create a new relation on magnetic disk. 
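A condensed sketch of the fallback path RegisterCompressDirtySegment takes when the sync request queue is full: the mmap'ed address table is flushed before the chunk data file. Raw POSIX calls stand in for the vfd layer and sync_pcmap here.

#include <sys/mman.h>
#include <unistd.h>
#include <stddef.h>

static int FlushCompressedSegment(void *pcaMap, size_t pcaLen, int pcdFd)
{
    if (msync(pcaMap, pcaLen, MS_SYNC) != 0) {  /* address table (pca) first */
        return -1;
    }
    if (fsync(pcdFd) != 0) {                    /* then the chunk data (pcd) */
        return -1;
    }
    return 0;
}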
* @@ -232,48 +347,46 @@ void mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) } ADIO_END(); - fd = DataFileIdOpenFile(path, filenode, flags, 0600); + fd = DataFileIdOpenFile(path, filenode, flags, FILE_RW_PERMISSION); if (fd < 0) { - int save_errno = errno; - - /* - * During bootstrap, there are cases where a system relation will be - * accessed (by internal backend processes) before the bootstrap - * script nominally creates it. Therefore, allow the file to exist - * already, even if isRedo is not set. (See also mdopen) - * - * During inplace upgrade, the physical catalog files may be present - * due to previous failure and rollback. Since the relfilenodes of these - * new catalogs can by no means be used by other relations, we simply - * truncate them. - */ - if (isRedo || IsBootstrapProcessingMode() || - (u_sess->attr.attr_common.IsInplaceUpgrade && filenode.rnode.node.relNode < FirstNormalObjectId)) { - ADIO_RUN() - { - flags = O_RDWR | PG_BINARY | O_DIRECT | (u_sess->attr.attr_common.IsInplaceUpgrade ? O_TRUNC : 0); - } - ADIO_ELSE() - { - flags = O_RDWR | PG_BINARY | (u_sess->attr.attr_common.IsInplaceUpgrade ? O_TRUNC : 0); - } - ADIO_END(); - - fd = DataFileIdOpenFile(path, filenode, flags, 0600); - } - - if (fd < 0) { - /* be sure to report the error reported by create, not open */ - errno = save_errno; - ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", path))); - } + fd = RetryDataFileIdOpenFile(isRedo, path, filenode, flags); } + File fd_pca = -1; + File fd_pcd = -1; + if (unlikely(IS_COMPRESSED_MAINFORK(reln, forkNum))) { + // close main fork file + FileClose(fd); + fd = -1; + + /* open page compress address file */ + char pcfile_path[MAXPGPATH]; + errno_t rc = snprintf_s(pcfile_path, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + RelFileNodeForkNum pcaFienode = RelFileNodeForkNumFill(reln->smgr_rnode, PCA_FORKNUM, 0); + fd_pca = DataFileIdOpenFile(pcfile_path, pcaFienode, flags, FILE_RW_PERMISSION); + + if (fd_pca < 0) { + fd_pca = RetryDataFileIdOpenFile(isRedo, pcfile_path, pcaFienode, flags); + } + + rc = snprintf_s(pcfile_path, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + RelFileNodeForkNum pcdFileNode = RelFileNodeForkNumFill(reln->smgr_rnode, PCD_FORKNUM, 0); + fd_pcd = DataFileIdOpenFile(pcfile_path, pcdFileNode, flags, FILE_RW_PERMISSION); + if (fd_pcd < 0) { + fd_pcd = RetryDataFileIdOpenFile(isRedo, pcfile_path, pcdFileNode, flags); + } + SetupPageCompressMemoryMap(fd_pca, reln->smgr_rnode.node, filenode); + } + + pfree(path); reln->md_fd[forkNum] = _fdvec_alloc(); - + reln->md_fd[forkNum]->mdfd_vfd_pca = fd_pca; + reln->md_fd[forkNum] ->mdfd_vfd_pcd = fd_pcd; reln->md_fd[forkNum]->mdfd_vfd = fd; reln->md_fd[forkNum]->mdfd_segno = 0; reln->md_fd[forkNum]->mdfd_chain = NULL; @@ -351,13 +464,78 @@ void set_max_segno_delrel(int max_segno, RelFileNode rnode) } } +static int ResetPcMap(char *path, const RelFileNodeBackend& rnode) +{ + int ret; + char pcfile_path[MAXPGPATH]; + int rc = snprintf_s(pcfile_path, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + int fd_pca = BasicOpenFile(pcfile_path, O_RDWR | PG_BINARY, 0); + if (fd_pca >= 0) { + int save_errno; + int chunkSize = CHUNK_SIZE_LIST[GET_COMPRESS_CHUNK_SIZE(rnode.node.opt)]; + int mapRealSize = SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunkSize); + PageCompressHeader *map = pc_mmap_real_size(fd_pca, mapRealSize, false); + if (map == MAP_FAILED) { + ereport(WARNING, 
(errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("Failed to mmap page compression address file %s: %m", pcfile_path))); + } else { + pg_atomic_write_u32(&map->nblocks, 0); + pg_atomic_write_u32(&map->allocated_chunks, 0); + error_t rc = memset_s((char *)map + SIZE_OF_PAGE_COMPRESS_HEADER_DATA, + mapRealSize - SIZE_OF_PAGE_COMPRESS_HEADER_DATA, 0, + SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunkSize) - SIZE_OF_PAGE_COMPRESS_HEADER_DATA); + securec_check_c(rc, "\0", "\0"); + map->sync = false; + if (sync_pcmap(map, WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC) != 0) { + ereport(WARNING, (errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", pcfile_path))); + } + + if (pc_munmap(map) != 0) { + ereport(WARNING, (errcode_for_file_access(), errmsg("could not munmap file \"%s\": %m", pcfile_path))); + } + } + save_errno = errno; + (void)close(fd_pca); + errno = save_errno; + } else { + ret = -1; + } + if (ret < 0 && errno != ENOENT) { + ereport(WARNING, (errcode_for_file_access(), errmsg("could not truncate file \"%s\": %m", pcfile_path))); + } + return ret; +} + +static void UnlinkCompressedFile(const RelFileNode& node, ForkNumber forkNum, char* path) +{ + if (!IS_COMPRESSED_RNODE(node, forkNum)) { + return; + } + /* remove pca */ + char pcfile_path[MAXPGPATH]; + errno_t rc = snprintf_s(pcfile_path, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + int ret = unlink(pcfile_path); + if (ret < 0 && errno != ENOENT) { + ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", pcfile_path))); + } + /* remove pcd */ + rc = snprintf_s(pcfile_path, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + ret = unlink(pcfile_path); + if (ret < 0 && errno != ENOENT) { + ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", pcfile_path))); + } +} + static void mdunlinkfork(const RelFileNodeBackend& rnode, ForkNumber forkNum, bool isRedo) { char* path = NULL; int ret; path = relpath(rnode, forkNum); - + /* * Delete or truncate the first segment. */ @@ -374,6 +552,7 @@ static void mdunlinkfork(const RelFileNodeBackend& rnode, ForkNumber forkNum, bo if (ret < 0 && errno != ENOENT) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); } + UnlinkCompressedFile(rnode.node, forkNum, path); } else { /* truncate(2) would be easier here, but Windows hasn't got it */ int fd; @@ -393,6 +572,29 @@ static void mdunlinkfork(const RelFileNodeBackend& rnode, ForkNumber forkNum, bo ereport(WARNING, (errcode_for_file_access(), errmsg("could not truncate file \"%s\": %m", path))); } + if (IS_COMPRESSED_RNODE(rnode.node, forkNum)) { + // dont truncate pca! 
pca may be occupied by other threads by mmap + ret = ResetPcMap(path, rnode); + + // remove pcd + char dataPath[MAXPGPATH]; + int rc = snprintf_s(dataPath, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + int fd_pcd = BasicOpenFile(dataPath, O_RDWR | PG_BINARY, 0); + if (fd_pcd >= 0) { + int save_errno; + ret = ftruncate(fd_pcd, 0); + save_errno = errno; + (void)close(fd_pcd); + errno = save_errno; + } else { + ret = -1; + } + if (ret < 0 && errno != ENOENT) { + ereport(WARNING, (errcode_for_file_access(), errmsg("could not truncate file \"%s\": %m", dataPath))); + } + } + /* Register request to unlink first segment later */ register_unlink_segment(rnode, forkNum, 0); } @@ -427,6 +629,7 @@ static void mdunlinkfork(const RelFileNodeBackend& rnode, ForkNumber forkNum, bo } break; } + if (!RelFileNodeBackendIsTemp(rnode)) { md_register_forget_request(rnode.node, forkNum, segno); } @@ -443,8 +646,24 @@ static void mdunlinkfork(const RelFileNodeBackend& rnode, ForkNumber forkNum, bo rc = sprintf_s(segpath, strlen(path) + 12, "%s.%u", path, segno); securec_check_ss(rc, "", ""); if (unlink(segpath) < 0) { - ereport(WARNING, (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", segpath))); + ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", segpath))); + } + + if (IS_COMPRESSED_RNODE(rnode.node, forkNum)) { + char pcfile_segpath[MAXPGPATH]; + errno_t rc = snprintf_s(pcfile_segpath, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, segpath); + securec_check_ss(rc, "\0", "\0"); + if (unlink(pcfile_segpath) < 0) { + ereport(WARNING, + (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", pcfile_segpath))); + } + + rc = snprintf_s(pcfile_segpath, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, segpath); + securec_check_ss(rc, "\0", "\0"); + if (unlink(pcfile_segpath) < 0) { + ereport(WARNING, + (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", pcfile_segpath))); + } } } pfree(segpath); @@ -453,6 +672,156 @@ static void mdunlinkfork(const RelFileNodeBackend& rnode, ForkNumber forkNum, bo pfree(path); } +static inline void ExtendChunksOfBlock(PageCompressHeader* pcMap, PageCompressAddr* pcAddr, int needChunks, MdfdVec* v) +{ + if (pcAddr->allocated_chunks < needChunks) { + auto allocateNumber = needChunks - pcAddr->allocated_chunks; + int chunkno = (pc_chunk_number_t)pg_atomic_fetch_add_u32(&pcMap->allocated_chunks, allocateNumber) + 1; + for (int i = pcAddr->allocated_chunks; i < needChunks; ++i, ++chunkno) { + pcAddr->chunknos[i] = chunkno; + } + pcAddr->allocated_chunks = needChunks; + + if (pg_atomic_read_u32(&pcMap->allocated_chunks) - pg_atomic_read_u32(&pcMap->last_synced_allocated_chunks) > + COMPRESS_ADDRESS_FLUSH_CHUNKS) { + pcMap->sync = false; + if (sync_pcmap(pcMap, WAIT_EVENT_COMPRESS_ADDRESS_FILE_FLUSH) != 0) { + ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", + FilePathName(v->mdfd_vfd_pca)))); + } + } + } +} + +/* + * mdextend_pc() -- Add a block to the specified page compressed relation. 
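A standalone sketch of the allocation step inside ExtendChunksOfBlock: a single atomic fetch-add on the shared counter reserves a contiguous range of chunk numbers, so concurrent writers can never hand out the same chunk twice. std::atomic stands in for pg_atomic, and a vector stands in for the fixed-size chunknos array.

#include <atomic>
#include <cstdint>
#include <vector>

static std::atomic<uint32_t> g_allocatedChunks{0};  /* mirrors pcMap->allocated_chunks */

static void ReserveChunks(std::vector<uint32_t> &chunknos, uint32_t needChunks)
{
    if (chunknos.size() >= needChunks) {
        return;                                     /* block already owns enough chunks */
    }
    uint32_t add = needChunks - static_cast<uint32_t>(chunknos.size());
    uint32_t first = g_allocatedChunks.fetch_add(add) + 1;  /* chunk numbers are 1-based */
    for (uint32_t i = 0; i < add; i++) {
        chunknos.push_back(first + i);
    }
}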
+ * + */ +static void mdextend_pc(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char* buffer, bool skipFsync) +{ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + Assert(IS_COMPRESSED_MAINFORK(reln, forknum)); + + MdfdVec* v = _mdfd_getseg(reln, MAIN_FORKNUM, blocknum, skipFsync, EXTENSION_CREATE); + RelFileCompressOption option; + TransCompressOptions(reln->smgr_rnode.node, &option); + uint32 chunk_size = CHUNK_SIZE_LIST[option.compressChunkSize]; + uint8 algorithm = option.compressAlgorithm; + uint8 prealloc_chunk = option.compressPreallocChunks; + PageCompressHeader *pcMap = GetPageCompressMemoryMap(v->mdfd_vfd_pca, chunk_size); + Assert(blocknum % RELSEG_SIZE >= pg_atomic_read_u32(&pcMap->nblocks)); + + uint32 maxAllocChunkNum = (uint32)(BLCKSZ / chunk_size - 1); + PageCompressAddr* pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum); + + prealloc_chunk = (prealloc_chunk > maxAllocChunkNum) ? maxAllocChunkNum : prealloc_chunk; + + /* check allocated chunk number */ + if (pcAddr->allocated_chunks > BLCKSZ / chunk_size) { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid chunks %u of block %u in file \"%s\"", + pcAddr->allocated_chunks, blocknum, + FilePathName(v->mdfd_vfd_pca)))); + } + + for (int i = 0; i < pcAddr->allocated_chunks; ++i) { + if (pcAddr->chunknos[i] <= 0 || pcAddr->chunknos[i] > (BLCKSZ / chunk_size) * RELSEG_SIZE) { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid chunk number %u of block %u in file \"%s\"", + pcAddr->chunknos[i], blocknum, + FilePathName(v->mdfd_vfd_pca)))); + } + } + + /* compress page only for initialized page */ + char *work_buffer = NULL; + int nchunks = 0; + if (!PageIsNew(buffer)) { + int work_buffer_size = CompressPageBufferBound(buffer, algorithm); + if (work_buffer_size < 0) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("mdextend_pc unrecognized compression algorithm %d", algorithm))); + } + work_buffer = (char *) palloc(work_buffer_size); + int compressed_page_size = CompressPage(buffer, work_buffer, work_buffer_size, option); + if (compressed_page_size < 0) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("mdextend_pc unrecognized compression algorithm %d", algorithm))); + } + nchunks = (compressed_page_size - 1) / chunk_size + 1; + if (nchunks * chunk_size >= BLCKSZ) { + pfree(work_buffer); + work_buffer = (char *) buffer; + nchunks = BLCKSZ / chunk_size; + } else { + /* fill zero in the last chunk */ + int storageSize = chunk_size * nchunks; + if (compressed_page_size < storageSize) { + error_t rc = memset_s(work_buffer + compressed_page_size, work_buffer_size - compressed_page_size, 0, + storageSize - compressed_page_size); + securec_check_c(rc, "\0", "\0"); + } + } + } + + int need_chunks = prealloc_chunk > nchunks ? 
prealloc_chunk : nchunks; + ExtendChunksOfBlock(pcMap, pcAddr, need_chunks, v); + + /* write chunks of compressed page + * worker_buffer = NULL -> nchunks = 0 + */ + for (int i = 0; i < nchunks; i++) { + char* buffer_pos = work_buffer + chunk_size * i; + off_t seekpos = (off_t) OFFSET_OF_PAGE_COMPRESS_CHUNK(chunk_size, pcAddr->chunknos[i]); + // write continuous chunks + int range = 1; + while (i < nchunks - 1 && pcAddr->chunknos[i + 1] == pcAddr->chunknos[i] + 1) { + range++; + i++; + } + int write_amount = chunk_size * range; + int nbytes; + if ((nbytes = FileWrite(v->mdfd_vfd_pcd, buffer_pos, write_amount, seekpos)) != write_amount) { + if (nbytes < 0) { + ereport(ERROR, (errcode_for_file_access(), errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd_pcd)), errhint( + "Check free disk space."))); + } + /* short write: complain appropriately */ + ereport(ERROR, (errcode(ERRCODE_DISK_FULL), errmsg( + "could not extend file \"%s\": wrote only %d of %d bytes at block %u", FilePathName(v->mdfd_vfd_pcd), + nbytes, write_amount, blocknum), errhint("Check free disk space."))); + } + } + + /* finally update size of this page and global nblocks */ + if (pcAddr->nchunks != nchunks) { + pcAddr->nchunks = nchunks; + } + + /* write checksum */ + pcAddr->checksum = AddrChecksum32(blocknum, pcAddr); + + + if (pg_atomic_read_u32(&pcMap->nblocks) < blocknum % RELSEG_SIZE + 1) { + pg_atomic_write_u32(&pcMap->nblocks, blocknum % RELSEG_SIZE + 1); + } + + pcMap->sync = false; + if (work_buffer != NULL && work_buffer != buffer) { + pfree(work_buffer); + } + + if (!skipFsync && !SmgrIsTemp(reln)) { + register_dirty_segment(reln, forknum, v); + } + + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); +} + /* * mdextend() -- Add a block to the specified relation. * @@ -484,6 +853,10 @@ void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, errmsg("cannot extend file \"%s\" beyond %u blocks", relpath(reln->smgr_rnode, forknum), InvalidBlockNumber))); } + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + mdextend_pc(reln, forknum, blocknum, buffer, skipFsync); + return; + } v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); if (v == NULL) { @@ -526,6 +899,33 @@ void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber)RELSEG_SIZE)); } +static int MdOpenRetryOpenFile(char* path, const RelFileNodeForkNum &filenode, ExtensionBehavior behavior, uint32 flags) +{ + int fd = -1; + /* + * During bootstrap, there are cases where a system relation will be + * accessed (by internal backend processes) before the bootstrap + * script nominally creates it. Therefore, accept mdopen() as a + * substitute for mdcreate() in bootstrap mode only. (See mdcreate) + */ + if (IsBootstrapProcessingMode()) { + flags |= (O_CREAT | O_EXCL); + fd = DataFileIdOpenFile(path, filenode, (int)flags, FILE_RW_PERMISSION); + } + + if (fd < 0) { + if (behavior == EXTENSION_RETURN_NULL && FILE_POSSIBLY_DELETED(errno)) { + return -1; + } + if (check_unlink_rel_hashtbl(filenode.rnode.node)) { + ereport(DEBUG1, (errmsg("\"%s\": %m, this relation has been removed", path))); + return -1; + } + ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); + } + return fd; +} + /* * mdopen() -- Open the specified relation. 
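A sketch of the sizing rule mdextend_pc applies after compressing a page (mdwrite_pc below uses the same one): round the compressed size up to whole chunks, and fall back to storing the raw page when compression cannot save at least one chunk. BLCKSZ is hard-coded for illustration.

#include <stdbool.h>

#define SKETCH_BLCKSZ 8192

/* compressedSize must be >= 1; returns the chunk count to write and sets
 * *storeRaw when the original page should be written uncompressed. */
static int ChunksForPage(int compressedSize, int chunkSize, bool *storeRaw)
{
    int nchunks = (compressedSize - 1) / chunkSize + 1;  /* ceil(compressedSize / chunkSize) */
    if (nchunks * chunkSize >= SKETCH_BLCKSZ) {
        *storeRaw = true;                /* no saving: keep the raw page */
        return SKETCH_BLCKSZ / chunkSize;
    }
    *storeRaw = false;                   /* tail of the last chunk gets zero-filled */
    return nchunks;
}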
* @@ -540,8 +940,8 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior { MdfdVec *mdfd = NULL; char *path = NULL; - File fd; - RelFileNodeForkNum filenode; + File fd = -1; + RelFileNodeForkNum filenode = RelFileNodeForkNumFill(reln->smgr_rnode, forknum, 0); uint32 flags = O_RDWR | PG_BINARY; /* No work if already open */ @@ -551,38 +951,50 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior path = relpath(reln->smgr_rnode, forknum); - filenode = RelFileNodeForkNumFill(reln->smgr_rnode, forknum, 0); - - ADIO_RUN() - { - flags |= O_DIRECT; - } - ADIO_END(); - - fd = DataFileIdOpenFile(path, filenode, (int)flags, 0600); - if (fd < 0) { - /* - * During bootstrap, there are cases where a system relation will be - * accessed (by internal backend processes) before the bootstrap - * script nominally creates it. Therefore, accept mdopen() as a - * substitute for mdcreate() in bootstrap mode only. (See mdcreate) - */ - if (IsBootstrapProcessingMode()) { - flags |= (O_CREAT | O_EXCL); - fd = DataFileIdOpenFile(path, filenode, (int)flags, 0600); + File fd_pca = -1; + File fd_pcd = -1; + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + /* open page compression address file */ + char pcfile_path[MAXPGPATH]; + errno_t rc = snprintf_s(pcfile_path, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + RelFileNodeForkNum pcaRelFileNode = RelFileNodeForkNumFill(reln->smgr_rnode, PCA_FORKNUM, 0); + fd_pca = DataFileIdOpenFile(pcfile_path, pcaRelFileNode, flags, FILE_RW_PERMISSION); + if (fd_pca < 0) { + fd_pca = MdOpenRetryOpenFile(pcfile_path, pcaRelFileNode, behavior, flags); + if (fd_pca < 0) { + pfree(path); + return NULL; + } } + /* open page compression data file */ + rc = snprintf_s(pcfile_path, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, path); + securec_check_ss(rc, "\0", "\0"); + RelFileNodeForkNum pcdRelFileNode = RelFileNodeForkNumFill(reln->smgr_rnode, PCD_FORKNUM, 0); + fd_pcd = DataFileIdOpenFile(pcfile_path, pcdRelFileNode, flags, FILE_RW_PERMISSION); + if (fd_pcd < 0) { + fd_pcd = MdOpenRetryOpenFile(pcfile_path, pcaRelFileNode, behavior, flags); + if (fd_pca < 0) { + pfree(path); + return NULL; + } + } + SetupPageCompressMemoryMap(fd_pca, reln->smgr_rnode.node, filenode); + } else { + ADIO_RUN() + { + flags |= O_DIRECT; + } + ADIO_END(); + filenode = RelFileNodeForkNumFill(reln->smgr_rnode, forknum, 0); + fd = DataFileIdOpenFile(path, filenode, (int)flags, 0600); if (fd < 0) { - if (behavior == EXTENSION_RETURN_NULL && FILE_POSSIBLY_DELETED(errno)) { + fd = MdOpenRetryOpenFile(path, filenode, behavior, flags); + if (fd < 0) { pfree(path); return NULL; } - if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { - ereport(DEBUG1, (errmsg("\"%s\": %m, this relation has been removed", path))); - pfree(path); - return NULL; - } - ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); } } @@ -592,6 +1004,8 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; + mdfd->mdfd_vfd_pca = fd_pca; + mdfd->mdfd_vfd_pcd = fd_pcd; mdfd->mdfd_chain = NULL; Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber)RELSEG_SIZE)); @@ -616,8 +1030,17 @@ void mdclose(SMgrRelation reln, ForkNumber forknum, BlockNumber blockNum) MdfdVec *ov = v; /* if not closed already */ - if (v->mdfd_vfd >= 0) { - FileClose(v->mdfd_vfd); + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + if (v->mdfd_vfd_pca >= 0) { + FileClose(v->mdfd_vfd_pca); 
+ } + if (v->mdfd_vfd_pcd >= 0) { + FileClose(v->mdfd_vfd_pcd); + } + } else { + if (v->mdfd_vfd >= 0) { + FileClose(v->mdfd_vfd); + } } /* Now free vector */ @@ -640,11 +1063,46 @@ void mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return; } - seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + int chunk_size = PageCompressChunkSize(reln); + PageCompressHeader *pcMap = GetPageCompressMemoryMap(v->mdfd_vfd_pca, chunk_size); + PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum); + /* check chunk number */ + if (pcAddr->nchunks < 0 || pcAddr->nchunks > BLCKSZ / chunk_size) { + if (u_sess->attr.attr_security.zero_damaged_pages || t_thrd.xlog_cxt.InRecovery) { + return; + } else { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid chunks %u of block %u in file \"%s\"", pcAddr->nchunks, blocknum, + FilePathName(v->mdfd_vfd_pca)))); + } + } - Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE); + for (uint8 i = 0; i < pcAddr->nchunks; i++) { + if (pcAddr->chunknos[i] <= 0 || pcAddr->chunknos[i] > (uint32)(BLCKSZ / chunk_size) * RELSEG_SIZE) { + if (u_sess->attr.attr_security.zero_damaged_pages || t_thrd.xlog_cxt.InRecovery) { + return; + } else { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid chunk number %u of block %u in file \"%s\"", pcAddr->chunknos[i], + blocknum, FilePathName(v->mdfd_vfd_pca)))); + } + } + seekpos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunk_size, pcAddr->chunknos[i]); + int range = 1; + while (i < pcAddr->nchunks - 1 && pcAddr->chunknos[i + 1] == pcAddr->chunknos[i] + 1) { + range++; + i++; + } + (void)FilePrefetch(v->mdfd_vfd_pcd, seekpos, chunk_size * range, WAIT_EVENT_DATA_FILE_PREFETCH); + } + } else { + seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); - (void)FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH); + Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE); + + (void)FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH); + } #endif /* USE_PREFETCH */ } @@ -687,10 +1145,41 @@ void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, Bl Assert(nflush >= 1); Assert(nflush <= nblocks); - seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); - - FileWriteback(v->mdfd_vfd, seekpos, (off_t)BLCKSZ * nflush); - + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + uint32 chunk_size = PageCompressChunkSize(reln); + PageCompressHeader *pcMap = GetPageCompressMemoryMap(v->mdfd_vfd_pca, chunk_size); + pc_chunk_number_t seekpos_chunk; + pc_chunk_number_t last_chunk; + bool firstEnter = true; + for (BlockNumber iblock = 0; iblock < nflush; ++iblock) { + PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum + iblock); + // find continue chunk to write back + for (uint8 i = 0; i < pcAddr->nchunks; ++i) { + if (firstEnter) { + seekpos_chunk = pcAddr->chunknos[i]; + last_chunk = seekpos_chunk; + firstEnter = false; + } else if (pcAddr->chunknos[i] == last_chunk + 1) { + last_chunk++; + } else { + seekpos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunk_size, seekpos_chunk); + pc_chunk_number_t nchunks = last_chunk - seekpos_chunk + 1; + FileWriteback(v->mdfd_vfd_pcd, seekpos, (off_t)nchunks * chunk_size); + seekpos_chunk = pcAddr->chunknos[i]; + last_chunk = seekpos_chunk; + } + } + } + /* flush the rest chunks */ + if (!firstEnter) { + seekpos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunk_size, seekpos_chunk); + pc_chunk_number_t 
nchunks = last_chunk - seekpos_chunk + 1; + FileWriteback(v->mdfd_vfd_pcd, seekpos, (off_t)nchunks * chunk_size); + } + } else { + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush); + } nblocks -= nflush; blocknum += nflush; } @@ -979,6 +1468,158 @@ static void check_file_stat(char *file_name) (c) = (value); \ } while (0) +/* + * mdread_pc() -- Read the specified block from a page compressed relation. + */ +bool mdread_pc(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) +{ + Assert(IS_COMPRESSED_MAINFORK(reln, forknum)); + + MdfdVec *v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL); + + RelFileCompressOption option; + TransCompressOptions(reln->smgr_rnode.node, &option); + uint32 chunk_size = CHUNK_SIZE_LIST[option.compressChunkSize]; + uint8 algorithm = option.compressAlgorithm; + PageCompressHeader *pcMap = GetPageCompressMemoryMap(v->mdfd_vfd_pca, chunk_size); + PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum); + uint8 nchunks = pcAddr->nchunks; + if (nchunks == 0) { + MemSet(buffer, 0, BLCKSZ); + return true; + } + + if (nchunks > BLCKSZ / chunk_size) { + if (u_sess->attr.attr_security.zero_damaged_pages || t_thrd.xlog_cxt.InRecovery) { + MemSet(buffer, 0, BLCKSZ); + return true; + } else { +#ifndef ENABLE_MULTIPLE_NODES + if (RecoveryInProgress()) { + return false; + } +#endif + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid chunks %u of block %u in file \"%s\"", nchunks, + blocknum, FilePathName(v->mdfd_vfd_pca)))); + } + } + + for (auto i = 0; i < nchunks; ++i) { + if (pcAddr->chunknos[i] <= 0 || pcAddr->chunknos[i] > MAX_CHUNK_NUMBER(chunk_size)) { + if (u_sess->attr.attr_security.zero_damaged_pages || t_thrd.xlog_cxt.InRecovery) { + MemSet(buffer, 0, BLCKSZ); + return true; + } else { + check_file_stat(FilePathName(v->mdfd_vfd_pcd)); + force_backtrace_messages = true; +#ifndef ENABLE_MULTIPLE_NODES + if (RecoveryInProgress()) { + return false; + } +#endif + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid chunks %u of block %u in file \"%s\"", + nchunks, blocknum, + FilePathName(v->mdfd_vfd_pca)))); + } + } + } + + // read chunk data + char *buffer_pos = NULL; + uint8 start; + int read_amount; + char *compress_buffer = (char*)palloc(chunk_size * nchunks); + for (uint8 i = 0; i < nchunks; ++i) { + buffer_pos = compress_buffer + chunk_size * i; + off_t seekpos = (off_t) OFFSET_OF_PAGE_COMPRESS_CHUNK(chunk_size, pcAddr->chunknos[i]); + start = i; + while (i < nchunks - 1 && pcAddr->chunknos[i + 1] == pcAddr->chunknos[i] + 1) { + i++; + } + read_amount = chunk_size * (i - start + 1); + TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + int nbytes = FilePRead(v->mdfd_vfd_pcd, buffer_pos, read_amount, seekpos, WAIT_EVENT_DATA_FILE_READ); + TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, nbytes, BLCKSZ); + + if (nbytes != read_amount) { + if (nbytes < 0) { +#ifndef ENABLE_MULTIPLE_NODES + if (RecoveryInProgress()) { + return false; + } +#endif + ereport(ERROR, + (errcode_for_file_access(), errmsg("could not read block %u in file \"%s\": %m", blocknum, + FilePathName(v->mdfd_vfd_pcd)))); + } + /* + * Short read: we are at or past EOF, or we read a 
partial block at + * EOF. Normally this is an error; upper levels should never try to + * read a nonexistent block. However, if zero_damaged_pages is ON or + * we are InRecovery, we should instead return zeroes without + * complaining. This allows, for example, the case of trying to + * update a block that was later truncated away. + */ + if (u_sess->attr.attr_security.zero_damaged_pages || t_thrd.xlog_cxt.InRecovery) { + MemSet(buffer, 0, BLCKSZ); + return true; + } else { + check_file_stat(FilePathName(v->mdfd_vfd_pcd)); + force_backtrace_messages = true; + +#ifndef ENABLE_MULTIPLE_NODES + if (RecoveryInProgress()) { + return false; + } +#endif + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg( + "could not read block %u in file \"%s\": read only %d of %d bytes", blocknum, + FilePathName(v->mdfd_vfd_pcd), nbytes, read_amount))); + } + } + } + + /* decompress chunk data */ + int nbytes; + if (pcAddr->nchunks == BLCKSZ / chunk_size) { + error_t rc = memcpy_s(buffer, BLCKSZ, compress_buffer, BLCKSZ); + securec_check(rc, "", ""); + } else { + nbytes = DecompressPage(compress_buffer, buffer, algorithm); + if (nbytes != BLCKSZ) { + if (nbytes == -2) { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg( + "could not recognized compression algorithm %d for file \"%s\"", algorithm, + FilePathName(v->mdfd_vfd_pcd)))); + } + if (u_sess->attr.attr_security.zero_damaged_pages || t_thrd.xlog_cxt.InRecovery) { + pfree(compress_buffer); + MemSet(buffer, 0, BLCKSZ); + return true; + } else { + check_file_stat(FilePathName(v->mdfd_vfd_pcd)); + force_backtrace_messages = true; + +#ifndef ENABLE_MULTIPLE_NODES + if (RecoveryInProgress()) { + return false; + } +#endif + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg( + "could not decompress block %u in file \"%s\": decompress %d of %d bytes", blocknum, + FilePathName(v->mdfd_vfd_pcd), nbytes, BLCKSZ))); + } + } + } + pfree(compress_buffer); + return true; +} + /* * mdread() -- Read the specified block from a relation. */ @@ -1001,6 +1642,10 @@ SMGR_READ_STATUS mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber block static THR_LOCAL Oid lstDb = InvalidOid; static THR_LOCAL Oid lstSpc = InvalidOid; + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + return mdread_pc(reln, forknum, blocknum, buffer) ? SMGR_RD_OK : SMGR_RD_CRC_ERROR; + } + (void)INSTR_TIME_SET_CURRENT(startTime); TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, @@ -1096,6 +1741,136 @@ SMGR_READ_STATUS mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber block } } +/* + * mdwrite_pc() -- Write the supplied block at the appropriate location for page compressed relation. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +static void mdwrite_pc(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool skipFsync) +{ + /* This assert is too expensive to have on normally ... 
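The chunked I/O paths in this file (extend, prefetch, writeback, read, and the write path that follows) all merge consecutive chunk numbers into a single file operation. A standalone sketch of that run-merging loop, with a simplified offset rule standing in for OFFSET_OF_PAGE_COMPRESS_CHUNK:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void ForEachChunkRun(const uint32_t *chunknos, int nchunks, uint32_t chunkSize)
{
    for (int i = 0; i < nchunks; i++) {
        int start = i;
        while (i < nchunks - 1 && chunknos[i + 1] == chunknos[i] + 1) {
            i++;                               /* extend the contiguous run */
        }
        uint64_t offset = (uint64_t)(chunknos[start] - 1) * chunkSize;  /* hypothetical offset rule */
        uint32_t length = (uint32_t)(i - start + 1) * chunkSize;
        printf("one I/O at offset %llu, length %u\n", (unsigned long long)offset, length);
    }
}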
*/ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum < mdnblocks(reln, forknum)); +#endif + Assert(IS_COMPRESSED_MAINFORK(reln, forknum)); + bool mmapSync = false; + MdfdVec *v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL); + + + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not write block %u in file \"%s\": %m, this relation has been removed", + blocknum, FilePathName(v->mdfd_vfd)))); + /* this file need skip sync */ + return; + } + + RelFileCompressOption option; + TransCompressOptions(reln->smgr_rnode.node, &option); + uint32 chunk_size = CHUNK_SIZE_LIST[option.compressChunkSize]; + uint8 algorithm = option.compressAlgorithm; + int8 level = option.compressLevelSymbol ? option.compressLevel : -option.compressLevel; + uint8 prealloc_chunk = option.compressPreallocChunks; + + PageCompressHeader *pcMap = GetPageCompressMemoryMap(v->mdfd_vfd_pca, chunk_size); + PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum); + Assert(blocknum % RELSEG_SIZE < pg_atomic_read_u32(&pcMap->nblocks)); + auto maxChunkSize = BLCKSZ / chunk_size - 1; + if (prealloc_chunk > maxChunkSize) { + prealloc_chunk = maxChunkSize; + } + + allocate_chunk_check(pcAddr, chunk_size, blocknum, v); + + /* compress page */ + auto work_buffer_size = CompressPageBufferBound(buffer, algorithm); + if (work_buffer_size < 0) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg( + "mdwrite_pc unrecognized compression algorithm %d,chunk_size:%ud,level:%d,prealloc_chunk:%ud", algorithm, + chunk_size, level, prealloc_chunk))); + } + char *work_buffer = (char *) palloc(work_buffer_size); + auto compress_buffer_size = CompressPage(buffer, work_buffer, work_buffer_size, option); + if (compress_buffer_size < 0) { + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg( + "mdwrite_pc unrecognized compression algorithm %d,chunk_size:%ud,level:%d,prealloc_chunk:%ud", algorithm, + chunk_size, level, prealloc_chunk))); + } + + uint8 nchunks = (compress_buffer_size - 1) / chunk_size + 1; + auto bufferSize = chunk_size * nchunks; + if (bufferSize >= BLCKSZ) { + /* store original page if can not save space? */ + pfree(work_buffer); + work_buffer = (char *) buffer; + nchunks = BLCKSZ / chunk_size; + } else { + /* fill zero in the last chunk */ + if ((uint32) compress_buffer_size < bufferSize) { + auto leftSize = bufferSize - compress_buffer_size; + errno_t rc = memset_s(work_buffer + compress_buffer_size, leftSize, 0, leftSize); + securec_check(rc, "", ""); + } + } + + uint8 need_chunks = prealloc_chunk > nchunks ? 
prealloc_chunk : nchunks; + ExtendChunksOfBlock(pcMap, pcAddr, need_chunks, v); + + // write chunks of compressed page + for (auto i = 0; i < nchunks; ++i) { + auto buffer_pos = work_buffer + chunk_size * i; + off_t seekpos = (off_t) OFFSET_OF_PAGE_COMPRESS_CHUNK(chunk_size, pcAddr->chunknos[i]); + auto start = i; + while (i < nchunks - 1 && pcAddr->chunknos[i + 1] == pcAddr->chunknos[i] + 1) { + i++; + } + int write_amount = chunk_size * (i - start + 1); + + TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend); + int nbytes = FilePWrite(v->mdfd_vfd_pcd, buffer_pos, write_amount, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, nbytes, BLCKSZ); + + if (nbytes != write_amount) { + if (nbytes < 0) { + ereport(ERROR, + (errcode_for_file_access(), errmsg("could not write block %u in file \"%s\": %m", blocknum, + FilePathName(v->mdfd_vfd_pcd)))); + } + /* short write: complain appropriately */ + ereport(ERROR, (errcode(ERRCODE_DISK_FULL), errmsg( + "could not write block %u in file \"%s\": wrote only %d of %d bytes", blocknum, + FilePathName(v->mdfd_vfd_pcd), nbytes, BLCKSZ), errhint("Check free disk space."))); + } + } + + /* finally update size of this page and global nblocks */ + if (pcAddr->nchunks != nchunks) { + mmapSync = true; + pcAddr->nchunks = nchunks; + } + + /* write checksum */ + if (mmapSync) { + pcMap->sync = false; + pcAddr->checksum = AddrChecksum32(blocknum, pcAddr); + } + + if (work_buffer != buffer) { + pfree(work_buffer); + } + + + if (!skipFsync && !SmgrIsTemp(reln)) { + register_dirty_segment(reln, forknum, v); + } +} + + /* * mdwrite() -- Write the supplied block at the appropriate location. * @@ -1109,6 +1884,13 @@ void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const int nbytes; MdfdVec *v = NULL; + /* This assert is too expensive to have on normally ... */ +#ifdef CHECK_WRITE_VS_EXTEND + if (!check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + Assert(blocknum < mdnblocks(reln, forknum)); + } +#endif + instr_time start_time; instr_time end_time; PgStat_Counter time_diff = 0; @@ -1124,13 +1906,6 @@ void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const (void)INSTR_TIME_SET_CURRENT(start_time); - /* This assert is too expensive to have on normally ... 
*/ -#ifdef CHECK_WRITE_VS_EXTEND - if (!check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { - Assert(blocknum < mdnblocks(reln, forknum)); - } -#endif - TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, reln->smgr_rnode.backend); @@ -1139,15 +1914,20 @@ void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const return; } - seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); + bool compressed = IS_COMPRESSED_MAINFORK(reln, forknum); + if (compressed) { + mdwrite_pc(reln, forknum, blocknum, buffer, skipFsync); + } else { + seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); - Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE); - nbytes = FilePWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); - - TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, reln->smgr_rnode.backend, nbytes, BLCKSZ); + nbytes = FilePWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, + reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, + reln->smgr_rnode.backend, nbytes, BLCKSZ); + } (void)INSTR_TIME_SET_CURRENT(end_time); INSTR_TIME_SUBTRACT(end_time, start_time); time_diff = (PgStat_Counter)INSTR_TIME_GET_MICROSEC(end_time); @@ -1194,7 +1974,9 @@ void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const if (max_time < time_diff) { max_time = time_diff; } - + if (compressed) { + return; + } if (nbytes != BLCKSZ) { if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { ereport(DEBUG1, (errmsg("could not write block %u in file \"%s\": %m, this relation has been removed", @@ -1300,6 +2082,9 @@ void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) MdfdVec *v = NULL; BlockNumber curnblk; BlockNumber prior_blocks; + int chunk_size, i; + PageCompressHeader *pcMap = NULL; + PageCompressAddr *pcAddr = NULL; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -1334,16 +2119,42 @@ void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * from the mdfd_chain). We truncate the file, but do not delete * it, for reasons explained in the header comments. 
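 * For a page compressed main fork each segment is backed by a pair of
 * files (an address map with the _pca suffix and a chunk data file with
 * the _pcd suffix), so the branch below resets the mmap'ed address map
 * instead: nblocks, allocated_chunks and every per-block address entry
 * are zeroed, the map is msynced via sync_pcmap(), and the _pcd data
 * file is truncated to zero length.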
*/ - if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) { - if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { - ereport(DEBUG1, (errmsg("could not truncate file \"%s\": %m, this relation has been removed", - FilePathName(v->mdfd_vfd)))); - FileClose(ov->mdfd_vfd); - pfree(ov); - break; + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + chunk_size = PageCompressChunkSize(reln); + pcMap = GetPageCompressMemoryMap(v->mdfd_vfd_pca, chunk_size); + pg_atomic_write_u32(&pcMap->nblocks, 0); + pg_atomic_write_u32(&pcMap->allocated_chunks, 0); + MemSet((char *)pcMap + SIZE_OF_PAGE_COMPRESS_HEADER_DATA, 0, + SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunk_size) - SIZE_OF_PAGE_COMPRESS_HEADER_DATA); + pcMap->sync = false; + if (sync_pcmap(pcMap, WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC) != 0) { + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not msync file \"%s\": %m", FilePathName(v->mdfd_vfd_pca)))); + } + if (FileTruncate(v->mdfd_vfd_pcd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not truncate file \"%s\": %m, this relation has been removed", + FilePathName(v->mdfd_vfd_pcd)))); + FileClose(ov->mdfd_vfd_pcd); + pfree(ov); + break; + } + ereport(ERROR, (errcode_for_file_access(), + errmsg("could not truncate file \"%s\": %m", FilePathName(v->mdfd_vfd_pcd)))); + } + } else { + if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not truncate file \"%s\": %m, this relation has been removed", + FilePathName(v->mdfd_vfd)))); + FileClose(ov->mdfd_vfd); + pfree(ov); + break; + } + ereport(ERROR, (errcode_for_file_access(), + errmsg("could not truncate file \"%s\": %m", FilePathName(v->mdfd_vfd)))); } - ereport(ERROR, (errcode_for_file_access(), - errmsg("could not truncate file \"%s\": %m", FilePathName(v->mdfd_vfd)))); } if (!SmgrIsTemp(reln)) { @@ -1352,7 +2163,12 @@ void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) v = v->mdfd_chain; Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st segment */ - FileClose(ov->mdfd_vfd); + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + FileClose(ov->mdfd_vfd_pca); + FileClose(ov->mdfd_vfd_pcd); + } else { + FileClose(ov->mdfd_vfd); + } pfree(ov); } else if (prior_blocks + ((BlockNumber)RELSEG_SIZE) > nblocks) { /* @@ -1364,15 +2180,70 @@ void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * given in the header comments. 
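 * In the compressed case the per-block nchunks counters in the _pca map
 * are zeroed and nblocks is reset to last_seg_blocks; the _pcd chunk
 * file is then truncated to just past the highest chunk number still
 * referenced by the remaining blocks' allocated chunks. For example,
 * with chunk_size = 4096 and max_used_chunkno = 10 the data file is cut
 * back to 10 * 4096 = 40960 bytes, since OFFSET_OF_PAGE_COMPRESS_CHUNK
 * places chunk n at offset (n - 1) * chunk_size and chunk 10 therefore
 * ends exactly at byte 40960.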
*/ BlockNumber last_seg_blocks = nblocks - prior_blocks; + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + pc_chunk_number_t max_used_chunkno = (pc_chunk_number_t) 0; + uint32 allocated_chunks; + chunk_size = PageCompressChunkSize(reln); + pcMap = GetPageCompressMemoryMap(v->mdfd_vfd_pca, chunk_size); - if (FileTruncate(v->mdfd_vfd, (off_t)last_seg_blocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) { - if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { - ereport(DEBUG1, (errmsg("could not truncate file \"%s\": %m, this relation has been removed", - FilePathName(v->mdfd_vfd)))); - break; + for (BlockNumber blk = 0; blk < RELSEG_SIZE; ++blk) { + pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blk); + pcAddr->nchunks = 0; + pcAddr->checksum = AddrChecksum32(blk, pcAddr); + } + pg_atomic_write_u32(&pcMap->nblocks, last_seg_blocks); + pcMap->sync = false; + if (sync_pcmap(pcMap, WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC) != 0) { + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", + FilePathName(v->mdfd_vfd_pca)))); + } + allocated_chunks = pg_atomic_read_u32(&pcMap->allocated_chunks); + /* find the max used chunkno */ + for (BlockNumber blk = (BlockNumber) 0; blk < (BlockNumber) last_seg_blocks; blk++) { + pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blk); + + /* check allocated_chunks for one page */ + if (pcAddr->allocated_chunks > BLCKSZ / chunk_size) { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid chunks %u of block %u in file \"%s\"", + pcAddr->allocated_chunks, blk, + FilePathName(v->mdfd_vfd_pca)))); + } + + /* check chunknos for one page */ + for (i = 0; i < pcAddr->allocated_chunks; i++) { + if (pcAddr->chunknos[i] == 0 || pcAddr->chunknos[i] > allocated_chunks) { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg( + "invalid chunk number %u of block %u in file \"%s\"", pcAddr->chunknos[i], blk, + FilePathName(v->mdfd_vfd_pca)))); + } + + if (pcAddr->chunknos[i] > max_used_chunkno) { + max_used_chunkno = pcAddr->chunknos[i]; + } + } + } + off_t compressedOffset = (off_t)max_used_chunkno * chunk_size; + if (FileTruncate(v->mdfd_vfd_pcd, compressedOffset, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not truncate file \"%s\": %m, this relation has been removed", + FilePathName(v->mdfd_vfd)))); + break; + } + ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\" to %u blocks: %m", + FilePathName(v->mdfd_vfd), nblocks))); + } + } else { + if (FileTruncate(v->mdfd_vfd, (off_t)last_seg_blocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not truncate file \"%s\": %m, this relation has been removed", + FilePathName(v->mdfd_vfd)))); + break; + } + ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\" to %u blocks: %m", + FilePathName(v->mdfd_vfd), nblocks))); } - ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\" to %u blocks: %m", - FilePathName(v->mdfd_vfd), nblocks))); } if (!SmgrIsTemp(reln)) { @@ -1391,6 +2262,29 @@ void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) } } +static bool CompressMdImmediateSync(SMgrRelation reln, ForkNumber forknum, MdfdVec* v) +{ + PageCompressHeader* pcMap = GetPageCompressMemoryMap(v->mdfd_vfd_pca, PageCompressChunkSize(reln)); + if (sync_pcmap(pcMap, 
WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC) != 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not fsync file \"%s\": %m, this relation has been removed", + FilePathName(v->mdfd_vfd_pca)))); + return false; + } + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", FilePathName(v->mdfd_vfd_pca)))); + } + if (FileSync(v->mdfd_vfd_pcd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not fsync file \"%s\": %m, this relation has been removed", + FilePathName(v->mdfd_vfd_pcd)))); + return false; + } + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(v->mdfd_vfd_pcd)))); + } + return true; +} /* * mdimmedsync() -- Immediately sync a relation to stable storage. * @@ -1410,15 +2304,20 @@ void mdimmedsync(SMgrRelation reln, ForkNumber forknum) v = mdopen(reln, forknum, EXTENSION_FAIL); while (v != NULL) { - if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) { - if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { - ereport(DEBUG1, - (errmsg("could not fsync file \"%s\": %m, this relation has been removed", - FilePathName(v->mdfd_vfd)))); + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + if (!CompressMdImmediateSync(reln, forknum, v)) { break; } - ereport(data_sync_elevel(ERROR), - (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(v->mdfd_vfd)))); + } else { + if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) { + if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { + ereport(DEBUG1, (errmsg("could not fsync file \"%s\": %m, this relation has been removed", + FilePathName(v->mdfd_vfd)))); + break; + } + ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", + FilePathName(v->mdfd_vfd)))); + } } v = v->mdfd_chain; } @@ -1524,19 +2423,37 @@ static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber ADIO_END(); /* open the file */ - fd = DataFileIdOpenFile(fullpath, filenode, O_RDWR | PG_BINARY | oflags, 0600); - - pfree(fullpath); - + fd = DataFileIdOpenFile(fullpath, filenode, O_RDWR | PG_BINARY | oflags, FILE_RW_PERMISSION); if (fd < 0) { return NULL; } + int fd_pca = -1; + int fd_pcd = -1; + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + FileClose(fd); + fd = -1; + fd_pca = OpenPcaFile(fullpath, reln->smgr_rnode, MAIN_FORKNUM, segno, oflags); + if (fd_pca < 0) { + pfree(fullpath); + return NULL; + } + fd_pcd = OpenPcdFile(fullpath, reln->smgr_rnode, MAIN_FORKNUM, segno, oflags); + if (fd_pcd < 0) { + pfree(fullpath); + return NULL; + } + SetupPageCompressMemoryMap(fd_pca, reln->smgr_rnode.node, filenode); + } + pfree(fullpath); + /* allocate an mdfdvec entry for it */ v = _fdvec_alloc(); /* fill the entry */ v->mdfd_vfd = fd; + v->mdfd_vfd_pca = fd_pca; + v->mdfd_vfd_pcd = fd_pcd; v->mdfd_segno = segno; v->mdfd_chain = NULL; Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber)RELSEG_SIZE)); @@ -1650,7 +2567,10 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, const MdfdVec *seg) { off_t len; - + if (IS_COMPRESSED_MAINFORK(reln, forknum)) { + PageCompressHeader *pcMap = GetPageCompressMemoryMap(seg->mdfd_vfd_pca, PageCompressChunkSize(reln)); + return (BlockNumber) pg_atomic_read_u32(&pcMap->nblocks); + } len = 
FileSeek(seg->mdfd_vfd, 0L, SEEK_END); if (len < 0) { if (check_unlink_rel_hashtbl(reln->smgr_rnode.node)) { @@ -1666,6 +2586,13 @@ static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, const MdfdV return (BlockNumber)(len / BLCKSZ); } +int OpenForkFile(const char *path, const RelFileNodeBackend& rnode, const ForkNumber& forkNumber, const uint32& segNo) +{ + RelFileNodeForkNum fileNode = RelFileNodeForkNumFill(rnode, forkNumber, segNo); + uint32 flags = O_RDWR | PG_BINARY; + return DataFileIdOpenFile((char*)path, fileNode, (int)flags, S_IRUSR | S_IWUSR); +} + /* * Sync a file to disk, given a file tag. Write the path into an output * buffer so the caller can use it in error messages. @@ -1677,10 +2604,11 @@ int SyncMdFile(const FileTag *ftag, char *path) SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, GetColumnNum(ftag->forknum)); MdfdVec *v; char *p; - File file; + File file = -1; + File pcaFd = -1; + File pcdFd = -1; int result; int savedErrno; - bool needClose = false; /* Provide the path for informational messages. */ p = _mdfd_segpath(reln, ftag->forknum, ftag->segno); @@ -1690,26 +2618,56 @@ int SyncMdFile(const FileTag *ftag, char *path) /* Try to open the requested segment. */ v = _mdfd_getseg(reln, ftag->forknum, ftag->segno * (BlockNumber) RELSEG_SIZE, false, EXTENSION_RETURN_NULL); - if (v == NULL) { - RelFileNodeForkNum filenode = RelFileNodeForkNumFill(reln->smgr_rnode, - ftag->forknum, ftag->segno); - uint32 flags = O_RDWR | PG_BINARY; - file = DataFileIdOpenFile(path, filenode, (int)flags, S_IRUSR | S_IWUSR); - if (file < 0) { - return -1; - } - needClose = true; - } else { - file = v->mdfd_vfd; - } - /* Try to fsync the file. */ - result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC); + if (IS_COMPRESSED_RNODE(ftag->rnode, ftag->forknum)) { + if (v == NULL) { + pcaFd = OpenPcaFile(path, reln->smgr_rnode, ftag->forknum, ftag->segno); + if (pcaFd < 0) { + return -1; + } + pcdFd = OpenPcdFile(path, reln->smgr_rnode, ftag->forknum, ftag->segno); + if (pcdFd < 0) { + savedErrno = errno; + FileClose(pcaFd); + errno = savedErrno; + return -1; + } + } else { + pcaFd = v->mdfd_vfd_pca; + pcdFd = v->mdfd_vfd_pcd; + } + + PageCompressHeader *map = GetPageCompressMemoryMap(pcaFd, PageCompressChunkSize(reln)); + result = sync_pcmap(map, WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC); + if (result == 0) { + result = FileSync(pcdFd, WAIT_EVENT_DATA_FILE_SYNC); + } else { + ereport(data_sync_elevel(WARNING), + (errcode_for_file_access(), errmsg("could not fsync pcmap \"%s\": %m", path))); + } + } else { + if (v == NULL) { + file = OpenForkFile(path, reln->smgr_rnode, ftag->forknum, ftag->segno); + if (file < 0) { + return -1; + } + } else { + file = v->mdfd_vfd; + } + /* Try to fsync the file. */ + result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC); + } savedErrno = errno; - if (needClose) { - FileClose(file); + if (v == NULL) { + if (IS_COMPRESSED_RNODE(ftag->rnode, ftag->forknum)) { + FileClose(pcaFd); + FileClose(pcdFd); + } else { + FileClose(file); + } } errno = savedErrno; + return result; } @@ -1729,6 +2687,7 @@ int UnlinkMdFile(const FileTag *ftag, char *path) pfree(p); /* Try to unlink the file. 
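 * For a compressed main fork, UnlinkCompressedFile() below is expected
 * to drop the companion _pca and _pcd files before the base path itself
 * is unlinked.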
*/ + UnlinkCompressedFile(ftag->rnode, MAIN_FORKNUM, path); return unlink(path); } @@ -1747,3 +2706,28 @@ bool MatchMdFileTag(const FileTag *ftag, const FileTag *candidate) */ return ftag->rnode.dbNode == candidate->rnode.dbNode; } + +static int sync_pcmap(PageCompressHeader *pcMap, uint32 wait_event_info) +{ + if (pg_atomic_read_u32(&pcMap->sync) == true) { + return 0; + } + int returnCode; + uint32 nblocks, allocated_chunks, last_synced_nblocks, last_synced_allocated_chunks; + nblocks = pg_atomic_read_u32(&pcMap->nblocks); + allocated_chunks = pg_atomic_read_u32(&pcMap->allocated_chunks); + last_synced_nblocks = pg_atomic_read_u32(&pcMap->last_synced_nblocks); + last_synced_allocated_chunks = pg_atomic_read_u32(&pcMap->last_synced_allocated_chunks); + returnCode = pc_msync(pcMap); + if (returnCode == 0) { + if (last_synced_nblocks != nblocks) { + pg_atomic_write_u32(&pcMap->last_synced_nblocks, nblocks); + } + + if (last_synced_allocated_chunks != allocated_chunks) { + pg_atomic_write_u32(&pcMap->last_synced_allocated_chunks, allocated_chunks); + } + } + pcMap->sync = true; + return returnCode; +} diff --git a/src/gausskernel/storage/smgr/mmap_shared.cpp b/src/gausskernel/storage/smgr/mmap_shared.cpp new file mode 100644 index 000000000..4011e7954 --- /dev/null +++ b/src/gausskernel/storage/smgr/mmap_shared.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * --------------------------------------------------------------------------------------- + * + * + * + * IDENTIFICATION + * src/gausskernel/storage/smgr/mmap_shared.cpp + * + * --------------------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "miscadmin.h" +#include "catalog/pg_type.h" +#include "utils/datum.h" +#include "utils/relcache.h" + +#include "utils/memutils.h" +#include "utils/memprot.h" + +#include "storage/page_compression.h" +#include "executor/executor.h" +#include "storage/vfd.h" + +struct MmapEntry { + RelFileNodeForkNum relFileNodeForkNum; + /* + * the following are setting sin runtime + */ + size_t reference = 0; + PageCompressHeader *pcmap = NULL; +}; + +constexpr size_t LOCK_ARRAY_SIZE = 1024; +static pthread_mutex_t mmapLockArray[LOCK_ARRAY_SIZE]; + +static inline uint32 MmapTableHashCode(const RelFileNodeForkNum &relFileNodeForkNum) +{ + return tag_hash((void *)&relFileNodeForkNum, sizeof(RelFileNodeForkNum)); +} + +static inline pthread_mutex_t *MmapPartitionLock(size_t hashCode) +{ + return &mmapLockArray[hashCode % LOCK_ARRAY_SIZE]; +} + +static inline PageCompressHeader *MmapSharedMapFile(Vfd *vfdP, int chunkSize, bool readonly) +{ + size_t pcMapSize = SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunkSize); + PageCompressHeader *map = pc_mmap_real_size(vfdP->fd, pcMapSize, false); + if (map == MAP_FAILED) { + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("Failed to mmap page compression address file %s: %m", vfdP->fileName))); + } + return map; +} + +void RealInitialMMapLockArray() +{ + for (size_t i = 0; i < LOCK_ARRAY_SIZE; ++i) { + pthread_mutex_init(&mmapLockArray[i], NULL); + } + + HASHCTL ctl; + /* hash accessed by database file id */ + errno_t rc = memset_s(&ctl, sizeof(ctl), 0, sizeof(ctl)); + securec_check(rc, "", ""); + + ctl.keysize = sizeof(RelFileNodeForkNum); + ctl.entrysize = sizeof(MmapEntry); + ctl.hash = tag_hash; + ctl.num_partitions = LOCK_ARRAY_SIZE; + const size_t initLen = 256; + g_instance.mmapCache = HeapMemInitHash( + "mmap hash", initLen, + (Max(g_instance.attr.attr_common.max_files_per_process, t_thrd.storage_cxt.max_userdatafiles)) / 2, &ctl, + HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); +} + +PageCompressHeader *GetPageCompressHeader(void *vfd, int chunkSize, const RelFileNodeForkNum &relFileNodeForkNum) +{ + if (IsInitdb && g_instance.mmapCache == NULL) { + RealInitialMMapLockArray(); + } + Vfd *currentVfd = (Vfd *)vfd; + uint32 hashCode = MmapTableHashCode(relFileNodeForkNum); + AutoMutexLock mmapLock(MmapPartitionLock(hashCode)); + + mmapLock.lock(); + bool find = false; + MmapEntry *mmapEntry = (MmapEntry *)hash_search_with_hash_value(g_instance.mmapCache, (void *)&relFileNodeForkNum, + hashCode, HASH_ENTER, &find); + if (!find) { + mmapEntry->pcmap = NULL; + mmapEntry->reference = 0; + } + if (mmapEntry->pcmap == NULL) { + mmapEntry->pcmap = MmapSharedMapFile(currentVfd, chunkSize, false); + } + ++mmapEntry->reference; + mmapLock.unLock(); + return mmapEntry->pcmap; +} + +void UnReferenceAddrFile(void *vfd) +{ + Vfd *currentVfd = (Vfd *)vfd; + RelFileNodeForkNum relFileNodeForkNum = currentVfd->fileNode; + uint32 hashCode = MmapTableHashCode(relFileNodeForkNum); + AutoMutexLock mmapLock(MmapPartitionLock(hashCode)); + mmapLock.lock(); + + MmapEntry *mmapEntry = (MmapEntry *)hash_search_with_hash_value(g_instance.mmapCache, (void *)&relFileNodeForkNum, + hashCode, HASH_FIND, NULL); + if (mmapEntry == NULL) { + ereport(ERROR, (errcode_for_file_access(), + 
errmsg("UnReferenceAddrFile failed! mmap not found, filePath: %s", currentVfd->fileName))); + } + --mmapEntry->reference; + if (mmapEntry->reference == 0) { + if (pc_munmap(mmapEntry->pcmap) != 0) { + ereport(ERROR, + (errcode_for_file_access(), errmsg("could not munmap file \"%s\": %m", currentVfd->fileName))); + } + if (hash_search_with_hash_value(g_instance.mmapCache, (void *)&relFileNodeForkNum, hashCode, HASH_REMOVE, + NULL) == NULL) { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("UnReferenceAddrFile failed! remove hash key failed, filePath: %s", currentVfd->fileName))); + } + } else if (mmapEntry->reference < 0) { + ereport(FATAL, (errcode_for_file_access(), errmsg("could not munmap file \"%s\": %m", currentVfd->fileName))); + } + mmapLock.unLock(); +} \ No newline at end of file diff --git a/src/gausskernel/storage/smgr/page_compression.cpp b/src/gausskernel/storage/smgr/page_compression.cpp new file mode 100644 index 000000000..f5ea9d654 --- /dev/null +++ b/src/gausskernel/storage/smgr/page_compression.cpp @@ -0,0 +1,472 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved. + * Copyright (c) 2020, PostgreSQL Global Development Group + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * ------------------------------------------------------------------------- + * + * page_compression.cpp + * Routines for page compression + * + * There are two implementations at the moment: zstd, and the Postgres + * pg_lzcompress(). zstd support requires that the server was compiled + * with --with-zstd. 
+ * IDENTIFICATION + * ./src/gausskernel/storage/smgr/page_compression.cpp + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "miscadmin.h" +#include "catalog/pg_type.h" +#include "utils/datum.h" +#include "utils/relcache.h" + +#include "utils/timestamp.h" +#include "storage/checksum.h" +#include "storage/page_compression.h" +#include "storage/page_compression_impl.h" + +static void CheckHeaderOfCompressAddr(PageCompressHeader* pcMap, uint16 chunk_size, uint8 algorithm, const char* path) +{ + if (pcMap->chunk_size != chunk_size || pcMap->algorithm != algorithm) { + if (u_sess->attr.attr_security.zero_damaged_pages) { + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid chunk_size %u or algorithm %u in head of compress relation address file \"%s\", " + "and reinitialized it.", + pcMap->chunk_size, + pcMap->algorithm, + path))); + + pcMap->algorithm = algorithm; + pg_atomic_write_u32(&pcMap->nblocks, RELSEG_SIZE); + pg_atomic_write_u32(&pcMap->allocated_chunks, 0); + pg_atomic_write_u32(&pcMap->last_synced_allocated_chunks, 0); + pcMap->chunk_size = chunk_size; + } else { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid chunk_size %u or algorithm %u in head of compress relation address file \"%s\"", + pcMap->chunk_size, + pcMap->algorithm, + path))); + } + } +} + +void CheckAndRepairCompressAddress(PageCompressHeader *pcMap, uint16 chunk_size, uint8 algorithm, const char *path) +{ + TimestampTz lastRecoveryTime = pcMap->last_recovery_start_time; + TimestampTz pgStartTime = t_thrd.time_cxt.pg_start_time; + error_t rc; + /* if the relation had been checked in this startup, skip */ + if (lastRecoveryTime == pgStartTime) { + return; + } + + /* check head of compress address file */ + CheckHeaderOfCompressAddr(pcMap, chunk_size, algorithm, path); + + uint32 nblocks = pg_atomic_read_u32(&pcMap->nblocks); + uint32 allocated_chunks = pg_atomic_read_u32(&pcMap->allocated_chunks); + BlockNumber *global_chunknos = (BlockNumber *)palloc0(MAX_CHUNK_NUMBER(chunk_size) * sizeof(BlockNumber)); + + BlockNumber max_blocknum = (BlockNumber)-1; + BlockNumber max_nonzero_blocknum = (BlockNumber)-1; + BlockNumber max_allocated_chunkno = (pc_chunk_number_t)0; + + /* check compress address of every pages */ + for (BlockNumber blocknum = 0; blocknum < (BlockNumber)RELSEG_SIZE; ++blocknum) { + PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum); + if (pcAddr->checksum != AddrChecksum32(blocknum, pcAddr)) { + ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid checkum %u of block %u in file \"%s\"", + pcAddr->checksum, blocknum, path))); + pcAddr->allocated_chunks = pcAddr->nchunks = 0; + for (int i = 0; i < BLCKSZ / chunk_size; ++i) { + pcAddr->chunknos[i] = 0; + } + pcAddr->checksum = 0; + } + /* + * skip when found first zero filled block after nblocks + * if(blocknum >= (BlockNumber)nblocks && pcAddr->allocated_chunks == 0) + * break; + */ + + /* check allocated_chunks for one page */ + if (pcAddr->allocated_chunks > BLCKSZ / chunk_size) { + if (u_sess->attr.attr_security.zero_damaged_pages) { + rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0, + SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size)); + securec_check_c(rc, "\0", "\0"); + ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid allocated_chunks %u of block %u in file \"%s\", and zero this block", + pcAddr->allocated_chunks, blocknum, path))); + continue; + } else { + 
pfree(global_chunknos); + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid allocated_chunks %u of block %u in file \"%s\"", + pcAddr->allocated_chunks, blocknum, path))); + } + } + + /* check chunknos for one page */ + for (int i = 0; i < pcAddr->allocated_chunks; ++i) { + /* check for invalid chunkno */ + if (pcAddr->chunknos[i] == 0 || pcAddr->chunknos[i] > MAX_CHUNK_NUMBER(chunk_size)) { + if (u_sess->attr.attr_security.zero_damaged_pages) { + rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0, + SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size)); + securec_check_c(rc, "\0", "\0"); + ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid chunk number %u of block %u in file \"%s\", and zero this block", + pcAddr->chunknos[i], blocknum, path))); + continue; + } else { + pfree(global_chunknos); + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid chunk number %u of block %u in file \"%s\"", pcAddr->chunknos[i], + blocknum, path))); + } + } + + /* check for duplicate chunkno */ + if (global_chunknos[pcAddr->chunknos[i] - 1] != 0) { + if (u_sess->attr.attr_security.zero_damaged_pages) { + rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0, + SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size)); + securec_check_c(rc, "\0", "\0"); + ereport( + WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg( + "chunk number %u of block %u duplicate with block %u in file \"%s\", and zero this block", + pcAddr->chunknos[i], blocknum, global_chunknos[pcAddr->chunknos[i] - 1], path))); + continue; + } else { + pfree(global_chunknos); + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("chunk number %u of block %u duplicate with block %u in file \"%s\"", + pcAddr->chunknos[i], blocknum, global_chunknos[pcAddr->chunknos[i] - 1], path))); + } + } + } + + /* clean chunknos beyond allocated_chunks for one page */ + for (int i = pcAddr->allocated_chunks; i < BLCKSZ / chunk_size; ++i) { + if (pcAddr->chunknos[i] != 0) { + pcAddr->chunknos[i] = 0; + ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("clear chunk number %u beyond allocated_chunks %u of block %u in file \"%s\"", + pcAddr->chunknos[i], pcAddr->allocated_chunks, blocknum, path))); + } + } + + /* check nchunks for one page */ + if (pcAddr->nchunks > pcAddr->allocated_chunks) { + if (u_sess->attr.attr_security.zero_damaged_pages) { + rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0, + SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size)); + securec_check_c(rc, "\0", "\0"); + ereport( + WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("nchunks %u exceeds allocated_chunks %u of block %u in file \"%s\", and zero this block", + pcAddr->nchunks, pcAddr->allocated_chunks, blocknum, path))); + continue; + } else { + pfree(global_chunknos); + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("nchunks %u exceeds allocated_chunks %u of block %u in file \"%s\"", + pcAddr->nchunks, pcAddr->allocated_chunks, blocknum, path))); + } + } + + max_blocknum = blocknum; + if (pcAddr->nchunks > 0) { + max_nonzero_blocknum = blocknum; + } + + for (int i = 0; i < pcAddr->allocated_chunks; ++i) { + global_chunknos[pcAddr->chunknos[i] - 1] = blocknum + 1; + if (pcAddr->chunknos[i] > max_allocated_chunkno) { + max_allocated_chunkno = pcAddr->chunknos[i]; + } + } + } + + int unused_chunks = 0; + /* check for holes in allocated chunks */ + for (BlockNumber i = 0; i < max_allocated_chunkno; i++) { + if (global_chunknos[i] == 0) { + unused_chunks++; + } + } + + if (unused_chunks 
> 0) { + ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("there are %u chunks of total allocated chunks %u can not be use in file \"%s\"", + unused_chunks, max_allocated_chunkno, path), + errhint("You may need to run VACUMM FULL to optimize space allocation."))); + } + + /* update nblocks in head of compressed file */ + if (nblocks < max_nonzero_blocknum + 1) { + pg_atomic_write_u32(&pcMap->nblocks, max_nonzero_blocknum + 1); + pg_atomic_write_u32(&pcMap->last_synced_nblocks, max_nonzero_blocknum + 1); + + ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("update nblocks head of compressed file \"%s\". old: %u, new: %u", path, nblocks, + max_nonzero_blocknum + 1))); + } + + /* update allocated_chunks in head of compress file */ + if (allocated_chunks != max_allocated_chunkno) { + pg_atomic_write_u32(&pcMap->allocated_chunks, max_allocated_chunkno); + pg_atomic_write_u32(&pcMap->last_synced_allocated_chunks, max_allocated_chunkno); + + ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("update allocated_chunks in head of compressed file \"%s\". old: %u, new: %u", path, + allocated_chunks, max_allocated_chunkno))); + } + + /* clean compress address after max_blocknum + 1 */ + for (BlockNumber blocknum = max_blocknum + 1; blocknum < (BlockNumber)RELSEG_SIZE; blocknum++) { + char buf[128]; + char *p = NULL; + PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum); + + /* skip zero block */ + if (pcAddr->allocated_chunks == 0 && pcAddr->nchunks == 0) { + continue; + } + + /* clean compress address and output content of the address */ + rc = memset_s(buf, sizeof(buf), 0, sizeof(buf)); + securec_check_c(rc, "\0", "\0"); + p = buf; + + for (int i = 0; i < pcAddr->allocated_chunks; i++) { + if (pcAddr->chunknos[i]) { + const char *formatStr = i == 0 ? 
"%u" : ",%u"; + errno_t rc = + snprintf_s(p, sizeof(buf) - (p - buf), sizeof(buf) - (p - buf) - 1, formatStr, pcAddr->chunknos[i]); + securec_check_ss(rc, "\0", "\0"); + p += strlen(p); + } + } + + rc = + memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size)); + securec_check_c(rc, "\0", "\0"); + ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("clean unused compress address of block %u in file \"%s\", old " + "allocated_chunks/nchunks/chunknos: %u/%u/{%s}", + blocknum, path, pcAddr->allocated_chunks, pcAddr->nchunks, buf))); + } + + pfree(global_chunknos); + + if (pc_msync(pcMap) != 0) { + ereport(ERROR, (errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", path))); + } + + pcMap->last_recovery_start_time = pgStartTime; +} + +int64 CalculateMainForkSize(char* pathName, RelFileNode* rnode, ForkNumber forkNumber) +{ + Assert(IS_COMPRESSED_RNODE((*rnode), forkNumber)); + Assert(rnode->bucketNode == -1); + return CalculateCompressMainForkSize(pathName); +} + +void CopyCompressedPath(char dst[MAXPGPATH], const char* pathName, CompressedFileType compressFileType) +{ + int rc; + if (compressFileType == COMPRESSED_TABLE_PCA_FILE) { + rc = snprintf_s(dst, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, pathName); + } else { + rc = snprintf_s(dst, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, pathName); + } + securec_check_ss(rc, "\0", "\0"); +} + +int64 CalculateCompressMainForkSize(char* pathName, bool suppressedENOENT) +{ + int64 totalsize = 0; + + char pcFilePath[MAXPGPATH]; + CopyCompressedPath(pcFilePath, pathName, COMPRESSED_TABLE_PCA_FILE); + totalsize += CalculateFileSize(pcFilePath, MAXPGPATH, suppressedENOENT); + + CopyCompressedPath(pcFilePath, pathName, COMPRESSED_TABLE_PCD_FILE); + totalsize += CalculateFileSize(pcFilePath, MAXPGPATH, suppressedENOENT); + + return totalsize; +} + +uint16 ReadChunkSize(FILE* pcaFile, char* pcaFilePath, size_t len) +{ + uint16 chunkSize; + if (fseeko(pcaFile, (off_t)offsetof(PageCompressHeader, chunk_size), SEEK_SET) != 0) { + ereport(ERROR, + (errcode_for_file_access(), errmsg("could not seek in file \"%s\": \"%lu\": %m", pcaFilePath, len))); + } + + if (fread(&chunkSize, sizeof(chunkSize), 1, pcaFile) <= 0) { + ereport(ERROR, + (errcode_for_file_access(), errmsg("could not open file \"%s\": \"%lu\": %m", pcaFilePath, len))); + } + return chunkSize; +} + +int64 CalculateFileSize(char* pathName, size_t size, bool suppressedENOENT) +{ + struct stat structstat; + if (stat(pathName, &structstat)) { + if (errno == ENOENT) { + if (suppressedENOENT) { + return 0; + } + ereport(ERROR, (errcode_for_file_access(), errmsg("could not FIND file \"%s\": %m", pathName))); + } else { + ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", pathName))); + } + } + return structstat.st_size; +} + +uint1 ConvertChunkSize(uint32 compressedChunkSize, bool *success) +{ + uint1 chunkSize = INDEX_OF_HALF_BLCKSZ; + switch (compressedChunkSize) { + case BLCKSZ / 2: + chunkSize = INDEX_OF_HALF_BLCKSZ; + break; + case BLCKSZ / 4: + chunkSize = INDEX_OF_QUARTER_BLCKSZ; + break; + case BLCKSZ / 8: + chunkSize = INDEX_OF_EIGHTH_BRICK_BLCKSZ; + break; + case BLCKSZ / 16: + chunkSize = INDEX_OF_SIXTEENTHS_BLCKSZ; + break; + default: + *success = false; + return chunkSize; + } + *success = true; + return chunkSize; +} + +constexpr int MAX_RETRY_LIMIT = 60; +constexpr long RETRY_SLEEP_TIME = 1000000L; + +size_t ReadAllChunkOfBlock(char *dst, size_t destLen, BlockNumber blockNumber, 
ReadBlockChunksStruct& rbStruct) +{ + PageCompressHeader* header = rbStruct.header; + if (blockNumber >= header->nblocks) { + ereport(ERROR, + (ERRCODE_INVALID_PARAMETER_VALUE, + errmsg("blocknum \"%u\" exceeds max block number", blockNumber))); + } + char* pageBuffer = rbStruct.pageBuffer; + const char* fileName = rbStruct.fileName; + decltype(PageCompressHeader::chunk_size) chunkSize = header->chunk_size; + decltype(ReadBlockChunksStruct::segmentNo) segmentNo = rbStruct.segmentNo; + PageCompressAddr* currentAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber); + + size_t tryCount = 0; + /* for empty chunks write */ + uint8 allocatedChunks; + uint8 nchunks; + do { + allocatedChunks = currentAddr->allocated_chunks; + nchunks = currentAddr->nchunks; + for (uint8 i = 0; i < nchunks; ++i) { + off_t seekPos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, currentAddr->chunknos[i]); + uint8 start = i; + while (i < nchunks - 1 && currentAddr->chunknos[i + 1] == currentAddr->chunknos[i] + 1) { + i++; + } + if (fseeko(rbStruct.fp, seekPos, SEEK_SET) != 0) { + ReleaseMap(header, fileName); + ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in file \"%s\": %m", fileName))); + } + size_t readAmount = chunkSize * (i - start + 1); + if (fread(dst + start * chunkSize, 1, readAmount, rbStruct.fp) != readAmount && ferror(rbStruct.fp)) { + ReleaseMap(header, fileName); + ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fileName))); + } + } + if (nchunks == 0) { + break; + } + if (DecompressPage(dst, pageBuffer, header->algorithm) == BLCKSZ) { + PageHeader phdr = PageHeader(pageBuffer); + BlockNumber blkNo = blockNumber + segmentNo * ((BlockNumber)RELSEG_SIZE); + if (PageIsNew(phdr) || pg_checksum_page(pageBuffer, blkNo) == phdr->pd_checksum) { + break; + } + } + + if (tryCount < MAX_RETRY_LIMIT) { + ++tryCount; + pg_usleep(RETRY_SLEEP_TIME); + } else { + ReleaseMap(header, fileName); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("base backup cheksum or Decompressed blockno %u failed in file \"%s\", aborting backup. 
" + "nchunks: %u, allocatedChunks: %u, segno: %d.", + blockNumber, + fileName, + nchunks, + allocatedChunks, + segmentNo))); + } + } while (true); + if (allocatedChunks > nchunks) { + auto currentWriteSize = nchunks * chunkSize; + securec_check( + memset_s(dst + currentWriteSize, destLen - currentWriteSize, 0, (allocatedChunks - nchunks) * chunkSize), + "", + ""); + } + return allocatedChunks * chunkSize; +} + +CompressedFileType IsCompressedFile(char *fileName, size_t fileNameLen) +{ + size_t suffixLen = 4; + if (fileNameLen >= suffixLen) { + const char *suffix = fileName + fileNameLen - suffixLen; + if (strncmp(suffix, "_pca", suffixLen) == 0) { + return COMPRESSED_TABLE_PCA_FILE; + } else if (strncmp(suffix, "_pcd", suffixLen) == 0) { + return COMPRESSED_TABLE_PCD_FILE; + } + } + return COMPRESSED_TYPE_UNKNOWN; +} + +void ReleaseMap(PageCompressHeader* map, const char* fileName) +{ + if (map != NULL && pc_munmap(map) != 0) { + ereport(WARNING, (errcode_for_file_access(), errmsg("could not munmap file \"%s\": %m", fileName))); + } +} diff --git a/src/include/access/double_write.h b/src/include/access/double_write.h index 0c64ef71d..fa74cdf9c 100644 --- a/src/include/access/double_write.h +++ b/src/include/access/double_write.h @@ -31,7 +31,8 @@ typedef enum BufTagVer { ORIGIN_TAG = 0, - HASHBUCKET_TAG + HASHBUCKET_TAG, + PAGE_COMPRESS_TAG } BufTagVer; typedef struct st_dw_batch { diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 01021af8c..449374cfa 100755 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -20,8 +20,10 @@ #include "storage/buf/block.h" #include "storage/buf/buf.h" #include "storage/buf/bufpage.h" +#include "storage/page_compression.h" #include "storage/smgr/relfilenode.h" + struct XLogPhyBlock; /* * The minimum size of the WAL construction working area. 
If you need to @@ -47,7 +49,7 @@ struct XLogPhyBlock; * is taken */ /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); -extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, bool isupgrade = false, int bucket_id = InvalidBktId, +extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, bool isupgrade = false, int bucket_id = InvalidBktId, bool isSwitchoverBarrier = false); extern void XLogEnsureRecordSpace(int nbuffers, int ndatas); extern void XLogRegisterData(char* data, int len); diff --git a/src/include/access/xlogproc.h b/src/include/access/xlogproc.h index 9e7156852..5e0bf6bf2 100755 --- a/src/include/access/xlogproc.h +++ b/src/include/access/xlogproc.h @@ -59,6 +59,7 @@ typedef void (*relasexlogreadstate)(void* record); #define XLogBlockHeadGetForkNum(blockhead) ((blockhead)->forknum) #define XLogBlockHeadGetBlockNum(blockhead) ((blockhead)->blkno) #define XLogBlockHeadGetBucketId(blockhead) ((blockhead)->bucketNode) +#define XLogBlockHeadGetCompressOpt(blockhead) ((blockhead)->opt) #define XLogBlockHeadGetValidInfo(blockhead) ((blockhead)->block_valid) #define XLogBlockHeadGetPhysicalBlock(blockhead) ((blockhead)->pblk) /* for common blockhead end */ @@ -492,7 +493,8 @@ typedef struct { TransactionId xl_xid; /* xact id */ Oid spcNode; /* tablespace */ Oid dbNode; /* database */ - int4 bucketNode; /* bucket */ + int2 bucketNode; /* bucket */ + uint2 opt; XLogPhyBlock pblk; } XLogBlockHead; diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 9d7a07d93..47fd856a0 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -37,7 +37,8 @@ */ #define XLR_SPECIAL_REL_UPDATE 0x01 #define XLR_BTREE_UPGRADE_FLAG 0x02 - +/* If xlog record is the compress table creation */ +#define XLR_REL_COMPRESS 0X04 #define XLR_IS_TOAST 0X08 /* If xlog record is from toast page */ @@ -84,7 +85,7 @@ typedef struct XLogRecordBlockHeader { #define BKID_HAS_TDE_PAGE (0x40) #define BKID_GET_BKID(id) (id & 0x3F) -/* +/* * In segment-page storage, RelFileNode and block number are logic for XLog. Thus, we need record * physical location in xlog. This macro is used to check whether in such situation. 
*/ diff --git a/src/include/catalog/heap.h b/src/include/catalog/heap.h index ec0a8b1c5..ca1fa2ddc 100644 --- a/src/include/catalog/heap.h +++ b/src/include/catalog/heap.h @@ -79,12 +79,14 @@ extern Relation heap_create(const char *relname, bool mapped_relation, bool allow_system_table_mods, int8 row_compress, + Datum reloptions, Oid ownerid, bool skip_create_storage, TableAmType tam_type, int8 relindexsplit = 0, StorageType storage_type = HEAP_DISK, - bool newcbi = false); + bool newcbi = false, + Oid accessMethodObjectId = 0); extern bool heap_is_matview_init_state(Relation rel); @@ -97,7 +99,9 @@ heapCreatePartition(const char* part_name, Oid bucketOid, Oid ownerid, StorageType storage_type, - bool newcbi = false); + bool newcbi = false, + Datum reloptions = Datum(0)); + extern Oid heap_create_with_catalog(const char *relname, Oid relnamespace, @@ -119,7 +123,7 @@ extern Oid heap_create_with_catalog(const char *relname, bool use_user_acl, bool allow_system_table_mods, PartitionState *partTableState, - int8 row_compress, + int8 row_compress, HashBucketInfo *bucketinfo, bool record_dependce = true, List* ceLst = NULL, @@ -192,7 +196,7 @@ extern void CheckAttributeType(const char *attname, Oid atttypid, Oid attcollati #ifdef PGXC /* Functions related to distribution data of relations */ extern void AddRelationDistribution(const char *relname, Oid relid, DistributeBy *distributeby, - PGXCSubCluster *subcluster, List *parentOids, TupleDesc descriptor, bool isinstallationgroup, + PGXCSubCluster *subcluster, List *parentOids, TupleDesc descriptor, bool isinstallationgroup, bool isbucket = false, int bucketmaplen = 0); extern void GetRelationDistributionItems(Oid relid, DistributeBy *distributeby, TupleDesc descriptor, char *locatortype, int *hashalgorithm, int *hashbuckets, AttrNumber *attnum); diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index df5c265bc..19dbf5ff5 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -38,11 +38,23 @@ typedef struct xl_smgr_create { ForkNumber forkNum; } xl_smgr_create; +typedef struct xl_smgr_create_compress { + xl_smgr_create xlrec; + uint2 pageCompressOpts; +} xl_smgr_create_compress; + typedef struct xl_smgr_truncate { BlockNumber blkno; RelFileNodeOld rnode; } xl_smgr_truncate; +typedef struct xl_smgr_truncate_compress { + xl_smgr_truncate xlrec; + uint2 pageCompressOpts; +} xl_smgr_truncate_compress; + + + extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum); extern void smgr_redo(XLogReaderState *record); diff --git a/src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback_catalog_maindb_92_424.sql b/src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback_catalog_maindb_92_424.sql new file mode 100644 index 000000000..36dafafbf --- /dev/null +++ b/src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback_catalog_maindb_92_424.sql @@ -0,0 +1,2 @@ +DROP FUNCTION IF EXISTS pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8) CASCADE; +DROP FUNCTION IF EXISTS pg_catalog.gs_read_block_from_remote(int4, int4, int4, int2, int2, int4, xid, int4, xid, boolean) CASCADE; diff --git a/src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback_catalog_otherdb_92_424.sql b/src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback_catalog_otherdb_92_424.sql new file mode 100644 index 000000000..36dafafbf --- /dev/null +++ 
b/src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback_catalog_otherdb_92_424.sql @@ -0,0 +1,2 @@ +DROP FUNCTION IF EXISTS pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8) CASCADE; +DROP FUNCTION IF EXISTS pg_catalog.gs_read_block_from_remote(int4, int4, int4, int2, int2, int4, xid, int4, xid, boolean) CASCADE; diff --git a/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade_catalog_maindb_92_424.sql b/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade_catalog_maindb_92_424.sql new file mode 100644 index 000000000..2ecbe12fe --- /dev/null +++ b/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade_catalog_maindb_92_424.sql @@ -0,0 +1,22 @@ +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 4768; +CREATE OR REPLACE FUNCTION pg_catalog.gs_read_block_from_remote +( int4, + int4, + int4, + int2, + int2, + int4, + xid, + int4, + xid, + boolean) +RETURNS SETOF record LANGUAGE INTERNAL ROWS 1 STRICT as 'gs_read_block_from_remote_compress'; +-- pg_read_binary_file_blocks() +-- +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 8413; +CREATE FUNCTION pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8, + OUT path text, + OUT blocknum int4, + OUT len int4, + OUT data bytea) + AS 'pg_read_binary_file_blocks' LANGUAGE INTERNAL IMMUTABLE STRICT; \ No newline at end of file diff --git a/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade_catalog_otherdb_92_424.sql b/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade_catalog_otherdb_92_424.sql new file mode 100644 index 000000000..be265bf52 --- /dev/null +++ b/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade_catalog_otherdb_92_424.sql @@ -0,0 +1,22 @@ +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 4768; +CREATE OR REPLACE FUNCTION pg_catalog.gs_read_block_from_remote +( int4, + int4, + int4, + int2, + int2, + int4, + xid, + int4, + xid, + boolean) +RETURNS SETOF record LANGUAGE INTERNAL ROWS 1 STRICT as 'gs_read_block_from_remote_compress'; +-- pg_read_binary_file_blocks() +-- +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 8413; +CREATE FUNCTION pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8, + OUT path text, + OUT blocknum int4, + OUT len int4, + OUT data bytea) + AS 'pg_read_binary_file_blocks' LANGUAGE INTERNAL IMMUTABLE STRICT; diff --git a/src/include/knl/knl_instance.h b/src/include/knl/knl_instance.h index 1e06dac92..21d99f804 100755 --- a/src/include/knl/knl_instance.h +++ b/src/include/knl/knl_instance.h @@ -979,6 +979,7 @@ typedef struct knl_instance_context { knl_g_archive_obs_context archive_obs_cxt; knl_g_archive_obs_thread_info archive_obs_thread_info; struct HTAB* ngroup_hash_table; + struct HTAB* mmapCache; knl_g_hypo_context hypo_cxt; knl_g_segment_context segment_cxt; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 26c871c9a..3ecf20cc9 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1313,6 +1313,8 @@ typedef enum WaitEventIO { WAIT_EVENT_OBS_READ, WAIT_EVENT_OBS_WRITE, WAIT_EVENT_LOGCTRL_SLEEP, + WAIT_EVENT_COMPRESS_ADDRESS_FILE_FLUSH, + WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC, IO_EVENT_NUM = WAIT_EVENT_LOGCTRL_SLEEP - WAIT_EVENT_BUFFILE_READ + 1 // MUST be last, DO NOT use this value. 
} WaitEventIO; diff --git a/src/include/service/remote_read_client.h b/src/include/service/remote_read_client.h index ae05a29a7..abf57305c 100755 --- a/src/include/service/remote_read_client.h +++ b/src/include/service/remote_read_client.h @@ -33,7 +33,7 @@ extern int RemoteGetCU(char* remote_address, uint32 spcnode, uint32 dbnode, uint32 relnode, int32 colid, uint64 offset, int32 size, uint64 lsn, char* cu_data); -extern int RemoteGetPage(char* remote_address, uint32 spcnode, uint32 dbnode, uint32 relnode, int4 bucketnode, +extern int RemoteGetPage(char* remote_address, uint32 spcnode, uint32 dbnode, uint32 relnode, int2 bucketnode, uint2 opt, int32 forknum, uint32 blocknum, uint32 blocksize, uint64 lsn, char* page_data); #endif /* REMOTE_READ_CLIENT_H */ diff --git a/src/include/storage/buf/buf_internals.h b/src/include/storage/buf/buf_internals.h index b8debfc7b..7813dd107 100644 --- a/src/include/storage/buf/buf_internals.h +++ b/src/include/storage/buf/buf_internals.h @@ -96,6 +96,13 @@ typedef struct buftag { BlockNumber blockNum; /* blknum relative to begin of reln */ } BufferTag; +typedef struct buftagnocompress { + RelFileNodeV2 rnode; + ForkNumber forkNum; + BlockNumber blockNum; /* blknum relative to begin of reln */ +} BufferTagSecondVer; + + typedef struct buftagnohbkt { RelFileNodeOld rnode; /* physical relation identifier */ ForkNumber forkNum; diff --git a/src/include/storage/buf/bufpage.h b/src/include/storage/buf/bufpage.h index 8e5cd5faa..2538496b8 100644 --- a/src/include/storage/buf/bufpage.h +++ b/src/include/storage/buf/bufpage.h @@ -180,6 +180,8 @@ typedef HeapPageHeaderData* HeapPageHeader; #define GetPageHeaderSize(page) (PageIs8BXidHeapVersion(page) ? SizeOfHeapPageHeaderData : SizeOfPageHeaderData) #define SizeOfHeapPageUpgradeData MAXALIGN(offsetof(HeapPageHeaderData, pd_linp) - offsetof(PageHeaderData, pd_linp)) + +#define GET_ITEMID_BY_IDX(buf, i) ((ItemIdData *)(buf + GetPageHeaderSize(buf) + (i) * sizeof(ItemIdData))) #define PageXLogRecPtrGet(val) \ ((uint64) (val).xlogid << 32 | (val).xrecoff) @@ -406,6 +408,7 @@ inline OffsetNumber PageGetMaxOffsetNumber(char* pghr) #define PageSetLSNInternal(page, lsn) \ (((PageHeader)(page))->pd_lsn.xlogid = (uint32)((lsn) >> 32), ((PageHeader)(page))->pd_lsn.xrecoff = (uint32)(lsn)) +#ifndef FRONTEND inline void PageSetLSN(Page page, XLogRecPtr LSN, bool check = true) { if (check && XLByteLT(LSN, PageGetLSN(page))) { @@ -413,6 +416,7 @@ inline void PageSetLSN(Page page, XLogRecPtr LSN, bool check = true) } PageSetLSNInternal(page, LSN); } +#endif #define PageHasFreeLinePointers(page) (((PageHeader)(page))->pd_flags & PD_HAS_FREE_LINES) #define PageSetHasFreeLinePointers(page) (((PageHeader)(page))->pd_flags |= PD_HAS_FREE_LINES) diff --git a/src/include/storage/page_compression.h b/src/include/storage/page_compression.h new file mode 100644 index 000000000..aa302bc91 --- /dev/null +++ b/src/include/storage/page_compression.h @@ -0,0 +1,336 @@ +/* + * page_compression.h + * internal declarations for page compression + * + * Copyright (c) 2020, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/storage/page_compression.h + */ + +#ifndef PAGE_COMPRESSION_H +#define PAGE_COMPRESSION_H + +#include + +#include "storage/buf/bufpage.h" +#include "datatype/timestamp.h" +#include "catalog/pg_class.h" +#include "catalog/pg_am.h" +#include "utils/atomic.h" + +/* The page compression feature relies on native atomic operation support. 
+ * On platforms that do not support native atomic operations, the members
+ * of pg_atomic_uint32 contain semaphore objects, which will affect the
+ * persistence of compressed page address files.
+ */
+#define SUPPORT_PAGE_COMPRESSION (sizeof(pg_atomic_uint32) == sizeof(uint32))
+
+/* In order to avoid inconsistency of the address metadata when the server
+ * goes down, it is necessary to prevent the address metadata of one data block
+ * from crossing two storage device blocks. The block size of ordinary storage
+ * devices is a multiple of 512, so 512 is used as the block size of the
+ * compressed address file.
+ */
+#define COMPRESS_ADDR_BLCKSZ 512
+
+/* COMPRESS_ALGORITHM_XXX must be the same as COMPRESS_TYPE_XXX */
+#define COMPRESS_ALGORITHM_PGLZ 1
+#define COMPRESS_ALGORITHM_ZSTD 2
+
+constexpr uint32 COMPRESS_ADDRESS_FLUSH_CHUNKS = 5000;
+
+#define SUPPORT_COMPRESSED(relKind, relam) \
+    ((relKind) == RELKIND_RELATION || ((relKind) == RELKIND_INDEX && (relam) == BTREE_AM_OID))
+#define REL_SUPPORT_COMPRESSED(relation) \
+    ((relation->rd_rel->relkind) == RELKIND_RELATION || \
+     ((relation->rd_rel->relkind) == RELKIND_INDEX && (relation->rd_rel->relam) == BTREE_AM_OID))
+
+typedef uint32 pc_chunk_number_t;
+const uint32 PAGE_COMPRESSION_VERSION = 92424;
+
+enum CompressedFileType {
+    COMPRESSED_TYPE_UNKNOWN,
+    COMPRESSED_TABLE_FILE,
+    COMPRESSED_TABLE_PCA_FILE,
+    COMPRESSED_TABLE_PCD_FILE
+};
+
+/*
+ * layout of files for Page Compress:
+ *
+ * 1. page compression address file(_pca)
+ *    - PageCompressHeader
+ *    - PageCompressAddr[]
+ *
+ * 2. page compression data file(_pcd)
+ *    - PageCompressData[]
+ *
+ */
+typedef struct PageCompressHeader {
+    pg_atomic_uint32 nblocks; /* number of total blocks in this segment */
+    pg_atomic_uint32 allocated_chunks; /* number of total allocated chunks in data area */
+    uint16 chunk_size; /* size of each chunk, must be 1/2, 1/4, 1/8 or 1/16 of BLCKSZ */
+    uint8 algorithm; /* compress algorithm, 1=pglz, 2=zstd */
+    pg_atomic_uint32 last_synced_nblocks; /* last synced nblocks */
+    pg_atomic_uint32 last_synced_allocated_chunks; /* last synced allocated_chunks */
+    pg_atomic_uint32 sync;
+    TimestampTz last_recovery_start_time; /* postmaster start time of last recovery */
+} PageCompressHeader;
+
+typedef struct PageCompressAddr {
+    uint32 checksum;
+    volatile uint8 nchunks; /* number of chunks for this block */
+    volatile uint8 allocated_chunks; /* number of allocated chunks for this block */
+    /* variable-length field, 1-based chunk number array for this block; its length must be 2, 4, 8 or 16 */
+    pc_chunk_number_t chunknos[FLEXIBLE_ARRAY_MEMBER];
+} PageCompressAddr;
+
+struct ReadBlockChunksStruct {
+    PageCompressHeader* header; // header: pca file
+    char* pageBuffer; // pageBuffer: decompressed page
+    size_t pageBufferLen;
+    FILE* fp; // fp: table fp
+    int segmentNo;
+    char* fileName; // fileName: for error report
+};
+
+typedef struct PageCompressData {
+    char page_header[SizeOfPageHeaderData]; /* page header */
+    uint32 size : 16; /* size of compressed data */
+    uint32 byte_convert : 1;
+    uint32 diff_convert : 1;
+    uint32 unused : 14;
+    char data[FLEXIBLE_ARRAY_MEMBER]; /* compressed page, except for the page header */
+} PageCompressData;
+
+
+typedef struct HeapPageCompressData {
+    char page_header[SizeOfHeapPageHeaderData]; /* page header */
+    uint32 size : 16; /* size of compressed data */
+    uint32 byte_convert : 1;
+    uint32 diff_convert : 1;
+    uint32 unused : 14;
+    char data[FLEXIBLE_ARRAY_MEMBER]; /* compressed page, except for the page
header */ +} HeapPageCompressData; + + +const uint4 CHUNK_SIZE_LIST[4] = {BLCKSZ / 2, BLCKSZ / 4, BLCKSZ / 8, BLCKSZ / 16}; +constexpr uint4 INDEX_OF_HALF_BLCKSZ = 0; +constexpr uint4 INDEX_OF_QUARTER_BLCKSZ = 1; +constexpr uint4 INDEX_OF_EIGHTH_BRICK_BLCKSZ = 2; +constexpr uint4 INDEX_OF_SIXTEENTHS_BLCKSZ = 3; +#define MAX_PREALLOC_CHUNKS 7 +#define PCA_SUFFIX "%s_pca" +#define PCD_SUFFIX "%s_pcd" + +#define SIZE_OF_PAGE_COMPRESS_HEADER_DATA sizeof(PageCompressHeader) +#define SIZE_OF_PAGE_COMPRESS_ADDR_HEADER_DATA offsetof(PageCompressAddr, chunknos) +#define SIZE_OF_PAGE_COMPRESS_DATA_HEADER_DATA(heapData) \ + ((heapData) ? offsetof(HeapPageCompressData, data) : offsetof(PageCompressData, data)) + +#define SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size) \ + (SIZE_OF_PAGE_COMPRESS_ADDR_HEADER_DATA + sizeof(pc_chunk_number_t) * (BLCKSZ / (chunk_size))) + +#define NUMBER_PAGE_COMPRESS_ADDR_PER_BLOCK(chunk_size) (COMPRESS_ADDR_BLCKSZ / SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size)) + +#define OFFSET_OF_PAGE_COMPRESS_ADDR(chunk_size, blockno) \ + (COMPRESS_ADDR_BLCKSZ * (1 + (blockno) / NUMBER_PAGE_COMPRESS_ADDR_PER_BLOCK(chunk_size)) + \ + SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size) * ((blockno) % NUMBER_PAGE_COMPRESS_ADDR_PER_BLOCK(chunk_size))) + +#define GET_PAGE_COMPRESS_ADDR(pcbuffer, chunk_size, blockno) \ + (PageCompressAddr*)((char*)(pcbuffer) + OFFSET_OF_PAGE_COMPRESS_ADDR((chunk_size), (blockno) % RELSEG_SIZE)) + +#define SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunk_size) OFFSET_OF_PAGE_COMPRESS_ADDR((chunk_size), RELSEG_SIZE) + +#define OFFSET_OF_PAGE_COMPRESS_CHUNK(chunk_size, chunkno) ((chunk_size) * ((chunkno)-1)) + +/* Abnormal scenarios may cause holes in the space allocation of data files, + * causing data file expansion. Usually the holes are not too big, so the definition + * allows a maximum of 10,000 chunks for holes. If allocated_chunks exceeds this value, + * VACUUM FULL needs to be executed to reclaim space. + */ +#define MAX_CHUNK_NUMBER(chunk_size) ((uint32)(RELSEG_SIZE * (BLCKSZ / (chunk_size)) + 10000)) + +constexpr unsigned CMP_BYTE_CONVERT_LEN = 1; +constexpr unsigned CMP_DIFF_CONVERT_LEN = 1; +constexpr unsigned CMP_PRE_CHUNK_LEN = 3; +constexpr unsigned CMP_LEVEL_SYMBOL_LEN = 1; +constexpr unsigned CMP_LEVEL_LEN = 5; +constexpr unsigned CMP_ALGORITHM_LEN = 3; +constexpr unsigned CMP_CHUNK_SIZE_LEN = 2; + +constexpr unsigned CMP_BYTE_CONVERT_INDEX = 0; +constexpr unsigned CMP_DIFF_CONVERT_INDEX = 1; +constexpr unsigned CMP_PRE_CHUNK_INDEX = 2; +constexpr unsigned CMP_COMPRESS_LEVEL_SYMBOL = 3; +constexpr unsigned CMP_LEVEL_INDEX = 4; +constexpr unsigned CMP_ALGORITHM_INDEX = 5; +constexpr unsigned CMP_CHUNK_SIZE_INDEX = 6; + +struct CmpBitStuct { + unsigned int bitLen; + unsigned int mask; + unsigned int moveBit; +}; + +constexpr CmpBitStuct g_cmpBitStruct[] = {{CMP_BYTE_CONVERT_LEN, 0x01, 15}, + {CMP_DIFF_CONVERT_LEN, 0x01, 14}, + {CMP_PRE_CHUNK_LEN, 0x07, 11}, + {CMP_LEVEL_SYMBOL_LEN, 0x01, 10}, + {CMP_LEVEL_LEN, 0x1F, 5}, + {CMP_ALGORITHM_LEN, 0x07, 2}, + {CMP_CHUNK_SIZE_LEN, 0x03, 0}}; +/* RelFileCompressOption: Row-oriented table compress option */ +struct RelFileCompressOption { + unsigned byteConvert : g_cmpBitStruct[CMP_BYTE_CONVERT_INDEX].bitLen, /* need byte convert? */ + diffConvert : g_cmpBitStruct[CMP_DIFF_CONVERT_INDEX].bitLen, /* need diff convert processed? 
*/ + compressPreallocChunks : g_cmpBitStruct[CMP_PRE_CHUNK_INDEX] + .bitLen, /* prealloced chunks to store compressed data */ + compressLevelSymbol : g_cmpBitStruct[CMP_COMPRESS_LEVEL_SYMBOL] + .bitLen, /* compress level symbol, true for positive and false for negative */ + compressLevel : g_cmpBitStruct[CMP_LEVEL_INDEX].bitLen, /* compress level */ + compressAlgorithm : g_cmpBitStruct[CMP_ALGORITHM_INDEX].bitLen, /* compress algorithm */ + compressChunkSize : g_cmpBitStruct[CMP_CHUNK_SIZE_INDEX].bitLen; /* chunk size of compressed data */ +}; + +inline void TransCompressOptions(const RelFileNode& node, RelFileCompressOption* opt) +{ + unsigned short compressOption = node.opt; + opt->compressChunkSize = compressOption & g_cmpBitStruct[CMP_CHUNK_SIZE_INDEX].mask; + compressOption = compressOption >> g_cmpBitStruct[CMP_CHUNK_SIZE_INDEX].bitLen; + opt->compressAlgorithm = compressOption & g_cmpBitStruct[CMP_ALGORITHM_INDEX].mask; + compressOption = compressOption >> g_cmpBitStruct[CMP_ALGORITHM_INDEX].bitLen; + opt->compressLevel = compressOption & g_cmpBitStruct[CMP_LEVEL_INDEX].mask; + compressOption = compressOption >> g_cmpBitStruct[CMP_LEVEL_INDEX].bitLen; + opt->compressLevelSymbol = compressOption & g_cmpBitStruct[CMP_COMPRESS_LEVEL_SYMBOL].mask; + compressOption = compressOption >> g_cmpBitStruct[CMP_COMPRESS_LEVEL_SYMBOL].bitLen; + opt->compressPreallocChunks = compressOption & g_cmpBitStruct[CMP_PRE_CHUNK_INDEX].mask; + compressOption = compressOption >> g_cmpBitStruct[CMP_PRE_CHUNK_INDEX].bitLen; + opt->diffConvert = compressOption & g_cmpBitStruct[CMP_DIFF_CONVERT_INDEX].mask; + compressOption = compressOption >> g_cmpBitStruct[CMP_DIFF_CONVERT_INDEX].bitLen; + opt->byteConvert = compressOption & g_cmpBitStruct[CMP_BYTE_CONVERT_INDEX].mask; + compressOption = compressOption >> g_cmpBitStruct[CMP_BYTE_CONVERT_INDEX].bitLen; +} + +#define SET_COMPRESS_OPTION(node, byteConvert, diffConvert, preChunks, symbol, level, algorithm, chunkSize) \ + do { \ + (node).opt = (node).opt << g_cmpBitStruct[CMP_BYTE_CONVERT_INDEX].bitLen; \ + (node).opt += (byteConvert)&g_cmpBitStruct[CMP_BYTE_CONVERT_INDEX].mask; \ + (node).opt = (node).opt << g_cmpBitStruct[CMP_DIFF_CONVERT_INDEX].bitLen; \ + (node).opt += (diffConvert)&g_cmpBitStruct[CMP_DIFF_CONVERT_INDEX].mask; \ + (node).opt = (node).opt << g_cmpBitStruct[CMP_PRE_CHUNK_INDEX].bitLen; \ + (node).opt += (preChunks)&g_cmpBitStruct[CMP_PRE_CHUNK_INDEX].mask; \ + (node).opt = (node).opt << g_cmpBitStruct[CMP_COMPRESS_LEVEL_SYMBOL].bitLen; \ + (node).opt += (symbol)&g_cmpBitStruct[CMP_COMPRESS_LEVEL_SYMBOL].mask; \ + (node).opt = (node).opt << g_cmpBitStruct[CMP_LEVEL_INDEX].bitLen; \ + (node).opt += (level)&g_cmpBitStruct[CMP_LEVEL_INDEX].mask; \ + (node).opt = (node).opt << g_cmpBitStruct[CMP_ALGORITHM_INDEX].bitLen; \ + (node).opt += (algorithm)&g_cmpBitStruct[CMP_ALGORITHM_INDEX].mask; \ + (node).opt = (node).opt << g_cmpBitStruct[CMP_CHUNK_SIZE_INDEX].bitLen; \ + (node).opt += (chunkSize)&g_cmpBitStruct[CMP_CHUNK_SIZE_INDEX].mask; \ + } while (0) + +#define GET_ROW_COL_CONVERT(opt) \ + (((opt) >> g_cmpBitStruct[CMP_BYTE_CONVERT_INDEX].moveBit) & g_cmpBitStruct[CMP_BYTE_CONVERT_INDEX].mask) +#define GET_DIFF_CONVERT(opt) \ + (((opt) >> g_cmpBitStruct[CMP_DIFF_CONVERT_INDEX].moveBit) & g_cmpBitStruct[CMP_DIFF_CONVERT_INDEX].mask) +#define GET_COMPRESS_PRE_CHUNKS(opt) \ + (((opt) >> g_cmpBitStruct[CMP_PRE_CHUNK_INDEX].moveBit) & g_cmpBitStruct[CMP_PRE_CHUNK_INDEX].mask) +#define GET_COMPRESS_LEVEL_SYMBOL(opt) \ + (((opt) >> 
g_cmpBitStruct[CMP_COMPRESS_LEVEL_SYMBOL].moveBit) & g_cmpBitStruct[CMP_COMPRESS_LEVEL_SYMBOL].mask) +#define GET_COMPRESS_LEVEL(opt) \ + (((opt) >> g_cmpBitStruct[CMP_LEVEL_INDEX].moveBit) & g_cmpBitStruct[CMP_LEVEL_INDEX].mask) +#define GET_COMPRESS_ALGORITHM(opt) \ + (((opt) >> g_cmpBitStruct[CMP_ALGORITHM_INDEX].moveBit) & g_cmpBitStruct[CMP_ALGORITHM_INDEX].mask) +#define GET_COMPRESS_CHUNK_SIZE(opt) \ + (((opt) >> g_cmpBitStruct[CMP_CHUNK_SIZE_INDEX].moveBit) & g_cmpBitStruct[CMP_CHUNK_SIZE_INDEX].mask) + +#define IS_COMPRESSED_MAINFORK(reln, forkNum) ((reln)->smgr_rnode.node.opt != 0 && (forkNum) == MAIN_FORKNUM) +#define IS_COMPRESSED_RNODE(rnode, forkNum) ((rnode).opt != 0 && (forkNum) == MAIN_FORKNUM) + +/* Compress function */ +template +extern int TemplateCompressPage(const char* src, char* dst, int dst_size, RelFileCompressOption option); + +template +extern int TemplateDecompressPage(const char* src, char* dst, uint8 algorithm); + +int CompressPageBufferBound(const char* page, uint8 algorithm); + +int CompressPage(const char* src, char* dst, int dst_size, RelFileCompressOption option); + +int DecompressPage(const char* src, char* dst, uint8 algorithm); + +/* Memory mapping function */ +extern PageCompressHeader* pc_mmap(int fd, int chunk_size, bool readonly); +extern PageCompressHeader* pc_mmap_real_size(int fd, int size, bool readonly); +extern int pc_munmap(PageCompressHeader * map); +extern int pc_msync(PageCompressHeader * map); + +/** + * format mainfork path name to compressed path + * @param dst destination buffer + * @param pathName uncompressed table name + * @param compressFileType pca or pcd + */ +extern void CopyCompressedPath(char dst[MAXPGPATH], const char* pathName, CompressedFileType compressFileType); + +/** + * @param pathName mainFork File path name + * @param relFileNode physically access, for validation + * @param forkNumber for validation + * @return size of mainFork + */ +extern int64 CalculateMainForkSize(char* pathName, RelFileNode* relFileNode, ForkNumber forkNumber); +extern int64 CalculateCompressMainForkSize(char* pathName, bool suppressedENOENT = false); + +extern uint16 ReadChunkSize(FILE *pcaFile, char* pcaFilePath, size_t len); + +/** + * read compressed chunks into dst, and decompressed page into pageBuffer + * @param dst destination + * @param destLen destination length + * @param blockNumber blockNumber + * @param ReadBlockChunksStruct other data needed + */ +size_t ReadAllChunkOfBlock(char *dst, size_t destLen, BlockNumber blockNumber, ReadBlockChunksStruct& rbStruct); +/** + * check if fileName is end with pca or pcd + * @param fileName fileName + * @return filetype + */ +CompressedFileType IsCompressedFile(char *fileName, size_t fileNameLen); + +int64 CalculateFileSize(char* pathName, size_t size, bool suppressedENOENT = false); +/** + * release mmap. 
print a warning log on failure
+ * @param map mmap pointer
+ * @param fileName mmap filename, for logging
+ */
+void ReleaseMap(PageCompressHeader* map, const char* fileName);
+
+/**
+ * convert chunk size to the index of CHUNK_SIZE_LIST
+ * @param compressedChunkSize {BLCKSZ / 2, BLCKSZ / 4, BLCKSZ / 8, BLCKSZ / 16}
+ * @param success set to true on success, false otherwise
+ * @return index of CHUNK_SIZE_LIST
+ */
+extern uint1 ConvertChunkSize(uint32 compressedChunkSize, bool* success);
+
+/**
+ * compute the checksum of a page compression address entry (the checksum field itself is excluded)
+ * @param blockNumber block number
+ * @param pageCompressAddr addr of block
+ * @return checksum uint32
+ */
+extern uint32 AddrChecksum32(BlockNumber blockNumber, const PageCompressAddr* pageCompressAddr);
+
+#ifndef FRONTEND
+extern void CheckAndRepairCompressAddress(PageCompressHeader *pcMap, uint16 chunk_size, uint8 algorithm, const char *path);
+PageCompressHeader* GetPageCompressHeader(void* vfd, int chunkSize, const RelFileNodeForkNum &relFileNodeForkNum);
+void UnReferenceAddrFile(void* vfd);
+void RealInitialMMapLockArray();
+#endif
+
+#endif /* PAGE_COMPRESSION_H */
diff --git a/src/include/storage/page_compression_impl.h b/src/include/storage/page_compression_impl.h
new file mode 100644
index 000000000..c804bc43b
--- /dev/null
+++ b/src/include/storage/page_compression_impl.h
@@ -0,0 +1,715 @@
+/*
+ * page_compression_impl.h
+ * implementation of page compression
+ *
+ * Copyright (c) 2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/include/storage/page_compression_impl.h
+ */
+
+#ifndef RC_INCLUDE_STORAGE_PAGE_COMPRESSION_IMPL_H
+#define RC_INCLUDE_STORAGE_PAGE_COMPRESSION_IMPL_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "storage/page_compression.h"
+#include "utils/pg_lzcompress.h"
+
+#include
+
+#define DEFAULT_ZSTD_COMPRESSION_LEVEL (1)
+#define MIN_ZSTD_COMPRESSION_LEVEL ZSTD_minCLevel()
+#define MAX_ZSTD_COMPRESSION_LEVEL ZSTD_maxCLevel()
+
+#define COMPRESS_DEFAULT_ERROR (-1)
+#define COMPRESS_UNSUPPORTED_ERROR (-2)
+#define GS_INVALID_ID16 (uint16)0xFFFF
+#define MIN_DIFF_SIZE (64)
+#define MIN_CONVERT_CNT (4)
+
+#ifndef USE_ASSERT_CHECKING
+#define ASSERT(condition)
+#else
+#define ASSERT(condition) assert(condition)
+#endif
+
+
+#ifndef FRONTEND
+
+/**
+ * return data of page
+ * @param dst HeapPageCompressData or PageCompressData
+ * @param heapPageData true for HeapPageCompressData, false for PageCompressData
+ * @return dst->data
+ */
+static inline char* GetPageCompressedData(char* dst, bool heapPageData)
+{
+    return heapPageData ?
((HeapPageCompressData*)dst)->data : ((PageCompressData*)dst)->data; +} + +static inline void FreePointer(void* pointer) +{ + if (pointer != NULL) { + pfree(pointer); + } +} + +/*======================================================================================*/ +#define COMPRESS "" +void cprs_diff_convert_rows(char *buf, uint32 offset,uint16 min_row_len, uint16 real_row_cnt) { + uint16 row_cnt = real_row_cnt; + uint32 common_size = min_row_len; + uint8 *copy_begin = (uint8 *)(buf + offset); + uint16 i, j; + + for (i = 0; i < common_size; i++) { + for (j = row_cnt - 1; j > 0; j--) { + copy_begin[i * row_cnt + j] -= copy_begin[i * row_cnt + (j - 1)]; + } + } + return ; +} + +void cprs_diff_deconvert_rows(char *buf, uint32 offset, uint16 min_row_len, uint16 real_row_cnt) { + uint16 row_cnt = real_row_cnt; + uint32 common_size = min_row_len; + uint8 *copy_begin = (uint8 *)(buf + offset); + uint16 i, j; + + for (i = 0; i < common_size; i++) { + for (j = 1; j < row_cnt; j++) { + copy_begin[i * row_cnt + j] += copy_begin[i * row_cnt + (j - 1)]; + } + } + return ; +} + +void CompressConvertItemIds(char *buf, char *aux_buf) { + errno_t ret; + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + uint32 total_size = row_cnt * sizeof(ItemIdData); + char *copy_begin = buf + GetPageHeaderSize(page); + uint16 i, j, k; + + // clear aux_buf + ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(ret, "", ""); + + k = 0; + for (i = 0; i < row_cnt; i++) { + for (j = 0; j < sizeof(ItemIdData); j++) { + aux_buf[j * row_cnt + i] = copy_begin[k++]; + } + } + + // cp aux_buf to page_buf + ret = memcpy_sp(copy_begin, total_size, aux_buf, total_size); + securec_check(ret, "", ""); + return ; +} + + +void CompressConvertRows(char *buf, char *aux_buf, int16 *real_order, uint16 max_row_len, uint16 real_row_cnt) { + errno_t ret; + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = real_row_cnt; + uint32 total_size = page->pd_special - page->pd_upper; + char *copy_begin = buf + page->pd_upper; + char *row; + uint16 i, j, k, cur, up, row_size; + + ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(ret, "", ""); + + k = 0; + for (i = 0; i < max_row_len; i++) { + for (j = 0; j < row_cnt; j++) { + up = (j == (row_cnt - 1)) ? page->pd_special : GET_ITEMID_BY_IDX(buf, (real_order[j + 1]))->lp_off; + cur = GET_ITEMID_BY_IDX(buf, (real_order[j]))->lp_off; + row_size = up - cur; + row = buf + cur; + if (i < row_size) { + aux_buf[k++] = row[i]; // this part is reshaped + } + } + } + + if (k != total_size) { + printf("ERROR!!! convert_rows_2 error...!!!\n"); + ASSERT(0); + return; + } + + // cp aux_buf to page_buf + ret = memcpy_sp(copy_begin, total_size, aux_buf, total_size); + securec_check(ret, "", ""); + return ; +} + +// 1: as tuple_offset order, that means asc order. +// 2: store all itemid's idx. +// 3:maybe some itemid is not in order. 
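+// Layout of real_order, sketched for illustration (the lp_off values below are made up):
+// the caller allocates row_cnt * 2 entries; the first real_row_cnt slots receive the final
+// ascending-lp_off order, and the second half (link_order) is scratch space for the linked
+// list built below. For example, three normal itemids with lp_off 7000/6000/5000 (the usual
+// heap fill order) end up as real_order = {2, 1, 0}.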
+void CompressConvertItemRealOrder(char *buf, int16 *real_order, uint16 real_row_cnt) { + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + ItemIdData *begin = (ItemIdData *)(buf + GetPageHeaderSize(page)); + int16 *link_order = real_order + real_row_cnt; + + int16 i, head, curr, prev; + int16 end = -1; // invalid index + + head = end; + // very likely to seems that itemids stored by desc order, and ignore invalid itemid + for (i = 0; i < row_cnt; i++) { + if (!ItemIdIsNormal(begin + i)) { + continue; + } + + if (head == end) { // set the head idx, insert the first + link_order[i] = end; + head = i; + continue; + } + + if ((begin + i)->lp_off < (begin + head)->lp_off) { + link_order[i] = head; // update the head idx + head = i; + continue; + } + + prev = head; + curr = link_order[head]; + while ((curr != end) && ((begin + i)->lp_off > (begin + curr)->lp_off)) { + prev = curr; + curr = link_order[curr]; + } + + link_order[prev] = i; + link_order[i] = curr; + } + + // arrange the link to array + curr = head; + for (i = 0; i < real_row_cnt; i++) { + real_order[i] = curr; + curr = link_order[curr]; + } + + if (curr != end) { + printf("ERROR!!! pre_convert_real_order error...!!!\n"); + ASSERT(0); + return; + } + +} + +// maybe some itemid is not valid +uint16 HeapPageCalcRealRowCnt (char *buf) { + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 cnt = 0; + uint16 i; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + + for (i = 0; i < row_cnt; i++) { + if (ItemIdIsNormal(GET_ITEMID_BY_IDX(buf, i))) { + cnt++; + } + } + return cnt; +} + +// to find all row size are diffs in MIN_DIFF_SIZE byts. +bool CompressConvertCheck(char *buf, int16 **real_order, uint16 *max_row_len, uint16 *min_row_len, uint16 *real_row_cnt) { + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + int16 i, row_size; + ItemIdData *ptr = NULL; + uint16 up = page->pd_special; + uint16 min_size = GS_INVALID_ID16; + uint16 max_size = 0; + errno_t ret; + if (page->pd_lower < GetPageHeaderSize(page) || (page->pd_lower > page->pd_upper)) { + return false; + } + + uint16 normal_row_cnt = HeapPageCalcRealRowCnt(buf); + if (normal_row_cnt < MIN_CONVERT_CNT) { // no need convert + return false; + } + + // to store the real tuple order. + /* + --------------------------|-------------------------- + xxxxxxxxxxxxxxxxxxxxxxxxxx|xxxxxxxxxxxxxxxxxxxxxxxxxx + --------------------------|-------------------------- + */ + // the first part is real array order, and the second part is link. + *real_order = (int16 *)palloc(sizeof(uint16) * row_cnt * 2); + if (*real_order == NULL) { + printf("zfunc compress file"); + return false; + } + ret = memset_sp(*real_order, sizeof(uint16) * row_cnt * 2, 0, sizeof(uint16) * row_cnt * 2); + securec_check(ret, "", ""); + + // order the ItemIds by tuple_offset order. + CompressConvertItemRealOrder(buf, *real_order, normal_row_cnt); + + // do the check, to check all size of tuples. + for (i = normal_row_cnt - 1; i >= 0; i--) { + ptr = GET_ITEMID_BY_IDX(buf, ((*real_order)[i])); + + row_size = up - ptr->lp_off; + if (row_size < MIN_CONVERT_CNT * 2) { + return false; + } + + min_size = (row_size < min_size) ? row_size : min_size; + max_size = (row_size > max_size) ? 
row_size : max_size; + + if ((max_size - min_size) > MIN_DIFF_SIZE) { // no need convert + return false; + } + up = ptr->lp_off; + } + + // get the min row common size. + *max_row_len = max_size; + *min_row_len = min_size; + *real_row_cnt = normal_row_cnt; + return true; +} + +bool CompressConvertOnePage(char *buf, char *aux_buf, bool diff_convert) { + uint16 max_row_len = 0; + uint16 min_row_len = 0; + int16 *real_order = NULL; // itemids are not in order sometimes. we must find the real + uint16 real_row_cnt = 0; + if (!CompressConvertCheck(buf, &real_order, &max_row_len, &min_row_len, &real_row_cnt)) { + FreePointer((void*)real_order); + return false; + } + + CompressConvertRows(buf, aux_buf, real_order, max_row_len, real_row_cnt); + CompressConvertItemIds(buf, aux_buf); + + if (diff_convert) { + cprs_diff_convert_rows(buf, ((HeapPageHeaderData *)buf)->pd_upper, min_row_len, real_row_cnt); + cprs_diff_convert_rows(buf, GetPageHeaderSize(buf), sizeof(ItemIdData), + (((HeapPageHeaderData *)buf)->pd_lower - GetPageHeaderSize(buf)) / sizeof(ItemIdData)); + } + + FreePointer((void*)real_order); + return true; +} + +void CompressPagePrepareConvert(char *src, bool diff_convert, bool *real_ByteConvert) +{ + char *aux_buf = NULL; + errno_t rc; + + aux_buf = (char *)palloc(BLCKSZ); + if (aux_buf == NULL) { + // add log + return; + } + rc = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(rc, "", ""); + + // do convert + *real_ByteConvert = false; + if (CompressConvertOnePage(src, aux_buf, diff_convert)) { + *real_ByteConvert = true; + } + + FreePointer((void*)aux_buf); +} + +/** + * CompressPageBufferBound() + * -- Get the destination buffer boundary to compress one page. + * Return needed destination buffer size for compress one page or + * -1 for unrecognized compression algorithm + */ +int CompressPageBufferBound(const char* page, uint8 algorithm) +{ + switch (algorithm) { + case COMPRESS_ALGORITHM_PGLZ: + return BLCKSZ + 4; + case COMPRESS_ALGORITHM_ZSTD: + return ZSTD_compressBound(BLCKSZ - GetPageHeaderSize(page)); + default: + return -1; + } +} + +int CompressPage(const char* src, char* dst, int dst_size, RelFileCompressOption option) +{ + if (PageIs8BXidHeapVersion(src)) { + return TemplateCompressPage(src, dst, dst_size, option); + } else { + return TemplateCompressPage(src, dst, dst_size, option); + } +} + +int DecompressPage(const char* src, char* dst, uint8 algorithm) +{ + if (PageIs8BXidHeapVersion(src)) { + return TemplateDecompressPage(src, dst, algorithm); + } else { + return TemplateDecompressPage(src, dst, algorithm); + } +} + +inline size_t GetSizeOfHeadData(bool heapPageData) +{ + if (heapPageData) { + return SizeOfHeapPageHeaderData; + } else { + return SizeOfPageHeaderData; + } +} + +/** + * CompressPage() -- Compress one page. + * + * Only the parts other than the page header will be compressed. The + * compressed data is rounded by chunck_size, The insufficient part is + * filled with zero. Compression needs to be able to save at least one + * chunk of space, otherwise it fail. + * This function returen the size of compressed data or + * -1 for compression fail + * COMPRESS_UNSUPPORTED_ERROR for unrecognized compression algorithm + */ +template +int TemplateCompressPage(const char* src, char* dst, int dst_size, RelFileCompressOption option) +{ + int compressed_size; + int8 level = option.compressLevelSymbol ? 
option.compressLevel : -option.compressLevel; + size_t sizeOfHeaderData = GetSizeOfHeadData(heapPageData); + char* src_copy = NULL; + bool real_ByteConvert = false; + errno_t rc; + + if (option.byteConvert) { + // copy and maybe change it + src_copy = (char*)palloc(BLCKSZ); + if (src_copy == NULL) { + // add log + return -1; + } + rc = memcpy_s(src_copy, BLCKSZ, src, BLCKSZ); + securec_check(rc, "", ""); + CompressPagePrepareConvert(src_copy, option.diffConvert, &real_ByteConvert); /* preprocess convert src */ + } + + char* data = GetPageCompressedData(dst, heapPageData); + + switch (option.compressAlgorithm) { + case COMPRESS_ALGORITHM_PGLZ: + if (real_ByteConvert) { + compressed_size = lz_compress(src_copy + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, data); + } else { + compressed_size = lz_compress(src + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, data); + } + break; + case COMPRESS_ALGORITHM_ZSTD: { + if (level == 0 || level < MIN_ZSTD_COMPRESSION_LEVEL || level > MAX_ZSTD_COMPRESSION_LEVEL) { + level = DEFAULT_ZSTD_COMPRESSION_LEVEL; + } + + if (real_ByteConvert) { + compressed_size = + ZSTD_compress(data, dst_size, src_copy + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, level); + } else { + compressed_size = + ZSTD_compress(data, dst_size, src + sizeOfHeaderData, BLCKSZ - sizeOfHeaderData, level); + } + + if (ZSTD_isError(compressed_size)) { + FreePointer((void*)src_copy); + return -1; + } + break; + } + default: + FreePointer((void*)src_copy); + return COMPRESS_UNSUPPORTED_ERROR; + } + + if (compressed_size < 0) { + FreePointer((void*)src_copy); + return -1; + } + + if (heapPageData) { + HeapPageCompressData* pcdptr = ((HeapPageCompressData*)dst); + rc = memcpy_s(pcdptr->page_header, sizeOfHeaderData, src, sizeOfHeaderData); + securec_check(rc, "", ""); + pcdptr->size = compressed_size; + pcdptr->byte_convert = real_ByteConvert; + pcdptr->diff_convert = option.diffConvert; + } else { + PageCompressData* pcdptr = ((PageCompressData*)dst); + rc = memcpy_s(pcdptr->page_header, sizeOfHeaderData, src, sizeOfHeaderData); + securec_check(rc, "", ""); + pcdptr->size = compressed_size; + pcdptr->byte_convert = real_ByteConvert; + pcdptr->diff_convert = option.diffConvert; + } + + FreePointer((void*)src_copy); + return SIZE_OF_PAGE_COMPRESS_DATA_HEADER_DATA(heapPageData) + compressed_size; +} + +/*======================================================================================*/ +#define DECOMPRESS "" +void DecompressDeconvertRows(char *buf, char *aux_buf, int16 *real_order, uint16 max_row_len, uint16 real_row_cnt) { + errno_t ret; + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = real_row_cnt; + uint32 total_size = page->pd_special - page->pd_upper; + char *copy_begin = buf + page->pd_upper; + char *row; + uint16 i, j, k, cur, up, row_size; + + ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(ret, "", ""); + + for (i = 0, k = 0; i < max_row_len; i++) { + for (j = 0; j < row_cnt; j++) { + up = (j == (row_cnt - 1)) ? page->pd_special : GET_ITEMID_BY_IDX(buf, (real_order[j + 1]))->lp_off; + cur = GET_ITEMID_BY_IDX(buf, (real_order[j]))->lp_off; + row_size = up - cur; + row = aux_buf + cur; + if (i < row_size) { + row[i] = copy_begin[k++]; // this part is reshaped + } + } + } + + if (k != total_size) { + printf("ERROR!!! 
pg_deconvert_rows error...!!!\n"); + ASSERT(0); + return; + } + + // cp aux_buf to page_buf + ret = memcpy_sp(copy_begin, total_size, aux_buf + page->pd_upper, total_size); + securec_check(ret, "", ""); + return ; +} + +void DecompressDeconvertItemIds(char *buf, char *aux_buf) { + errno_t ret; + HeapPageHeaderData *page = (HeapPageHeaderData *)buf; + uint16 row_cnt = (page->pd_lower - GetPageHeaderSize(page)) / sizeof(ItemIdData); + uint32 total_size = row_cnt * sizeof(ItemIdData); + char* copy_begin = buf + GetPageHeaderSize(page); + uint16 i, j, k; + + // clear aux_buf + ret = memset_sp(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(ret, "", ""); + + for (i = 0, k = 0; i < sizeof(ItemIdData); i++) { + for (j = 0; j < row_cnt; j++) { + aux_buf[j * sizeof(ItemIdData) + i] = copy_begin[k++]; + } + } + + // cp aux_buf to page_buf + ret = memcpy_sp(copy_begin, total_size, aux_buf, total_size); + securec_check(ret, "", ""); + return ; +} + +void DecompressDeconvertOnePage(char *buf, char *aux_buf, bool diff_convert) { + uint16 max_row_len = 0; + uint16 min_row_len = 0; + int16 *real_order = NULL; // itemids are not in order sometimes. we must find the real + uint16 real_row_cnt = 0; + + if (diff_convert) { + cprs_diff_deconvert_rows(buf, GetPageHeaderSize(buf), sizeof(ItemIdData), + (((HeapPageHeaderData *)buf)->pd_lower - GetPageHeaderSize(buf)) / sizeof(ItemIdData)); + } + + // =======firstly, arrange the itemids. + DecompressDeconvertItemIds(buf, aux_buf); + + if (!CompressConvertCheck(buf, &real_order, &max_row_len, &min_row_len, &real_row_cnt)) { + FreePointer((void*)real_order); + ASSERT(0); + return ; + } + + // =======and last, the tuples + if (diff_convert) { + cprs_diff_deconvert_rows(buf, ((HeapPageHeaderData *)buf)->pd_upper, min_row_len, real_row_cnt); + } + DecompressDeconvertRows(buf, aux_buf, real_order, max_row_len, real_row_cnt); + FreePointer((void*)real_order); +} + +void DecompressPageDeconvert(char *src, bool diff_convert) +{ + char *aux_buf = NULL; + errno_t rc; + + aux_buf = (char *)palloc(BLCKSZ); + if (aux_buf == NULL) { + // add log + return; + } + rc = memset_s(aux_buf, BLCKSZ, 0, BLCKSZ); + securec_check(rc, "", ""); + + // do convert + DecompressDeconvertOnePage(src, aux_buf, diff_convert); + + FreePointer((void*)aux_buf); +} + +/** + * DecompressPage() -- Decompress one compressed page. + * return size of decompressed page which should be BLCKSZ or + * -1 for decompress error + * -2 for unrecognized compression algorithm + * + * note:The size of dst must be greater than or equal to BLCKSZ. 
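+ *
+ * A minimal caller sketch, for illustration only (the buffer and header names here are
+ * assumptions, not part of this patch):
+ *     char page[BLCKSZ];
+ *     int nbytes = DecompressPage(compressedChunkBuf, page, pcaHeader->algorithm);
+ *     if (nbytes != BLCKSZ)
+ *         treat the block as damaged or unsupported (see the -1 / -2 return codes above)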
+ */ +template +int TemplateDecompressPage(const char* src, char* dst, uint8 algorithm) +{ + int decompressed_size; + char* data; + uint32 size; + bool byte_convert, diff_convert; + size_t headerSize = GetSizeOfHeadData(heapPageData); + int rc = memcpy_s(dst, headerSize, src, headerSize); + securec_check(rc, "", ""); + + if (heapPageData) { + data = ((HeapPageCompressData*)src)->data; + size = ((HeapPageCompressData*)src)->size; + byte_convert = ((HeapPageCompressData*)src)->byte_convert; + diff_convert = ((HeapPageCompressData*)src)->diff_convert; + } else { + data = ((PageCompressData*)src)->data; + size = ((PageCompressData*)src)->size; + byte_convert = ((PageCompressData*)src)->byte_convert; + diff_convert = ((PageCompressData*)src)->diff_convert; + } + + switch (algorithm) { + case COMPRESS_ALGORITHM_PGLZ: + decompressed_size = lz_decompress(data, size, dst + headerSize, BLCKSZ - headerSize, false); + break; + case COMPRESS_ALGORITHM_ZSTD: + decompressed_size = ZSTD_decompress(dst + headerSize, BLCKSZ - headerSize, data, size); + if (ZSTD_isError(decompressed_size)) { + return -1; + } + break; + default: + return COMPRESS_UNSUPPORTED_ERROR; + break; + } + + if (byte_convert) { + DecompressPageDeconvert(dst, diff_convert); + } + + return headerSize + decompressed_size; +} +#endif + +/** + * pc_mmap() -- create memory map for page compress file's address area. + * + */ +PageCompressHeader* pc_mmap(int fd, int chunk_size, bool readonly) +{ + int pc_memory_map_size = SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunk_size); + return pc_mmap_real_size(fd, pc_memory_map_size, readonly); +} + +/** + * pc_mmap_real_size() -- create memory map for page compress file's address area. + * + */ +extern PageCompressHeader* pc_mmap_real_size(int fd, int pc_memory_map_size, bool readonly) +{ + PageCompressHeader* map = NULL; + int file_size = lseek(fd, 0, SEEK_END); + if (file_size != pc_memory_map_size) { + if (ftruncate(fd, pc_memory_map_size) != 0) { + return (PageCompressHeader*) MAP_FAILED; + } + } + if (readonly) { + map = (PageCompressHeader*) mmap(NULL, pc_memory_map_size, PROT_READ, MAP_SHARED, fd, 0); + } else { + map = (PageCompressHeader*) mmap(NULL, pc_memory_map_size, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); + } + return map; +} + +/** + * pc_munmap() -- release memory map of page compress file. + * + */ +int pc_munmap(PageCompressHeader *map) +{ + return munmap(map, SIZE_OF_PAGE_COMPRESS_ADDR_FILE(map->chunk_size)); +} + +/** + * pc_msync() -- sync memory map of page compress file. 
+ * + */ +int pc_msync(PageCompressHeader *map) +{ +#ifndef FRONTEND + if (!u_sess->attr.attr_storage.enableFsync) { + return 0; + } +#endif + return msync(map, SIZE_OF_PAGE_COMPRESS_ADDR_FILE(map->chunk_size), MS_SYNC); +} + + +uint32 AddrChecksum32(BlockNumber blockNumber, const PageCompressAddr* pageCompressAddr) +{ +#define UINT_LEN sizeof(uint32) + uint32 checkSum = 0; + char* addr = ((char*) pageCompressAddr) + UINT_LEN; + size_t len = sizeof(PageCompressAddr) - UINT_LEN; + do { + if (len >= UINT_LEN) { + checkSum += *((uint32*) addr); + addr += UINT_LEN; + len -= UINT_LEN; + } else { + char finalNum[UINT_LEN] = {0}; + size_t i = 0; + for (; i < len; ++i) { + finalNum[i] = addr[i]; + } + checkSum += *((uint32*) finalNum); + len -= i; + } + } while (len); + return checkSum; +} + +#endif diff --git a/src/include/storage/remote_adapter.h b/src/include/storage/remote_adapter.h index 29e800fa5..db786a2a9 100755 --- a/src/include/storage/remote_adapter.h +++ b/src/include/storage/remote_adapter.h @@ -37,7 +37,7 @@ extern int StandbyReadCUforPrimary(uint32 spcnode, uint32 dbnode, uint32 relnode, int32 colid, uint64 offset, int32 size, uint64 lsn, bytea** cudata); -extern int StandbyReadPageforPrimary(uint32 spcnode, uint32 dbnode, uint32 relnode, int16 bucketnode, int32 forknum, uint32 blocknum, +extern int StandbyReadPageforPrimary(uint32 spcnode, uint32 dbnode, uint32 relnode, int16 bucketnode, uint16 opt, int32 forknum, uint32 blocknum, uint32 blocksize, uint64 lsn, bytea** pagedata); #endif /* REMOTE_ADAPTER_H */ diff --git a/src/include/storage/smgr/fd.h b/src/include/storage/smgr/fd.h index 145aed28e..9e11a9760 100644 --- a/src/include/storage/smgr/fd.h +++ b/src/include/storage/smgr/fd.h @@ -42,6 +42,7 @@ #include #include "utils/hsearch.h" #include "storage/smgr/relfilenode.h" +#include "storage/page_compression.h" #include "postmaster/aiocompleter.h" /* @@ -182,6 +183,10 @@ extern int data_sync_elevel(int elevel); extern bool FdRefcntIsZero(SMgrRelation reln, ForkNumber forkNum); extern FileExistStatus CheckFileExists(const char* path); +/* Page compression support routines */ +extern void SetupPageCompressMemoryMap(File file, RelFileNode node, const RelFileNodeForkNum& relFileNodeForkNum); +extern PageCompressHeader *GetPageCompressMemoryMap(File file, uint32 chunk_size); + /* Filename components for OpenTemporaryFile */ // Note that this macro must be the same to macro in initdb.cpp // If you change it, you must also change initdb.cpp diff --git a/src/include/storage/smgr/relfilenode.h b/src/include/storage/smgr/relfilenode.h index 7a2b3b9c1..7a67bae31 100644 --- a/src/include/storage/smgr/relfilenode.h +++ b/src/include/storage/smgr/relfilenode.h @@ -45,6 +45,9 @@ typedef int ForkNumber; #define VISIBILITYMAP_FORKNUM 2 #define BCM_FORKNUM 3 #define INIT_FORKNUM 4 +// used for data file cache, you can modify than as you like +#define PCA_FORKNUM 5 +#define PCD_FORKNUM 6 /* * NOTE: if you add a new fork, change MAX_FORKNUM below and update the @@ -97,9 +100,18 @@ typedef struct RelFileNode { Oid spcNode; /* tablespace */ Oid dbNode; /* database */ Oid relNode; /* relation */ - int4 bucketNode; /* bucketid */ + int2 bucketNode; /* bucketid */ + uint2 opt; } RelFileNode; +typedef struct RelFileNodeV2 { + Oid spcNode; /* tablespace */ + Oid dbNode; /* database */ + Oid relNode; /* relation */ + int4 bucketNode; /* bucketid */ +} RelFileNodeV2; + + #define IsSegmentFileNode(rnode) ((rnode).bucketNode > InvalidBktId) #define IsHeapFileNode(rnode) (!IsSegmentFileNode(rnode)) #define 
IsSegmentPhysicalRelNode(rNode) (IsSegmentFileNode(rNode) && (rNode).relNode <= 5) @@ -130,6 +142,14 @@ typedef struct RelFileNodeOld (relFileNode).bucketNode = (bucketid); \ } while(0) +#define RelFileNodeV2Copy(relFileNodeV2, relFileNode) \ + do { \ + (relFileNodeV2).spcNode = (relFileNode).spcNode; \ + (relFileNodeV2).dbNode = (relFileNode).dbNode; \ + (relFileNodeV2).relNode = (relFileNode).relNode; \ + (relFileNodeV2).bucketNode = (relFileNode).bucketNode; \ + } while (0) + /*This struct used for remove duplicated file list where we scan part of BCM files*/ typedef struct RelFileNodeKey { RelFileNode relfilenode; /*relfilenode*/ diff --git a/src/include/storage/vfd.h b/src/include/storage/vfd.h index 33a0fdab4..1135517da 100644 --- a/src/include/storage/vfd.h +++ b/src/include/storage/vfd.h @@ -17,6 +17,7 @@ #include #include "utils/resowner.h" +#include "storage/page_compression.h" #include "storage/smgr/relfilenode.h" typedef struct vfd { @@ -34,6 +35,8 @@ typedef struct vfd { int fileFlags; /* open(2) flags for (re)opening the file */ int fileMode; /* mode to pass to open(2) */ RelFileNodeForkNum fileNode; /* current logical file node */ + bool with_pcmap; /* is page compression relation */ + PageCompressHeader *pcmap; /* memory map of page compression address file */ } Vfd; #endif /* VFD_H */ diff --git a/src/include/utils/aset.h b/src/include/utils/aset.h index d30083ce1..7fcef0a63 100644 --- a/src/include/utils/aset.h +++ b/src/include/utils/aset.h @@ -221,6 +221,12 @@ public: template static int gs_posix_memalign(void** memptr, Size alignment, Size sz, bool needProtect); + + template + static bool gs_memprot_reserve(Size sz, bool needProtect); + + template + static void gs_memprot_release(Size sz); }; extern int alloc_trunk_size(int width); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 33daf3b66..b574098b1 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -588,6 +588,7 @@ extern Datum pg_read_file(PG_FUNCTION_ARGS); extern Datum pg_read_file_all(PG_FUNCTION_ARGS); extern Datum pg_read_binary_file(PG_FUNCTION_ARGS); extern Datum pg_read_binary_file_all(PG_FUNCTION_ARGS); +extern Datum pg_read_binary_file_blocks(PG_FUNCTION_ARGS); extern Datum pg_ls_dir(PG_FUNCTION_ARGS); extern Datum pg_stat_file_recursive(PG_FUNCTION_ARGS); diff --git a/src/include/utils/partcache.h b/src/include/utils/partcache.h index dcb53adae..a4618cdee 100644 --- a/src/include/utils/partcache.h +++ b/src/include/utils/partcache.h @@ -64,7 +64,7 @@ extern void PartitionCacheInitializePhase3(void); * Routine to create a partcache entry for an about-to-be-created relation */ Partition PartitionBuildLocalPartition(const char *relname, Oid partid, Oid partfilenode, Oid parttablespace, - StorageType storage_type); + StorageType storage_type, Datum reloptions); /* * Routines for backend startup */ diff --git a/src/include/utils/pg_lzcompress.h b/src/include/utils/pg_lzcompress.h index 77fd1e306..431b4ba1b 100644 --- a/src/include/utils/pg_lzcompress.h +++ b/src/include/utils/pg_lzcompress.h @@ -125,6 +125,11 @@ extern const PGLZ_Strategy* const PGLZ_strategy_always; * ---------- */ extern bool pglz_compress(const char* source, int32 slen, PGLZ_Header* dest, const PGLZ_Strategy* strategy); + extern void pglz_decompress(const PGLZ_Header* source, char* dest); +extern int32 lz_compress(const char* source, int32 slen, char* dest); + +extern int32 lz_decompress(const char* source, int32 slen, char* dest, int32 rawsize, bool check_complete); + #endif /* 
_PG_LZCOMPRESS_H_ */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index a6ab17dc8..f09efcaf8 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -94,6 +94,16 @@ typedef struct RelationBucketKey Oid *bucketKeyType; /*the data type of partition key*/ }RelationBucketKey; +/* page compress related reloptions. */ +typedef struct PageCompressOpts { + int compressType; /* compress algorithm */ + int compressLevel; /* compress level */ + uint32 compressChunkSize; /* chunk size of compressed data */ + uint32 compressPreallocChunks; /* prealloced chunks to store compressed data */ + bool compressByteConvert; /* byte row-coll-convert */ + bool compressDiffConvert; /* make difference convert */ +} PageCompressOpts; + /* describe commit sequence number of object in pg_object */ typedef struct ObjectCSN { @@ -305,6 +315,12 @@ typedef enum RedisRelAction { REDIS_REL_RESET_CTID } RedisHtlAction; +/* PageCompressOpts->compressType values */ +typedef enum CompressTypeOption { + COMPRESS_TYPE_NONE = 0, COMPRESS_TYPE_PGLZ = 1, COMPRESS_TYPE_ZSTD = 2 +} CompressTypeOption; + + typedef struct StdRdOptions { int32 vl_len_; /* varlena header (do not touch directly!) */ int fillfactor; /* page fill factor in percent (0..100) */ @@ -370,6 +386,7 @@ typedef struct StdRdOptions { char* encrypt_algo; bool enable_tde; /* switch flag for table-level TDE encryption */ bool on_commit_delete_rows; /* global temp table */ + PageCompressOpts compress; /* page compress related reloptions. */ } StdRdOptions; #define HEAP_MIN_FILLFACTOR 10 diff --git a/src/include/utils/rel_gs.h b/src/include/utils/rel_gs.h index 52f5b1283..c8d5e82f5 100644 --- a/src/include/utils/rel_gs.h +++ b/src/include/utils/rel_gs.h @@ -636,6 +636,11 @@ extern void PartitionDecrementReferenceCount(Partition part); ((PARTTYPE_VALUE_PARTITIONED_RELATION == (relation)->rd_rel->parttype) && \ (RELKIND_RELATION == (relation)->rd_rel->relkind)) +#define HEAP_IS_PARTITIONED(relation) \ + ((PARTTYPE_PARTITIONED_RELATION == (relation)->rd_rel->parttype || \ + PARTTYPE_VALUE_PARTITIONED_RELATION == (relation)->rd_rel->parttype) && \ + (RELKIND_RELATION == (relation)->rd_rel->relkind || RELKIND_INDEX == (relation)->rd_rel->relkind)) + /* * type bucketOid bucketKey meaning * N INV INV relation has no bucket diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index cb2491ac4..8ef0d4a62 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -107,7 +107,8 @@ extern void RelationCacheInitializePhase3(void); */ extern Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, TupleDesc tupDesc, Oid relid, Oid relfilenode, Oid reltablespace, bool shared_relation, bool mapped_relation, char relpersistence, char relkind, - int8 row_compress, TableAmType tam_type, int8 relindexsplit = 0, StorageType storage_type = HEAP_DISK); + int8 row_compress, Datum reloptions, TableAmType tam_type, int8 relindexsplit = 0, StorageType storage_type = HEAP_DISK, + Oid accessMethodObjectId = 0); /* * Routine to manage assignment of new relfilenode to a relation diff --git a/src/test/regress/expected/hw_package.out b/src/test/regress/expected/hw_package.out index e7eaf8bff..38e642470 100644 --- a/src/test/regress/expected/hw_package.out +++ b/src/test/regress/expected/hw_package.out @@ -8,8 +8,6 @@ drop table if exists test_trigger_src_tbl; NOTICE: table "test_trigger_src_tbl" does not exist, skipping drop package if exists trigger_test; NOTICE: package trigger_test() does not exist, skipping 
-drop table if exists test1; -NOTICE: table "test1" does not exist, skipping drop table if exists dams_ci.test1; ERROR: schema "dams_ci" does not exist drop table if exists dams_ci.DB_LOG; diff --git a/src/test/regress/expected/row_compression/normal_test.out b/src/test/regress/expected/row_compression/normal_test.out new file mode 100644 index 000000000..7a234bb01 --- /dev/null +++ b/src/test/regress/expected/row_compression/normal_test.out @@ -0,0 +1,183 @@ +create schema normal_test; +CREATE TABLE normal_test.tbl_pc(id int, c1 text) WITH(compresstype=1); +\d+ normal_test.tbl_pc + Table "normal_test.tbl_pc" + Column | Type | Modifiers | Storage | Stats target | Description +--------+---------+-----------+----------+--------------+------------- + id | integer | | plain | | + c1 | text | | extended | | +Has OIDs: no +Options: orientation=row, compresstype=1 + +INSERT INTO normal_test.tbl_pc SELECT id, id::text FROM generate_series(1,1000) id; +select count(*) from normal_test.tbl_pc; + count +------- + 1000 +(1 row) + +select count(*) from normal_test.tbl_pc where id < 100; + count +------- + 99 +(1 row) + +checkpoint; +vacuum normal_test.tbl_pc; +select count(*) from normal_test.tbl_pc; + count +------- + 1000 +(1 row) + +select count(*) from normal_test.tbl_pc where id < 100; + count +------- + 99 +(1 row) + +-- normal index +create index on normal_test.tbl_pc(id) WITH (compresstype=2,compress_chunk_size=1024); +alter index normal_test.tbl_pc_id_idx set (compresstype=1); --failed +ERROR: change compresstype OPTION is not supported +alter index normal_test.tbl_pc_id_idx set (compress_chunk_size=2048); --failed +ERROR: change compress_chunk_size OPTION is not supported +alter index normal_test.tbl_pc_id_idx set (compress_prealloc_chunks=2); --success +alter index normal_test.tbl_pc_id_idx set (compress_level=2); --success +set enable_seqscan = off; +set enable_bitmapscan = off; +select count(*) from normal_test.tbl_pc; + count +------- + 1000 +(1 row) + +CREATE TABLE normal_test.tbl_partition(id int) WITH(compresstype=2,compress_chunk_size=1024) partition by range(id) +( + partition p0 values less than(5000), + partition p1 values less than(10000), + partition p2 values less than(20000), + partition p3 values less than(30000), + partition p4 values less than(40000), + partition p5 values less than(50000), + partition p6 values less than(60000), + partition p7 values less than(70000) +); +insert into normal_test.tbl_partition select generate_series(1,65000); +select count(*) from normal_test.tbl_partition; + count +------- + 65000 +(1 row) + +checkpoint; +vacuum normal_test.tbl_partition; +select count(*) from normal_test.tbl_partition; + count +------- + 65000 +(1 row) + +-- exchange +select relname, reloptions from pg_partition where parentid in (Select relfilenode from pg_class where relname like 'tbl_partition') order by relname; + relname | reloptions +---------------+---------------------------------------------------------------------------- + p0 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p1 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p2 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p3 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p4 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p5 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p6 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p7 | {orientation=row,compresstype=2,compress_chunk_size=1024} + tbl_partition | 
{orientation=row,compresstype=2,compress_chunk_size=1024,wait_clean_gpi=n} +(9 rows) + +create table normal_test.exchange_table(id int) WITH(compresstype=2,compress_chunk_size=1024); +ALTER TABLE normal_test.tbl_partition EXCHANGE PARTITION FOR(2500) WITH TABLE normal_test.exchange_table; +select count(*) from normal_test.tbl_partition; + count +------- + 60001 +(1 row) + +-- spilit +ALTER TABLE normal_test.tbl_partition SPLIT PARTITION p1 AT (7500) INTO (PARTITION p10, PARTITION p11); +select relname, reloptions from pg_partition where parentid in (Select relfilenode from pg_class where relname like 'tbl_partition') order by relname; + relname | reloptions +---------------+---------------------------------------------------------------------------- + p0 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p10 | {orientation=row,compresstype=2,compress_chunk_size=1024,wait_clean_gpi=y} + p11 | {orientation=row,compresstype=2,compress_chunk_size=1024,wait_clean_gpi=y} + p2 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p3 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p4 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p5 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p6 | {orientation=row,compresstype=2,compress_chunk_size=1024} + p7 | {orientation=row,compresstype=2,compress_chunk_size=1024} + tbl_partition | {orientation=row,compresstype=2,compress_chunk_size=1024,wait_clean_gpi=y} +(10 rows) + +create index on normal_test.tbl_partition(id) local WITH (compresstype=2,compress_chunk_size=1024); +\d+ normal_test.tbl_partition + Table "normal_test.tbl_partition" + Column | Type | Modifiers | Storage | Stats target | Description +--------+---------+-----------+---------+--------------+------------- + id | integer | | plain | | +Indexes: + "tbl_partition_id_idx" btree (id) LOCAL(PARTITION p0_id_idx, PARTITION p10_id_idx, PARTITION p11_id_idx, PARTITION p2_id_idx, PARTITION p3_id_idx, PARTITION p4_id_idx, PARTITION p5_id_idx, PARTITION p6_id_idx, PARTITION p7_id_idx) WITH (compresstype=2, compress_chunk_size=1024) TABLESPACE pg_default +--?.* +--?.* +Has OIDs: no +Options: orientation=row, compresstype=2, compress_chunk_size=1024 + +select relname, reloptions from pg_partition where parentid in (Select relfilenode from pg_class where relname like 'tbl_partition_id_idx') order by relname; + relname | reloptions +------------+------------------------------------------- + p0_id_idx | {compresstype=2,compress_chunk_size=1024} + p10_id_idx | {compresstype=2,compress_chunk_size=1024} + p11_id_idx | {compresstype=2,compress_chunk_size=1024} + p2_id_idx | {compresstype=2,compress_chunk_size=1024} + p3_id_idx | {compresstype=2,compress_chunk_size=1024} + p4_id_idx | {compresstype=2,compress_chunk_size=1024} + p5_id_idx | {compresstype=2,compress_chunk_size=1024} + p6_id_idx | {compresstype=2,compress_chunk_size=1024} + p7_id_idx | {compresstype=2,compress_chunk_size=1024} +(9 rows) + +-- unsupport +alter index normal_test.tbl_partition_id_idx set (compresstype=1); +ERROR: change compresstype OPTION is not supported +alter index normal_test.tbl_partition_id_idx set (compress_chunk_size=2048); +ERROR: change compress_chunk_size OPTION is not supported +alter index normal_test.tbl_partition_id_idx set (compress_prealloc_chunks=2); +ERROR: change partition compress_prealloc_chunks OPTION is not supported +-- support +alter table normal_test.tbl_pc set (compress_prealloc_chunks=2); +-- new testcase +set search_path=normal_test; +\d+ +--?.* 
+--?.* +--?.* +--?.* +--?.* +--?.* +(3 rows) + +reset search_path; +CREATE TABLE normal_test.pre_handle(id int) WITH(compresstype=2, compress_chunk_size=512, compress_byte_convert=true, compress_diff_convert=true); +insert into normal_test.pre_handle select generate_series(1,1000); +checkpoint; +select count(*) from normal_test.pre_handle; + count +------- + 1000 +(1 row) + +drop schema normal_test cascade; +NOTICE: drop cascades to 4 other objects +DETAIL: drop cascades to table normal_test.tbl_pc +drop cascades to table normal_test.tbl_partition +drop cascades to table normal_test.exchange_table +drop cascades to table normal_test.pre_handle diff --git a/src/test/regress/expected/row_compression/pg_table_size.out b/src/test/regress/expected/row_compression/pg_table_size.out new file mode 100644 index 000000000..7f1dbf565 --- /dev/null +++ b/src/test/regress/expected/row_compression/pg_table_size.out @@ -0,0 +1,79 @@ +-- row table pg_table_size +create schema table_size_schema; +CREATE TABLE table_size_schema.normal_table(id int); +CREATE TABLE table_size_schema.compressed_table_1024(id int) WITH(compresstype=2, compress_chunk_size=1024); +CREATE TABLE table_size_schema.compressed_table_2048(id int) WITH(compresstype=2, compress_chunk_size=2048); +CREATE TABLE table_size_schema.compressed_table_4096(id int) WITH(compresstype=2, compress_chunk_size=4096); +select pg_table_size('table_size_schema.normal_table'); + pg_table_size +--------------- + 0 +(1 row) + +select pg_table_size('table_size_schema.compressed_table_1024'); + pg_table_size +--------------- + 5592896 +(1 row) + +select pg_table_size('table_size_schema.compressed_table_2048'); + pg_table_size +--------------- + 3196168 +(1 row) + +select pg_table_size('table_size_schema.compressed_table_4096'); + pg_table_size +--------------- + 2097664 +(1 row) + +drop schema table_size_schema cascade; +NOTICE: drop cascades to 4 other objects +DETAIL: drop cascades to table table_size_schema.normal_table +drop cascades to table table_size_schema.compressed_table_1024 +drop cascades to table table_size_schema.compressed_table_2048 +drop cascades to table table_size_schema.compressed_table_4096 +-- partition table pg_table_size +create schema partition_table_size_schema; +create table partition_table_size_schema.normal_partition(INV_DATE_SK integer) +partition by range(inv_date_sk)(partition p0 values less than(5000),partition p1 values less than(10000)); +create table partition_table_size_schema.compressed_partition_1024(INV_DATE_SK integer) +WITH(compresstype=2, compress_chunk_size=1024) +partition by range(inv_date_sk)(partition p0 values less than(5000),partition p1 values less than(10000)); +create table partition_table_size_schema.compressed_partition_2048(INV_DATE_SK integer) +WITH(compresstype=2, compress_chunk_size=2048) +partition by range(inv_date_sk)(partition p0 values less than(5000),partition p1 values less than(10000)); +create table partition_table_size_schema.compressed_partition_4096(INV_DATE_SK integer) +WITH(compresstype=2, compress_chunk_size=4096) +partition by range(inv_date_sk)(partition p0 values less than(5000),partition p1 values less than(10000)); +select pg_table_size('partition_table_size_schema.normal_partition'); + pg_table_size +--------------- + 0 +(1 row) + +select pg_table_size('partition_table_size_schema.compressed_partition_1024'); + pg_table_size +--------------- + 11185792 +(1 row) + +select pg_table_size('partition_table_size_schema.compressed_partition_2048'); + pg_table_size +--------------- + 
6392336 +(1 row) + +select pg_table_size('partition_table_size_schema.compressed_partition_4096'); + pg_table_size +--------------- + 4195328 +(1 row) + +drop schema partition_table_size_schema cascade; +NOTICE: drop cascades to 4 other objects +DETAIL: drop cascades to table partition_table_size_schema.normal_partition +drop cascades to table partition_table_size_schema.compressed_partition_1024 +drop cascades to table partition_table_size_schema.compressed_partition_2048 +drop cascades to table partition_table_size_schema.compressed_partition_4096 diff --git a/src/test/regress/expected/row_compression/pg_tablespace_size.out b/src/test/regress/expected/row_compression/pg_tablespace_size.out new file mode 100644 index 000000000..1ea9ba635 --- /dev/null +++ b/src/test/regress/expected/row_compression/pg_tablespace_size.out @@ -0,0 +1,32 @@ +CREATE TABLESPACE normal_tablespace RELATIVE LOCATION 'normal_tablespace'; +SELECT pg_tablespace_size('normal_tablespace'); + pg_tablespace_size +-------------------- + 4096 +(1 row) + +CREATE TABLE normal_table(id int) TABLESPACE normal_tablespace; +SELECT pg_tablespace_size('normal_tablespace'); + pg_tablespace_size +-------------------- + 8192 +(1 row) + +CREATE TABLESPACE compress_tablespace RELATIVE LOCATION 'compress_tablespace'; +SELECT pg_tablespace_size('compress_tablespace'); + pg_tablespace_size +-------------------- + 4096 +(1 row) + +CREATE TABLE compressed_table_1024(id int) WITH(compresstype=2, compress_chunk_size=1024) TABLESPACE compress_tablespace; +SELECT pg_tablespace_size('compress_tablespace'); + pg_tablespace_size +-------------------- + 5601088 +(1 row) + +DROP TABLE normal_table; +DROP TABLESPACE normal_tablespace; +DROP TABLE compressed_table_1024; +DROP TABLESPACE compress_tablespace; diff --git a/src/test/regress/expected/row_compression/unsupported_feature.out b/src/test/regress/expected/row_compression/unsupported_feature.out new file mode 100644 index 000000000..88d824bda --- /dev/null +++ b/src/test/regress/expected/row_compression/unsupported_feature.out @@ -0,0 +1,66 @@ +create schema unspported_feature; +-- unspport compressType: 3 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=3, compress_chunk_size=1024); +ERROR: value 3 out of bounds for option "compresstype" +DETAIL: Valid values are between "0" and "2". +-- unspport compress_chunk_size: 2000 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=2, compress_chunk_size=2000); +ERROR: invalid compress_chunk_size 2000 , must be one of 512, 1024, 2048 or 4096 for compressed_table_1024 +-- unspport compress_prealloc_chunks: -1 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=2, compress_prealloc_chunks=-1); +ERROR: value -1 out of bounds for option "compress_prealloc_chunks" +DETAIL: Valid values are between "0" and "7". +-- unspport compress_prealloc_chunks: 8 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=2, compress_prealloc_chunks=8); +ERROR: value 8 out of bounds for option "compress_prealloc_chunks" +DETAIL: Valid values are between "0" and "7". +-- unspport compress_level: 128 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=2, compress_level=128); +ERROR: value 128 out of bounds for option "compress_level" +DETAIL: Valid values are between "-31" and "31". 
+-- compresstype cannot be used with column table +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(ORIENTATION = 'column', compresstype=2); +ERROR: only row orientation table support compresstype/compress_chunk_size/compress_prealloc_chunks/compress_level. +-- compresstype cannot be used with temp table +CREATE TEMP TABLE compressed_temp_table_1024(id int) WITH(compresstype=2); +ERROR: only row orientation table support compresstype/compress_chunk_size/compress_prealloc_chunks/compress_level. +-- compresstype cannot be used with unlogged table +CREATE unlogged TABLE compressed_unlogged_table_1024(id int) WITH(compresstype=2); +ERROR: only row orientation table support compresstype/compress_chunk_size/compress_prealloc_chunks/compress_level. +-- use compress_prealloc_chunks/compress_chunk_size/compress_level without compresstype +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compress_prealloc_chunks=5); +ERROR: compress_chunk_size/compress_prealloc_chunks/compress_level should be used with compresstype. +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compress_chunk_size=1024); +ERROR: compress_chunk_size/compress_prealloc_chunks/compress_level should be used with compresstype. +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compress_level=5); +ERROR: compress_chunk_size/compress_prealloc_chunks/compress_level should be used with compresstype. +-- unsupported exchange +CREATE TABLE unspported_feature.exchange_table(id int) WITH(compresstype=2); +CREATE TABLE unspported_feature.alter_table(id int) partition by range(id) +( + partition p0 values less than(5000), + partition p1 values less than(10000), + partition p2 values less than(20000), + partition p3 values less than(30000), + partition p4 values less than(40000), + partition p5 values less than(50000), + partition p6 values less than(60000), + partition p7 values less than(70000) +); +ALTER TABLE unspported_feature.alter_table EXCHANGE PARTITION FOR(2500) WITH TABLE unspported_feature.exchange_table; +ERROR: tables in ALTER TABLE EXCHANGE PARTITION must have the same type of compress +-- unsupported alter compress_chunk_size +create TABLE unspported_feature.alter_table_option(id int) WITH(compresstype=2); +\d+ unspported_feature.alter_table_option + Table "unspported_feature.alter_table_option" + Column | Type | Modifiers | Storage | Stats target | Description +--------+---------+-----------+---------+--------------+------------- + id | integer | | plain | | +Has OIDs: no +Options: orientation=row, compresstype=2 + +ALTER TABLE unspported_feature.alter_table_option SET(compresstype=0); -- fail +ERROR: change compresstype OPTION is not supported +ALTER TABLE unspported_feature.alter_table_option SET(compress_chunk_size=2048); -- fail +ERROR: change compress_chunk_size OPTION is not supported +ALTER TABLE unspported_feature.alter_table_option SET(compress_level=2, compress_prealloc_chunks=0); diff --git a/src/test/regress/expected/rule_test.out b/src/test/regress/expected/rule_test.out index 109ab0f2a..686c608e4 100644 --- a/src/test/regress/expected/rule_test.out +++ b/src/test/regress/expected/rule_test.out @@ -1,347 +1,347 @@ --- --- RULES TEST --- --- --- Tables and rules for the view test --- -create table test1 (a int4, b int4); -create view tv1 as select * from test1; -create rule tv1_ins as on insert to tv1 do instead - insert into test1 values (new.a, new.b); -create rule tv1_upd as on update to tv1 do instead - update test1 set a = new.a, b = new.b - where a = 
old.a; -create rule tv1_del as on delete to tv1 do instead - delete from test1 where a = old.a; --- insert values -insert into tv1 values (1, 11); -insert into tv1 values (2, 12); -select * from tv1; - a | b ----+---- - 1 | 11 - 2 | 12 -(2 rows) - --- update values -update tv1 set a = 10 where b = 11; -update tv1 set a = 12 , b = 22 where b = 12; -select * from tv1; - a | b -----+---- - 10 | 11 - 12 | 22 -(2 rows) - --- delete values -delete from tv1 where a = 10; -select * from tv1; - a | b -----+---- - 12 | 22 -(1 row) - -drop rule if exists tv1_ins on tv1; -drop rule if exists tv1_upd on tv1; -drop rule if exists tv1_del on tv1; -drop view if exists tv1; -drop table if exists test1; --- --- Tables and rules for the constraint update/delete/insert test --- -create table ttsystem (sysname text, sysdesc text); -create table ttadmin (pname text, sysname text); -create table ttperon (pname text, pdesc text); -create table ttinterface (sysname text, ifname text); -create rule usys_ins as on insert to ttsystem do also ( - insert into ttinterface values (new.sysname,''); - insert into ttadmin values ('',new.sysname); - ); -create rule usys_del as on delete to ttsystem do also ( - delete from ttinterface where sysname = old.sysname; - delete from ttadmin where sysname = old.sysname; - ); -create rule usys_upd as on update to ttsystem do also ( - update ttinterface set sysname = new.sysname - where sysname = old.sysname; - update ttadmin set sysname = new.sysname - where sysname = old.sysname - ); -create rule upers_ins as on insert to ttperon do also ( - insert into ttadmin values (new.pname,''); - ); -create rule upers_del as on delete to ttperon do also - delete from ttadmin where pname = old.pname; -create rule upers_upd as on update to ttperon do also - update ttadmin set pname = new.pname where pname = old.pname; - --- test 1 -insert into ttsystem values ('winxi', 'Linux Jan Wieck'); -insert into ttsystem values ('notjw', 'Qu Yan'); -insert into ttsystem values ('yuyan', 'Fileserver'); -insert into ttinterface values ('winxi', 'dola'); -insert into ttinterface values ('winxi', 'eth1'); -insert into ttinterface values ('notjw', 'dola'); -insert into ttinterface values ('yuyan', 'dola'); -insert into ttperon values ('jw', 'Jan Wieck'); -insert into ttperon values ('bm', 'Bruce Momjian'); -insert into ttadmin values ('jw', 'winxi'); -insert into ttadmin values ('jw', 'notjw'); -insert into ttadmin values ('bm', 'yuyan'); -select * from ttsystem; - sysname | sysdesc ----------+----------------- - winxi | Linux Jan Wieck - notjw | Qu Yan - yuyan | Fileserver -(3 rows) - -select * from ttinterface; - sysname | ifname ----------+-------- - winxi | - notjw | - yuyan | - winxi | dola - winxi | eth1 - notjw | dola - yuyan | dola -(7 rows) - -select * from ttperon; - pname | pdesc --------+--------------- - jw | Jan Wieck - bm | Bruce Momjian -(2 rows) - -select * from ttadmin; - pname | sysname --------+--------- - | winxi - | notjw - | yuyan - jw | - bm | - jw | winxi - jw | notjw - bm | yuyan -(8 rows) - --- test 2 -update ttsystem set sysname = 'pluto' where sysname = 'yuyan'; -select * from ttinterface; - sysname | ifname ----------+-------- - winxi | - notjw | - winxi | dola - winxi | eth1 - notjw | dola - pluto | - pluto | dola -(7 rows) - -select * from ttadmin; - pname | sysname --------+--------- - | winxi - | notjw - jw | - bm | - jw | winxi - jw | notjw - | pluto - bm | pluto -(8 rows) - -update ttperon set pname = 'jwieck' where pdesc = 'Jan Wieck'; -select * from ttadmin order by pname, 
sysname; - pname | sysname ---------+--------- - bm | pluto - bm | - jwieck | notjw - jwieck | winxi - jwieck | - | notjw - | pluto - | winxi -(8 rows) - -delete from ttsystem where sysname = 'winxi'; -select * from ttinterface; - sysname | ifname ----------+-------- - notjw | - notjw | dola - pluto | - pluto | dola -(4 rows) - -select * from ttadmin; - pname | sysname ---------+--------- - | notjw - bm | - | pluto - bm | pluto - jwieck | - jwieck | notjw -(6 rows) - -delete from ttperon where pname = 'bm'; -select * from ttadmin; - pname | sysname ---------+--------- - | notjw - | pluto - jwieck | - jwieck | notjw -(4 rows) - -drop rule if exists usys_upd on ttsystem; -drop rule if exists usys_del on ttsystem; -drop rule if exists usys_ins on ttsystem; -drop rule if exists upers_upd on ttperon; -drop rule if exists upers_del on ttperon; -drop rule if exists upers_ins on ttperon; -drop table if exists ttsystem; -drop table if exists ttinterface; -drop table if exists ttperon; -drop table if exists ttadmin; --- --- Tables and rules for the logging test --- -create table temp (ename char(20), salary money); -create table templog (ename char(20), action char(10), newsal money, oldsal money); -create rule temp_ins as on insert to temp do - insert into templog values (new.ename, 'hired', new.salary, '0.00'); -create rule temp_upd as on update to temp where new.salary != old.salary do - insert into templog values (new.ename, 'honored', new.salary, old.salary); -create rule temp_del as on delete to temp do - insert into templog values (old.ename, 'fired', '0.00', old.salary); -insert into temp values ('tyu', '45.00'); -insert into temp values ('asd', '90.00'); -select * from templog; - ename | action | newsal | oldsal -----------------------+------------+--------+-------- - tyu | hired | $45.00 | $0.00 - asd | hired | $90.00 | $0.00 -(2 rows) - -update temp set salary = salary * 2 where ename = 'tyu'; -select * from templog; - ename | action | newsal | oldsal -----------------------+------------+--------+-------- - tyu | hired | $45.00 | $0.00 - asd | hired | $90.00 | $0.00 - tyu | honored | $90.00 | $45.00 -(3 rows) - -delete from temp where ename = 'tyu'; -select * from templog; - ename | action | newsal | oldsal -----------------------+------------+--------+-------- - tyu | hired | $45.00 | $0.00 - asd | hired | $90.00 | $0.00 - tyu | honored | $90.00 | $45.00 - tyu | fired | $0.00 | $90.00 -(4 rows) - -select * from temp; - ename | salary -----------------------+-------- - asd | $90.00 -(1 row) - -drop rule if exists temp_ins on temp; -drop rule if exists temp_upd on temp; -drop rule if exists temp_del on temp; -drop table if exists temp; -drop table if exists templog; --- --- Rules for condition --- rule test --- -create table test4 (a int4, b text); -create table test5 (a int4, b text); -create table test6 (a int4, b text); -create rule test4_ins1 as on insert to test4 - where new.a >= 10 and new.a < 20 do instead - insert into test5 values (new.a, new.b); -create rule test4_ins2 as on insert to test4 - where new.a >= 20 and new.a < 30 do - insert into test6 values (new.a, new.b); --- test -insert into test4 values (5, 'huijioa'); -insert into test4 values (15, 'afhuvbn'); -insert into test4 values (25, 'qwerty'); -insert into test4 values (35, 'zxcvbn'); -select * from test4; - a | b -----+--------- - 5 | huijioa - 25 | qwerty - 35 | zxcvbn -(3 rows) - -select * from test5; - a | b -----+--------- - 15 | afhuvbn -(1 row) - -select * from test6; - a | b -----+-------- - 25 | qwerty -(1 row) 
- -drop rule if exists test4_ins1 on test4; -drop rule if exists test4_ins2 on test4; -drop table if exists test4; -drop table if exists test5; -drop table if exists test6; --- --- Tables and rules for select --- -create table ttt1 (a int4, b text); -create table ttt2 (a int4, b text); -create rule "_RETURN" as on select to ttt1 do instead ( - select * from ttt2; - ); --- test -insert into ttt1 values (1, 'hello'); -insert into ttt2 values (10, 'world'); -select * from ttt1; - a | b -----+------- - 10 | world -(1 row) - -drop table if exists ttt1; -drop table if exists ttt2; --- --- Tables and rules for question --- -create table test_statement(id int); -create table escapetest (ts varchar(50)); -create rule r1 as on insert to escapetest do ( - delete from test_statement; - insert into test_statement values (1); - insert into test_statement values (2); - ); - --- test -insert into escapetest(ts) values (NULL); -select * from test_statement; - id ----- - 1 - 2 -(2 rows) - -drop rule if exists r1 on escapetest; -drop table if exists test_statement; -drop table if exists escapetest; +-- +-- RULES TEST +-- +-- +-- Tables and rules for the view test +-- +create table rule_test1_table (a int4, b int4); +create view tv1 as select * from rule_test1_table; +create rule tv1_ins as on insert to tv1 do instead + insert into rule_test1_table values (new.a, new.b); +create rule tv1_upd as on update to tv1 do instead + update rule_test1_table set a = new.a, b = new.b + where a = old.a; +create rule tv1_del as on delete to tv1 do instead + delete from rule_test1_table where a = old.a; +-- insert values +insert into tv1 values (1, 11); +insert into tv1 values (2, 12); +select * from tv1; + a | b +---+---- + 1 | 11 + 2 | 12 +(2 rows) + +-- update values +update tv1 set a = 10 where b = 11; +update tv1 set a = 12 , b = 22 where b = 12; +select * from tv1; + a | b +----+---- + 10 | 11 + 12 | 22 +(2 rows) + +-- delete values +delete from tv1 where a = 10; +select * from tv1; + a | b +----+---- + 12 | 22 +(1 row) + +drop rule if exists tv1_ins on tv1; +drop rule if exists tv1_upd on tv1; +drop rule if exists tv1_del on tv1; +drop view if exists tv1; +drop table if exists rule_test1_table; +-- +-- Tables and rules for the constraint update/delete/insert test +-- +create table ttsystem (sysname text, sysdesc text); +create table ttadmin (pname text, sysname text); +create table ttperon (pname text, pdesc text); +create table ttinterface (sysname text, ifname text); +create rule usys_ins as on insert to ttsystem do also ( + insert into ttinterface values (new.sysname,''); + insert into ttadmin values ('',new.sysname); + ); +create rule usys_del as on delete to ttsystem do also ( + delete from ttinterface where sysname = old.sysname; + delete from ttadmin where sysname = old.sysname; + ); +create rule usys_upd as on update to ttsystem do also ( + update ttinterface set sysname = new.sysname + where sysname = old.sysname; + update ttadmin set sysname = new.sysname + where sysname = old.sysname + ); +create rule upers_ins as on insert to ttperon do also ( + insert into ttadmin values (new.pname,''); + ); +create rule upers_del as on delete to ttperon do also + delete from ttadmin where pname = old.pname; +create rule upers_upd as on update to ttperon do also + update ttadmin set pname = new.pname where pname = old.pname; + +-- test 1 +insert into ttsystem values ('winxi', 'Linux Jan Wieck'); +insert into ttsystem values ('notjw', 'Qu Yan'); +insert into ttsystem values ('yuyan', 'Fileserver'); +insert into 
ttinterface values ('winxi', 'dola'); +insert into ttinterface values ('winxi', 'eth1'); +insert into ttinterface values ('notjw', 'dola'); +insert into ttinterface values ('yuyan', 'dola'); +insert into ttperon values ('jw', 'Jan Wieck'); +insert into ttperon values ('bm', 'Bruce Momjian'); +insert into ttadmin values ('jw', 'winxi'); +insert into ttadmin values ('jw', 'notjw'); +insert into ttadmin values ('bm', 'yuyan'); +select * from ttsystem; + sysname | sysdesc +---------+----------------- + winxi | Linux Jan Wieck + notjw | Qu Yan + yuyan | Fileserver +(3 rows) + +select * from ttinterface; + sysname | ifname +---------+-------- + winxi | + notjw | + yuyan | + winxi | dola + winxi | eth1 + notjw | dola + yuyan | dola +(7 rows) + +select * from ttperon; + pname | pdesc +-------+--------------- + jw | Jan Wieck + bm | Bruce Momjian +(2 rows) + +select * from ttadmin; + pname | sysname +-------+--------- + | winxi + | notjw + | yuyan + jw | + bm | + jw | winxi + jw | notjw + bm | yuyan +(8 rows) + +-- test 2 +update ttsystem set sysname = 'pluto' where sysname = 'yuyan'; +select * from ttinterface; + sysname | ifname +---------+-------- + winxi | + notjw | + winxi | dola + winxi | eth1 + notjw | dola + pluto | + pluto | dola +(7 rows) + +select * from ttadmin; + pname | sysname +-------+--------- + | winxi + | notjw + jw | + bm | + jw | winxi + jw | notjw + | pluto + bm | pluto +(8 rows) + +update ttperon set pname = 'jwieck' where pdesc = 'Jan Wieck'; +select * from ttadmin order by pname, sysname; + pname | sysname +--------+--------- + bm | pluto + bm | + jwieck | notjw + jwieck | winxi + jwieck | + | notjw + | pluto + | winxi +(8 rows) + +delete from ttsystem where sysname = 'winxi'; +select * from ttinterface; + sysname | ifname +---------+-------- + notjw | + notjw | dola + pluto | + pluto | dola +(4 rows) + +select * from ttadmin; + pname | sysname +--------+--------- + | notjw + bm | + | pluto + bm | pluto + jwieck | + jwieck | notjw +(6 rows) + +delete from ttperon where pname = 'bm'; +select * from ttadmin; + pname | sysname +--------+--------- + | notjw + | pluto + jwieck | + jwieck | notjw +(4 rows) + +drop rule if exists usys_upd on ttsystem; +drop rule if exists usys_del on ttsystem; +drop rule if exists usys_ins on ttsystem; +drop rule if exists upers_upd on ttperon; +drop rule if exists upers_del on ttperon; +drop rule if exists upers_ins on ttperon; +drop table if exists ttsystem; +drop table if exists ttinterface; +drop table if exists ttperon; +drop table if exists ttadmin; +-- +-- Tables and rules for the logging test +-- +create table temp (ename char(20), salary money); +create table templog (ename char(20), action char(10), newsal money, oldsal money); +create rule temp_ins as on insert to temp do + insert into templog values (new.ename, 'hired', new.salary, '0.00'); +create rule temp_upd as on update to temp where new.salary != old.salary do + insert into templog values (new.ename, 'honored', new.salary, old.salary); +create rule temp_del as on delete to temp do + insert into templog values (old.ename, 'fired', '0.00', old.salary); +insert into temp values ('tyu', '45.00'); +insert into temp values ('asd', '90.00'); +select * from templog; + ename | action | newsal | oldsal +----------------------+------------+--------+-------- + tyu | hired | $45.00 | $0.00 + asd | hired | $90.00 | $0.00 +(2 rows) + +update temp set salary = salary * 2 where ename = 'tyu'; +select * from templog; + ename | action | newsal | oldsal 
+----------------------+------------+--------+-------- + tyu | hired | $45.00 | $0.00 + asd | hired | $90.00 | $0.00 + tyu | honored | $90.00 | $45.00 +(3 rows) + +delete from temp where ename = 'tyu'; +select * from templog; + ename | action | newsal | oldsal +----------------------+------------+--------+-------- + tyu | hired | $45.00 | $0.00 + asd | hired | $90.00 | $0.00 + tyu | honored | $90.00 | $45.00 + tyu | fired | $0.00 | $90.00 +(4 rows) + +select * from temp; + ename | salary +----------------------+-------- + asd | $90.00 +(1 row) + +drop rule if exists temp_ins on temp; +drop rule if exists temp_upd on temp; +drop rule if exists temp_del on temp; +drop table if exists temp; +drop table if exists templog; +-- +-- Rules for condition +-- rule test +-- +create table test4 (a int4, b text); +create table test5 (a int4, b text); +create table test6 (a int4, b text); +create rule test4_ins1 as on insert to test4 + where new.a >= 10 and new.a < 20 do instead + insert into test5 values (new.a, new.b); +create rule test4_ins2 as on insert to test4 + where new.a >= 20 and new.a < 30 do + insert into test6 values (new.a, new.b); +-- test +insert into test4 values (5, 'huijioa'); +insert into test4 values (15, 'afhuvbn'); +insert into test4 values (25, 'qwerty'); +insert into test4 values (35, 'zxcvbn'); +select * from test4; + a | b +----+--------- + 5 | huijioa + 25 | qwerty + 35 | zxcvbn +(3 rows) + +select * from test5; + a | b +----+--------- + 15 | afhuvbn +(1 row) + +select * from test6; + a | b +----+-------- + 25 | qwerty +(1 row) + +drop rule if exists test4_ins1 on test4; +drop rule if exists test4_ins2 on test4; +drop table if exists test4; +drop table if exists test5; +drop table if exists test6; +-- +-- Tables and rules for select +-- +create table ttt1 (a int4, b text); +create table ttt2 (a int4, b text); +create rule "_RETURN" as on select to ttt1 do instead ( + select * from ttt2; + ); +-- test +insert into ttt1 values (1, 'hello'); +insert into ttt2 values (10, 'world'); +select * from ttt1; + a | b +----+------- + 10 | world +(1 row) + +drop table if exists ttt1; +drop table if exists ttt2; +-- +-- Tables and rules for question +-- +create table test_statement(id int); +create table escapetest (ts varchar(50)); +create rule r1 as on insert to escapetest do ( + delete from test_statement; + insert into test_statement values (1); + insert into test_statement values (2); + ); + +-- test +insert into escapetest(ts) values (NULL); +select * from test_statement; + id +---- + 1 + 2 +(2 rows) + +drop rule if exists r1 on escapetest; +drop table if exists test_statement; +drop table if exists escapetest; diff --git a/src/test/regress/expected/single_node_opr_sanity.out b/src/test/regress/expected/single_node_opr_sanity.out index 21042f6e9..22a91e035 100755 --- a/src/test/regress/expected/single_node_opr_sanity.out +++ b/src/test/regress/expected/single_node_opr_sanity.out @@ -2743,6 +2743,7 @@ WHERE d.classoid IS NULL AND p1.oid <= 9999 order by 1; 4764 | ubtoptions 4765 | ubtcostestimate 4767 | gs_read_block_from_remote + 4768 | gs_read_block_from_remote 4789 | remote_rto_stat 4800 | job_cancel 4801 | job_finish @@ -2939,6 +2940,7 @@ WHERE d.classoid IS NULL AND p1.oid <= 9999 order by 1; 7998 | set_working_grand_version_num_manually 8001 | get_paxos_replication_info 8050 | datalength + 8413 | pg_read_binary_file_blocks 8642 | gs_txid_oldestxmin 9004 | smalldatetime_in 9006 | smalldatetime_out diff --git 
a/src/test/regress/input/row_compression/row_compression_basebackup.source b/src/test/regress/input/row_compression/row_compression_basebackup.source new file mode 100644 index 000000000..d7f2d0689 --- /dev/null +++ b/src/test/regress/input/row_compression/row_compression_basebackup.source @@ -0,0 +1,6 @@ +\! @abs_bindir@/gsql -dpostgres -p @portstring@ -c "create database gs_basebackup;" +\! @abs_bindir@/gsql -dgs_basebackup -p @portstring@ -f "@abs_srcdir@/sql/gs_basebackup/init/compress_data.sql"; +\! mkdir @abs_bindir@/../gs_basebackup_node_nstream_np +\! chmod 700 @abs_bindir@/../gs_basebackup_node_nstream_np +\! chmod +x @abs_srcdir@/script/gs_basebackup/gs_basebackup.sh +\! @abs_srcdir@/script/gs_basebackup/gs_basebackup.sh @abs_bindir@ @abs_srcdir@ @portstring@ gs_basebackup_node_nstream_np compress_data.sql diff --git a/src/test/regress/output/row_compression/row_compression_basebackup.source b/src/test/regress/output/row_compression/row_compression_basebackup.source new file mode 100644 index 000000000..ee90bb59b --- /dev/null +++ b/src/test/regress/output/row_compression/row_compression_basebackup.source @@ -0,0 +1,28 @@ +--?.* +CREATE DATABASE +--?.* +CREATE TABLE +CREATE INDEX +INSERT 0 1000 +CHECKPOINT +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* +--?.* + count +------- + 1000 +(1 row) + +SET + count +------- + 1000 +(1 row) + +--?.* +SHUTDOWN diff --git a/src/test/regress/parallel_schedule0 b/src/test/regress/parallel_schedule0 index 2a626eb49..3ab6ac486 100644 --- a/src/test/regress/parallel_schedule0 +++ b/src/test/regress/parallel_schedule0 @@ -837,4 +837,5 @@ test: hw_cipher_aes128 test: sequence_cache_test test: pg_buffercache_pages -test: test_astore_multixact \ No newline at end of file +test: test_astore_multixact +test: row_compression/pg_table_size row_compression/pg_tablespace_size row_compression/unsupported_feature row_compression/normal_test \ No newline at end of file diff --git a/src/test/regress/script/gs_basebackup/gs_basebackup.sh b/src/test/regress/script/gs_basebackup/gs_basebackup.sh index 13a0c7b4d..41e2df9e3 100644 --- a/src/test/regress/script/gs_basebackup/gs_basebackup.sh +++ b/src/test/regress/script/gs_basebackup/gs_basebackup.sh @@ -2,8 +2,9 @@ abs_bindir=$1 abs_srcdir=$2 abs_port=$3 dataNode=$4 -x_option=${5-} -format=${6-} +validate_sql=$5 +x_option=${6-} +format=${7-} # backup if [ 'x'${x_option} == 'x' ] then @@ -54,9 +55,8 @@ sleep 10s $abs_bindir/gs_ctl status -D $abs_bindir/../$dataNode #validate -$abs_bindir/gsql -dgs_basebackup -p$gs_basebackup_port -f "$abs_srcdir/sql/gs_basebackup/validate/tablespace.sql"; -$abs_bindir/gsql -dgs_basebackup -p$gs_basebackup_port -f "$abs_srcdir/sql/gs_basebackup/validate/mot.sql"; +$abs_bindir/gsql -dgs_basebackup -p$gs_basebackup_port -f "$abs_srcdir/sql/gs_basebackup/validate/$validate_sql"; #stop node -$abs_bindir/gsql -dgs_basebackup -p$gs_basebackup_port -c 'SHUTDOWN IMMEDIATE' \ No newline at end of file +$abs_bindir/gsql -dgs_basebackup -p$gs_basebackup_port -c 'SHUTDOWN IMMEDIATE' diff --git a/src/test/regress/sql/gs_basebackup/init/compress_data.sql b/src/test/regress/sql/gs_basebackup/init/compress_data.sql new file mode 100644 index 000000000..3f6589eba --- /dev/null +++ b/src/test/regress/sql/gs_basebackup/init/compress_data.sql @@ -0,0 +1,4 @@ +CREATE TABLE tbl_pc(id int, c1 text) WITH(compresstype=2, compress_chunk_size=512); +create index on tbl_pc(id) WITH (compresstype=2,compress_chunk_size=1024); +INSERT INTO tbl_pc SELECT id, id::text FROM generate_series(1,1000) id; +checkpoint; 
diff --git a/src/test/regress/sql/gs_basebackup/validate/compress_data.sql b/src/test/regress/sql/gs_basebackup/validate/compress_data.sql new file mode 100644 index 000000000..3dfe9780f --- /dev/null +++ b/src/test/regress/sql/gs_basebackup/validate/compress_data.sql @@ -0,0 +1,3 @@ +select count(*) from tbl_pc; +set enable_seqscan=off; +select count(*) from tbl_pc; diff --git a/src/test/regress/sql/hw_package.sql b/src/test/regress/sql/hw_package.sql index e1d11a1c1..bbb8be90a 100644 --- a/src/test/regress/sql/hw_package.sql +++ b/src/test/regress/sql/hw_package.sql @@ -3,7 +3,6 @@ drop trigger if exists insert_trigger on test_trigger_src_tbl; drop table if exists test_trigger_des_tbl; drop table if exists test_trigger_src_tbl; drop package if exists trigger_test; -drop table if exists test1; drop table if exists dams_ci.test1; drop table if exists dams_ci.DB_LOG; drop table if exists au_pkg; diff --git a/src/test/regress/sql/row_compression/normal_test.sql b/src/test/regress/sql/row_compression/normal_test.sql new file mode 100644 index 000000000..20f769e0f --- /dev/null +++ b/src/test/regress/sql/row_compression/normal_test.sql @@ -0,0 +1,69 @@ +create schema normal_test; +CREATE TABLE normal_test.tbl_pc(id int, c1 text) WITH(compresstype=1); +\d+ normal_test.tbl_pc +INSERT INTO normal_test.tbl_pc SELECT id, id::text FROM generate_series(1,1000) id; +select count(*) from normal_test.tbl_pc; +select count(*) from normal_test.tbl_pc where id < 100; +checkpoint; +vacuum normal_test.tbl_pc; +select count(*) from normal_test.tbl_pc; +select count(*) from normal_test.tbl_pc where id < 100; + +-- normal index +create index on normal_test.tbl_pc(id) WITH (compresstype=2,compress_chunk_size=1024); +alter index normal_test.tbl_pc_id_idx set (compresstype=1); --failed +alter index normal_test.tbl_pc_id_idx set (compress_chunk_size=2048); --failed +alter index normal_test.tbl_pc_id_idx set (compress_prealloc_chunks=2); --success +alter index normal_test.tbl_pc_id_idx set (compress_level=2); --success + +set enable_seqscan = off; +set enable_bitmapscan = off; +select count(*) from normal_test.tbl_pc; +CREATE TABLE normal_test.tbl_partition(id int) WITH(compresstype=2,compress_chunk_size=1024) partition by range(id) +( + partition p0 values less than(5000), + partition p1 values less than(10000), + partition p2 values less than(20000), + partition p3 values less than(30000), + partition p4 values less than(40000), + partition p5 values less than(50000), + partition p6 values less than(60000), + partition p7 values less than(70000) +); +insert into normal_test.tbl_partition select generate_series(1,65000); +select count(*) from normal_test.tbl_partition; +checkpoint; +vacuum normal_test.tbl_partition; +select count(*) from normal_test.tbl_partition; + +-- exchange +select relname, reloptions from pg_partition where parentid in (Select relfilenode from pg_class where relname like 'tbl_partition') order by relname; +create table normal_test.exchange_table(id int) WITH(compresstype=2,compress_chunk_size=1024); +ALTER TABLE normal_test.tbl_partition EXCHANGE PARTITION FOR(2500) WITH TABLE normal_test.exchange_table; +select count(*) from normal_test.tbl_partition; + +-- spilit +ALTER TABLE normal_test.tbl_partition SPLIT PARTITION p1 AT (7500) INTO (PARTITION p10, PARTITION p11); +select relname, reloptions from pg_partition where parentid in (Select relfilenode from pg_class where relname like 'tbl_partition') order by relname; + +create index on normal_test.tbl_partition(id) local WITH 
(compresstype=2,compress_chunk_size=1024); +\d+ normal_test.tbl_partition +select relname, reloptions from pg_partition where parentid in (Select relfilenode from pg_class where relname like 'tbl_partition_id_idx') order by relname; + + +-- unsupport +alter index normal_test.tbl_partition_id_idx set (compresstype=1); +alter index normal_test.tbl_partition_id_idx set (compress_chunk_size=2048); +alter index normal_test.tbl_partition_id_idx set (compress_prealloc_chunks=2); +-- support +alter table normal_test.tbl_pc set (compress_prealloc_chunks=2); + +-- new testcase +set search_path=normal_test; +\d+ +reset search_path; +CREATE TABLE normal_test.pre_handle(id int) WITH(compresstype=2, compress_chunk_size=512, compress_byte_convert=true, compress_diff_convert=true); +insert into normal_test.pre_handle select generate_series(1,1000); +checkpoint; +select count(*) from normal_test.pre_handle; +drop schema normal_test cascade; diff --git a/src/test/regress/sql/row_compression/pg_table_size.sql b/src/test/regress/sql/row_compression/pg_table_size.sql new file mode 100644 index 000000000..054e51905 --- /dev/null +++ b/src/test/regress/sql/row_compression/pg_table_size.sql @@ -0,0 +1,30 @@ +-- row table pg_table_size +create schema table_size_schema; +CREATE TABLE table_size_schema.normal_table(id int); +CREATE TABLE table_size_schema.compressed_table_1024(id int) WITH(compresstype=2, compress_chunk_size=1024); +CREATE TABLE table_size_schema.compressed_table_2048(id int) WITH(compresstype=2, compress_chunk_size=2048); +CREATE TABLE table_size_schema.compressed_table_4096(id int) WITH(compresstype=2, compress_chunk_size=4096); +select pg_table_size('table_size_schema.normal_table'); +select pg_table_size('table_size_schema.compressed_table_1024'); +select pg_table_size('table_size_schema.compressed_table_2048'); +select pg_table_size('table_size_schema.compressed_table_4096'); +drop schema table_size_schema cascade; + +-- partition table pg_table_size +create schema partition_table_size_schema; +create table partition_table_size_schema.normal_partition(INV_DATE_SK integer) +partition by range(inv_date_sk)(partition p0 values less than(5000),partition p1 values less than(10000)); +create table partition_table_size_schema.compressed_partition_1024(INV_DATE_SK integer) +WITH(compresstype=2, compress_chunk_size=1024) +partition by range(inv_date_sk)(partition p0 values less than(5000),partition p1 values less than(10000)); +create table partition_table_size_schema.compressed_partition_2048(INV_DATE_SK integer) +WITH(compresstype=2, compress_chunk_size=2048) +partition by range(inv_date_sk)(partition p0 values less than(5000),partition p1 values less than(10000)); +create table partition_table_size_schema.compressed_partition_4096(INV_DATE_SK integer) +WITH(compresstype=2, compress_chunk_size=4096) +partition by range(inv_date_sk)(partition p0 values less than(5000),partition p1 values less than(10000)); +select pg_table_size('partition_table_size_schema.normal_partition'); +select pg_table_size('partition_table_size_schema.compressed_partition_1024'); +select pg_table_size('partition_table_size_schema.compressed_partition_2048'); +select pg_table_size('partition_table_size_schema.compressed_partition_4096'); +drop schema partition_table_size_schema cascade; diff --git a/src/test/regress/sql/row_compression/pg_tablespace_size.sql b/src/test/regress/sql/row_compression/pg_tablespace_size.sql new file mode 100644 index 000000000..94b5e6cb0 --- /dev/null +++ 
b/src/test/regress/sql/row_compression/pg_tablespace_size.sql @@ -0,0 +1,14 @@ +CREATE TABLESPACE normal_tablespace RELATIVE LOCATION 'normal_tablespace'; +SELECT pg_tablespace_size('normal_tablespace'); +CREATE TABLE normal_table(id int) TABLESPACE normal_tablespace; +SELECT pg_tablespace_size('normal_tablespace'); + +CREATE TABLESPACE compress_tablespace RELATIVE LOCATION 'compress_tablespace'; +SELECT pg_tablespace_size('compress_tablespace'); +CREATE TABLE compressed_table_1024(id int) WITH(compresstype=2, compress_chunk_size=1024) TABLESPACE compress_tablespace; +SELECT pg_tablespace_size('compress_tablespace'); +DROP TABLE normal_table; +DROP TABLESPACE normal_tablespace; +DROP TABLE compressed_table_1024; +DROP TABLESPACE compress_tablespace; + diff --git a/src/test/regress/sql/row_compression/unsupported_feature.sql b/src/test/regress/sql/row_compression/unsupported_feature.sql new file mode 100644 index 000000000..e26faf2a4 --- /dev/null +++ b/src/test/regress/sql/row_compression/unsupported_feature.sql @@ -0,0 +1,41 @@ +create schema unspported_feature; +-- unsupported compresstype: 3 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=3, compress_chunk_size=1024); +-- unsupported compress_chunk_size: 2000 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=2, compress_chunk_size=2000); +-- unsupported compress_prealloc_chunks: -1 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=2, compress_prealloc_chunks=-1); +-- unsupported compress_prealloc_chunks: 8 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=2, compress_prealloc_chunks=8); +-- unsupported compress_level: 128 +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compresstype=2, compress_level=128); +-- compresstype cannot be used with column table +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(ORIENTATION = 'column', compresstype=2); +-- compresstype cannot be used with temp table +CREATE TEMP TABLE compressed_temp_table_1024(id int) WITH(compresstype=2); +-- compresstype cannot be used with unlogged table +CREATE unlogged TABLE compressed_unlogged_table_1024(id int) WITH(compresstype=2); +-- use compress_prealloc_chunks/compress_chunk_size/compress_level without compresstype +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compress_prealloc_chunks=5); +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compress_chunk_size=1024); +CREATE TABLE unspported_feature.compressed_table_1024(id int) WITH(compress_level=5); +-- unsupported exchange +CREATE TABLE unspported_feature.exchange_table(id int) WITH(compresstype=2); +CREATE TABLE unspported_feature.alter_table(id int) partition by range(id) +( + partition p0 values less than(5000), + partition p1 values less than(10000), + partition p2 values less than(20000), + partition p3 values less than(30000), + partition p4 values less than(40000), + partition p5 values less than(50000), + partition p6 values less than(60000), + partition p7 values less than(70000) +); +ALTER TABLE unspported_feature.alter_table EXCHANGE PARTITION FOR(2500) WITH TABLE unspported_feature.exchange_table; +-- unsupported alter compress_chunk_size +create TABLE unspported_feature.alter_table_option(id int) WITH(compresstype=2); +\d+ unspported_feature.alter_table_option +ALTER TABLE unspported_feature.alter_table_option SET(compresstype=0); -- fail +ALTER TABLE unspported_feature.alter_table_option SET(compress_chunk_size=2048); -- fail 
+ALTER TABLE unspported_feature.alter_table_option SET(compress_level=2, compress_prealloc_chunks=0); diff --git a/src/test/regress/sql/rule_test.sql b/src/test/regress/sql/rule_test.sql index 2e53ed42c..bc9ed3115 100644 --- a/src/test/regress/sql/rule_test.sql +++ b/src/test/regress/sql/rule_test.sql @@ -1,226 +1,226 @@ --- --- RULES TEST --- - --- --- Tables and rules for the view test --- -create table test1 (a int4, b int4); -create view tv1 as select * from test1; -create rule tv1_ins as on insert to tv1 do instead - insert into test1 values (new.a, new.b); -create rule tv1_upd as on update to tv1 do instead - update test1 set a = new.a, b = new.b - where a = old.a; -create rule tv1_del as on delete to tv1 do instead - delete from test1 where a = old.a; - --- insert values -insert into tv1 values (1, 11); -insert into tv1 values (2, 12); -select * from tv1; - --- update values -update tv1 set a = 10 where b = 11; -update tv1 set a = 12 , b = 22 where b = 12; -select * from tv1; - --- delete values -delete from tv1 where a = 10; -select * from tv1; - -drop rule if exists tv1_ins on tv1; -drop rule if exists tv1_upd on tv1; -drop rule if exists tv1_del on tv1; -drop view if exists tv1; -drop table if exists test1; - - --- --- Tables and rules for the constraint update/delete/insert test --- -create table ttsystem (sysname text, sysdesc text); -create table ttadmin (pname text, sysname text); -create table ttperon (pname text, pdesc text); -create table ttinterface (sysname text, ifname text); - -create rule usys_ins as on insert to ttsystem do also ( - insert into ttinterface values (new.sysname,''); - insert into ttadmin values ('',new.sysname); - ); - -create rule usys_del as on delete to ttsystem do also ( - delete from ttinterface where sysname = old.sysname; - delete from ttadmin where sysname = old.sysname; - ); - -create rule usys_upd as on update to ttsystem do also ( - update ttinterface set sysname = new.sysname - where sysname = old.sysname; - update ttadmin set sysname = new.sysname - where sysname = old.sysname - ); - -create rule upers_ins as on insert to ttperon do also ( - insert into ttadmin values (new.pname,''); - ); - -create rule upers_del as on delete to ttperon do also - delete from ttadmin where pname = old.pname; - -create rule upers_upd as on update to ttperon do also - update ttadmin set pname = new.pname where pname = old.pname; - --- test 1 -insert into ttsystem values ('winxi', 'Linux Jan Wieck'); -insert into ttsystem values ('notjw', 'Qu Yan'); -insert into ttsystem values ('yuyan', 'Fileserver'); - -insert into ttinterface values ('winxi', 'dola'); -insert into ttinterface values ('winxi', 'eth1'); -insert into ttinterface values ('notjw', 'dola'); -insert into ttinterface values ('yuyan', 'dola'); - -insert into ttperon values ('jw', 'Jan Wieck'); -insert into ttperon values ('bm', 'Bruce Momjian'); - -insert into ttadmin values ('jw', 'winxi'); -insert into ttadmin values ('jw', 'notjw'); -insert into ttadmin values ('bm', 'yuyan'); - -select * from ttsystem; -select * from ttinterface; -select * from ttperon; -select * from ttadmin; - --- test 2 -update ttsystem set sysname = 'pluto' where sysname = 'yuyan'; -select * from ttinterface; -select * from ttadmin; - -update ttperon set pname = 'jwieck' where pdesc = 'Jan Wieck'; -select * from ttadmin order by pname, sysname; - -delete from ttsystem where sysname = 'winxi'; -select * from ttinterface; -select * from ttadmin; - -delete from ttperon where pname = 'bm'; -select * from ttadmin; - -drop rule if 
exists usys_upd on ttsystem; -drop rule if exists usys_del on ttsystem; -drop rule if exists usys_ins on ttsystem; -drop rule if exists upers_upd on ttperon; -drop rule if exists upers_del on ttperon; -drop rule if exists upers_ins on ttperon; -drop table if exists ttsystem; -drop table if exists ttinterface; -drop table if exists ttperon; -drop table if exists ttadmin; - --- --- Tables and rules for the logging test --- -create table temp (ename char(20), salary money); -create table templog (ename char(20), action char(10), newsal money, oldsal money); - -create rule temp_ins as on insert to temp do - insert into templog values (new.ename, 'hired', new.salary, '0.00'); - -create rule temp_upd as on update to temp where new.salary != old.salary do - insert into templog values (new.ename, 'honored', new.salary, old.salary); - -create rule temp_del as on delete to temp do - insert into templog values (old.ename, 'fired', '0.00', old.salary); - -insert into temp values ('tyu', '45.00'); -insert into temp values ('asd', '90.00'); -select * from templog; - -update temp set salary = salary * 2 where ename = 'tyu'; -select * from templog; - -delete from temp where ename = 'tyu'; -select * from templog; - -select * from temp; - -drop rule if exists temp_ins on temp; -drop rule if exists temp_upd on temp; -drop rule if exists temp_del on temp; -drop table if exists temp; -drop table if exists templog; - --- --- Rules for condition --- rule test --- -create table test4 (a int4, b text); -create table test5 (a int4, b text); -create table test6 (a int4, b text); - -create rule test4_ins1 as on insert to test4 - where new.a >= 10 and new.a < 20 do instead - insert into test5 values (new.a, new.b); - -create rule test4_ins2 as on insert to test4 - where new.a >= 20 and new.a < 30 do - insert into test6 values (new.a, new.b); - - --- test -insert into test4 values (5, 'huijioa'); -insert into test4 values (15, 'afhuvbn'); -insert into test4 values (25, 'qwerty'); -insert into test4 values (35, 'zxcvbn'); - -select * from test4; -select * from test5; -select * from test6; - -drop rule if exists test4_ins1 on test4; -drop rule if exists test4_ins2 on test4; -drop table if exists test4; -drop table if exists test5; -drop table if exists test6; - --- --- Tables and rules for select --- -create table ttt1 (a int4, b text); -create table ttt2 (a int4, b text); - -create rule "_RETURN" as on select to ttt1 do instead ( - select * from ttt2; - ); - --- test -insert into ttt1 values (1, 'hello'); -insert into ttt2 values (10, 'world'); -select * from ttt1; - -drop table if exists ttt1; -drop table if exists ttt2; - --- --- Tables and rules for question --- - -create table test_statement(id int); -create table escapetest (ts varchar(50)); -create rule r1 as on insert to escapetest do ( - delete from test_statement; - insert into test_statement values (1); - insert into test_statement values (2); - ); - --- test -insert into escapetest(ts) values (NULL); -select * from test_statement; - -drop rule if exists r1 on escapetest; -drop table if exists test_statement; -drop table if exists escapetest; - - +-- +-- RULES TEST +-- + +-- +-- Tables and rules for the view test +-- +create table rule_test1_table (a int4, b int4); +create view tv1 as select * from rule_test1_table; +create rule tv1_ins as on insert to tv1 do instead + insert into rule_test1_table values (new.a, new.b); +create rule tv1_upd as on update to tv1 do instead + update rule_test1_table set a = new.a, b = new.b + where a = old.a; +create rule tv1_del 
as on delete to tv1 do instead + delete from rule_test1_table where a = old.a; + +-- insert values +insert into tv1 values (1, 11); +insert into tv1 values (2, 12); +select * from tv1; + +-- update values +update tv1 set a = 10 where b = 11; +update tv1 set a = 12 , b = 22 where b = 12; +select * from tv1; + +-- delete values +delete from tv1 where a = 10; +select * from tv1; + +drop rule if exists tv1_ins on tv1; +drop rule if exists tv1_upd on tv1; +drop rule if exists tv1_del on tv1; +drop view if exists tv1; +drop table if exists rule_test1_table; + + +-- +-- Tables and rules for the constraint update/delete/insert test +-- +create table ttsystem (sysname text, sysdesc text); +create table ttadmin (pname text, sysname text); +create table ttperon (pname text, pdesc text); +create table ttinterface (sysname text, ifname text); + +create rule usys_ins as on insert to ttsystem do also ( + insert into ttinterface values (new.sysname,''); + insert into ttadmin values ('',new.sysname); + ); + +create rule usys_del as on delete to ttsystem do also ( + delete from ttinterface where sysname = old.sysname; + delete from ttadmin where sysname = old.sysname; + ); + +create rule usys_upd as on update to ttsystem do also ( + update ttinterface set sysname = new.sysname + where sysname = old.sysname; + update ttadmin set sysname = new.sysname + where sysname = old.sysname + ); + +create rule upers_ins as on insert to ttperon do also ( + insert into ttadmin values (new.pname,''); + ); + +create rule upers_del as on delete to ttperon do also + delete from ttadmin where pname = old.pname; + +create rule upers_upd as on update to ttperon do also + update ttadmin set pname = new.pname where pname = old.pname; + +-- test 1 +insert into ttsystem values ('winxi', 'Linux Jan Wieck'); +insert into ttsystem values ('notjw', 'Qu Yan'); +insert into ttsystem values ('yuyan', 'Fileserver'); + +insert into ttinterface values ('winxi', 'dola'); +insert into ttinterface values ('winxi', 'eth1'); +insert into ttinterface values ('notjw', 'dola'); +insert into ttinterface values ('yuyan', 'dola'); + +insert into ttperon values ('jw', 'Jan Wieck'); +insert into ttperon values ('bm', 'Bruce Momjian'); + +insert into ttadmin values ('jw', 'winxi'); +insert into ttadmin values ('jw', 'notjw'); +insert into ttadmin values ('bm', 'yuyan'); + +select * from ttsystem; +select * from ttinterface; +select * from ttperon; +select * from ttadmin; + +-- test 2 +update ttsystem set sysname = 'pluto' where sysname = 'yuyan'; +select * from ttinterface; +select * from ttadmin; + +update ttperon set pname = 'jwieck' where pdesc = 'Jan Wieck'; +select * from ttadmin order by pname, sysname; + +delete from ttsystem where sysname = 'winxi'; +select * from ttinterface; +select * from ttadmin; + +delete from ttperon where pname = 'bm'; +select * from ttadmin; + +drop rule if exists usys_upd on ttsystem; +drop rule if exists usys_del on ttsystem; +drop rule if exists usys_ins on ttsystem; +drop rule if exists upers_upd on ttperon; +drop rule if exists upers_del on ttperon; +drop rule if exists upers_ins on ttperon; +drop table if exists ttsystem; +drop table if exists ttinterface; +drop table if exists ttperon; +drop table if exists ttadmin; + +-- +-- Tables and rules for the logging test +-- +create table temp (ename char(20), salary money); +create table templog (ename char(20), action char(10), newsal money, oldsal money); + +create rule temp_ins as on insert to temp do + insert into templog values (new.ename, 'hired', new.salary, 
'0.00'); + +create rule temp_upd as on update to temp where new.salary != old.salary do + insert into templog values (new.ename, 'honored', new.salary, old.salary); + +create rule temp_del as on delete to temp do + insert into templog values (old.ename, 'fired', '0.00', old.salary); + +insert into temp values ('tyu', '45.00'); +insert into temp values ('asd', '90.00'); +select * from templog; + +update temp set salary = salary * 2 where ename = 'tyu'; +select * from templog; + +delete from temp where ename = 'tyu'; +select * from templog; + +select * from temp; + +drop rule if exists temp_ins on temp; +drop rule if exists temp_upd on temp; +drop rule if exists temp_del on temp; +drop table if exists temp; +drop table if exists templog; + +-- +-- Rules for condition +-- rule test +-- +create table test4 (a int4, b text); +create table test5 (a int4, b text); +create table test6 (a int4, b text); + +create rule test4_ins1 as on insert to test4 + where new.a >= 10 and new.a < 20 do instead + insert into test5 values (new.a, new.b); + +create rule test4_ins2 as on insert to test4 + where new.a >= 20 and new.a < 30 do + insert into test6 values (new.a, new.b); + + +-- test +insert into test4 values (5, 'huijioa'); +insert into test4 values (15, 'afhuvbn'); +insert into test4 values (25, 'qwerty'); +insert into test4 values (35, 'zxcvbn'); + +select * from test4; +select * from test5; +select * from test6; + +drop rule if exists test4_ins1 on test4; +drop rule if exists test4_ins2 on test4; +drop table if exists test4; +drop table if exists test5; +drop table if exists test6; + +-- +-- Tables and rules for select +-- +create table ttt1 (a int4, b text); +create table ttt2 (a int4, b text); + +create rule "_RETURN" as on select to ttt1 do instead ( + select * from ttt2; + ); + +-- test +insert into ttt1 values (1, 'hello'); +insert into ttt2 values (10, 'world'); +select * from ttt1; + +drop table if exists ttt1; +drop table if exists ttt2; + +-- +-- Tables and rules for question +-- + +create table test_statement(id int); +create table escapetest (ts varchar(50)); +create rule r1 as on insert to escapetest do ( + delete from test_statement; + insert into test_statement values (1); + insert into test_statement values (2); + ); + +-- test +insert into escapetest(ts) values (NULL); +select * from test_statement; + +drop rule if exists r1 on escapetest; +drop table if exists test_statement; +drop table if exists escapetest; + +