row compression & hash index

This commit is contained in:
wuyuechuan 2022-03-06 21:41:58 +08:00
parent f9fc8c0d68
commit 15e3a99805
143 changed files with 13897 additions and 1612 deletions

View File

@ -2,7 +2,7 @@
# pagehack
AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR} TGT_pagehack_SRC)
set(TGT_pagehack_INC
${TGT_pq_INC} ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SRC_DIR}/lib/gstrace
${TGT_pq_INC} ${ZSTD_INCLUDE_PATH} ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SRC_DIR}/lib/gstrace
)
set(pagehack_DEF_OPTIONS ${MACRO_OPTIONS})
@ -11,12 +11,13 @@ if(${ENABLE_DEBUG} STREQUAL "ON")
endif()
set(pagehack_COMPILE_OPTIONS ${OS_OPTIONS} ${PROTECT_OPTIONS} ${WARNING_OPTIONS} ${CHECK_OPTIONS} ${BIN_SECURE_OPTIONS} ${OPTIMIZE_OPTIONS})
set(pagehack_LINK_OPTIONS ${BIN_LINK_OPTIONS})
set(pagehack_LINK_LIBS -lpgport -lcrypt -ldl -lm -ledit -lssl -lcrypto -l${SECURE_C_CHECK} -lrt -lz -lminiunz)
set(pagehack_LINK_LIBS -lpgport -lcrypt -ldl -lm -ledit -lssl -lcrypto -lsecurec -lrt -lz -lminiunz -lzstd)
add_bintarget(pagehack TGT_pagehack_SRC TGT_pagehack_INC "${pagehack_DEF_OPTIONS}" "${pagehack_COMPILE_OPTIONS}" "${pagehack_LINK_OPTIONS}" "${pagehack_LINK_LIBS}")
add_dependencies(pagehack pgport_static)
target_link_directories(pagehack PUBLIC
${LIBOPENSSL_LIB_PATH} ${PROTOBUF_LIB_PATH} ${LIBPARQUET_LIB_PATH} ${LIBCURL_LIB_PATH} ${SECURE_LIB_PATH}
${ZLIB_LIB_PATH} ${LIBOBS_LIB_PATH} ${LIBEDIT_LIB_PATH} ${LIBCGROUP_LIB_PATH} ${CMAKE_BINARY_DIR}/lib
${ZSTD_LIB_PATH}
)
install(TARGETS pagehack RUNTIME DESTINATION bin)

View File

@ -1,6 +1,6 @@
# contrib/pagehack/Makefile
MODULE_big = pagehack
OBJS = pagehack.o
OBJS = openGaussCompression.o pagehack.o
# executable program, even there is no database server/client
PROGRAM = pagehack
@ -13,7 +13,7 @@ else
subdir = contrib/pagehack
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
enable_shared = false
override CFLAGS += -lzstd
ifeq ($(enable_debug), yes)
PG_CPPFLAGS += -DDEBUG

File diff suppressed because it is too large

View File

@ -0,0 +1,177 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved.
*/
#include "openGaussCompression.h"
#include "storage/checksum_impl.h"
#include "storage/page_compression_impl.h"
void OpenGaussCompression::SetFilePath(const char *filePath, int segNo)
{
int rc = snprintf_s(pcaFilePath, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, filePath);
securec_check_ss_c(rc, "\0", "\0");
rc = snprintf_s(pcdFilePath, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, filePath);
securec_check_ss_c(rc, "\0", "\0");
this->segmentNo = segNo;
}
OpenGaussCompression::~OpenGaussCompression()
{
if (pcaFd != nullptr) {
fclose(pcaFd);
}
if (pcdFd != nullptr) {
fclose(pcdFd);
}
if (header != nullptr) {
pc_munmap(header);
}
}
bool OpenGaussCompression::TryOpen()
{
if ((pcaFd = fopen(this->pcaFilePath, "rb+")) == nullptr) {
return false;
}
if ((pcdFd = fopen(this->pcdFilePath, "rb+")) == nullptr) {
return false;
}
if (fseeko(pcaFd, (off_t)offsetof(PageCompressHeader, chunk_size), SEEK_SET) != 0) {
return false;
}
if (fread(&chunkSize, sizeof(chunkSize), 1, this->pcaFd) <= 0) {
return false;
}
header = pc_mmap(fileno(pcaFd), chunkSize, false);
return true;
}
bool OpenGaussCompression::ReadChunkOfBlock(char *dst, size_t *dstLen, BlockNumber blockNumber)
{
auto currentAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber);
do {
auto chunkNum = currentAddr->nchunks;
for (uint8 i = 0; i < chunkNum; i++) {
off_t seekPos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, currentAddr->chunknos[i]);
uint8 start = i;
while (i < chunkNum - 1 && currentAddr->chunknos[i + 1] == currentAddr->chunknos[i] + 1) {
i++;
}
if (fseeko(this->pcdFd, seekPos, SEEK_SET) != 0) {
return false;
}
size_t readAmount = chunkSize * (i - start + 1);
if (fread(dst + start * chunkSize, 1, readAmount, this->pcdFd) != readAmount && ferror(this->pcdFd)) {
return false;
}
*dstLen += readAmount;
}
if (chunkNum == 0 || DecompressPage(dst, decompressedBuffer, header->algorithm) == BLCKSZ) {
break;
}
} while (true);
if (PageIs8BXidHeapVersion(dst)) {
byteConvert = ((HeapPageCompressData *)dst)->byte_convert;
diffConvert = ((HeapPageCompressData *)dst)->diff_convert;
} else {
byteConvert = ((PageCompressData *)dst)->byte_convert;
diffConvert = ((PageCompressData *)dst)->diff_convert;
}
this->blockNumber = blockNumber;
return true;
}
bool OpenGaussCompression::WriteBackCompressedData(char *source, size_t sourceLen, BlockNumber blockNumber)
{
auto currentAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber);
for (size_t i = 0; i < currentAddr->nchunks; ++i) {
off_t seekPos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, currentAddr->chunknos[i]);
if (fseeko(this->pcdFd, seekPos, SEEK_SET) != 0) {
return false;
}
Assert(sourceLen >= i * chunkSize);
auto writeCount = fwrite(source + i * chunkSize, 1, chunkSize, this->pcdFd);
bool success = chunkSize == writeCount;
if (!success) {
return false;
}
}
fflush(this->pcdFd);
return true;
}
void OpenGaussCompression::MarkUncompressedDirty()
{
constexpr int writeLen = BLCKSZ / 2;
unsigned char fill_byte[writeLen] = {0xFF};
for (int i = 0; i < writeLen; i++)
fill_byte[i] = 0xFF;
auto rc = memcpy_s(decompressedBuffer + writeLen, BLCKSZ - writeLen, fill_byte, writeLen);
securec_check(rc, "", "");
}
BlockNumber OpenGaussCompression::GetMaxBlockNumber()
{
return (BlockNumber)pg_atomic_read_u32(&header->nblocks);
}
char *OpenGaussCompression::GetPcdFilePath()
{
return this->pcdFilePath;
}
char *OpenGaussCompression::GetDecompressedPage()
{
return this->decompressedBuffer;
}
bool OpenGaussCompression::WriteBackUncompressedData()
{
auto algorithm = header->algorithm;
auto workBufferSize = CompressPageBufferBound(decompressedBuffer, algorithm);
if (workBufferSize < 0) {
return false;
}
char *work_buffer = (char *)malloc(workBufferSize);
if (work_buffer == nullptr) {
return false;
}
RelFileCompressOption relFileCompressOption;
relFileCompressOption.compressPreallocChunks = 0;
relFileCompressOption.compressLevelSymbol = true;
relFileCompressOption.compressLevel = 1;
relFileCompressOption.compressAlgorithm = algorithm;
relFileCompressOption.byteConvert = byteConvert;
relFileCompressOption.diffConvert = diffConvert;
auto compress_buffer_size = CompressPage(decompressedBuffer, work_buffer, workBufferSize, relFileCompressOption);
if (compress_buffer_size < 0) {
free(work_buffer);
return false;
}
uint8 nchunks = (compress_buffer_size - 1) / chunkSize + 1;
auto bufferSize = chunkSize * nchunks;
if (bufferSize >= BLCKSZ) {
/* store the original page if compression cannot save space */
free(work_buffer);
work_buffer = (char *)decompressedBuffer;
nchunks = BLCKSZ / chunkSize;
} else {
/* zero-fill the tail of the last chunk */
if (compress_buffer_size < bufferSize) {
auto leftSize = bufferSize - compress_buffer_size;
errno_t rc = memset_s(work_buffer + compress_buffer_size, leftSize, 0, leftSize);
securec_check(rc, "", "");
}
}
uint8 need_chunks = nchunks;
PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber);
if (pcAddr->allocated_chunks < need_chunks) {
auto chunkno = pg_atomic_fetch_add_u32(&header->allocated_chunks, need_chunks - pcAddr->allocated_chunks);
for (uint8 i = pcAddr->allocated_chunks; i < need_chunks; ++i) {
pcAddr->chunknos[i] = ++chunkno;
}
pcAddr->allocated_chunks = need_chunks;
pcAddr->nchunks = need_chunks;
}
return this->WriteBackCompressedData(work_buffer, compress_buffer_size, blockNumber);
}
#include "compression_algorithm.ini"

View File

@ -0,0 +1,40 @@
#ifndef OPENGAUSS_SERVER_OPENGAUSSCOMPRESSION_H
#define OPENGAUSS_SERVER_OPENGAUSSCOMPRESSION_H
#define FRONTEND 1
#include <stdio.h>
#include "c.h"
#include "storage/buf/block.h"
#include "storage/page_compression.h"
class OpenGaussCompression {
private:
FILE* pcaFd = nullptr;
FILE* pcdFd = nullptr;
char pcaFilePath[MAXPGPATH];
char pcdFilePath[MAXPGPATH];
PageCompressHeader* header = nullptr;
private:
int segmentNo;
BlockNumber blockNumber;
decltype(PageCompressHeader::chunk_size) chunkSize;
char decompressedBuffer[BLCKSZ];
bool byteConvert;
bool diffConvert;
public:
void SetFilePath(const char* filePath, int segNo);
virtual ~OpenGaussCompression();
bool TryOpen();
bool ReadChunkOfBlock(char* dst, size_t* dstLen, BlockNumber blockNumber);
bool WriteBackCompressedData(char* source, size_t sourceLen, BlockNumber blockNumber);
bool WriteBackUncompressedData();
void MarkUncompressedDirty();
BlockNumber GetMaxBlockNumber();
char* GetPcdFilePath();
char* GetDecompressedPage();
};
#endif // OPENGAUSS_SERVER_OPENGAUSSCOMPRESSION_H
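
A hedged usage sketch of the class above, roughly how parse_page_file drives it in pagehack. It assumes the pagehack build environment (BLCKSZ, BlockNumber and this header on the include path); DumpCompressedRelation is a hypothetical helper and error handling is trimmed.

#include <cstdio>
#include "openGaussCompression.h"

// Hypothetical helper: walk a compressed relation segment and decompress each block.
static bool DumpCompressedRelation(const char *relPath, int segNo)
{
    OpenGaussCompression compressor;
    compressor.SetFilePath(relPath, segNo);  // derives the "_pca" / "_pcd" paths
    if (!compressor.TryOpen()) {
        return false;  // no pca/pcd pair, or open failed
    }
    char compressed[BLCKSZ];
    size_t compressedLen = 0;
    for (BlockNumber blk = 0; blk < compressor.GetMaxBlockNumber(); blk++) {
        compressedLen = 0;
        if (!compressor.ReadChunkOfBlock(compressed, &compressedLen, blk)) {
            return false;
        }
        // GetDecompressedPage() now points at the reconstructed 8K page.
        printf("block %u: %zu compressed bytes read from %s\n",
               blk, compressedLen, compressor.GetPcdFilePath());
    }
    return true;  // destructor closes the pca/pcd handles and unmaps the header
}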

View File

@ -91,6 +91,9 @@
#include "tsdb/utils/constant_def.h"
#endif
#include "openGaussCompression.h"
/* Max number of pg_class oid, currently about 4000 */
#define MAX_PG_CLASS_ID 10000
/* Number of pg_class types */
@ -139,6 +142,7 @@ static const char* PgHeapRelName[] = {"pg_class",
"pg_am",
"pg_statistic",
"pg_toast"};
typedef enum SegmentType { SEG_HEAP, SEG_FSM, SEG_UHEAP, SEG_INDEX_BTREE, SEG_UNDO, SEG_UNKNOWN } SegmentType;
static void ParsePgClassTupleData(binary tupdata, int len, binary nullBitmap, int natrrs);
static void ParsePgIndexTupleData(binary tupdata, int len, binary nullBitmap, int nattrs);
@ -156,6 +160,8 @@ static void ParseToastTupleData(binary tupdata, int len, binary nullBitmap, int
static void ParseTDSlot(const char *page);
static void ParseToastIndexTupleData(binary tupdata, int len, binary nullBitmap, int nattrs);
static int parse_uncompressed_page_file(const char *filename, SegmentType type, const uint32 start_point,
const uint32 number_read);
static ParseHeapTupleData PgHeapRelTupleParser[] = {
ParsePgClassTupleData, // pg_class
@ -899,8 +905,6 @@ static const char* HACKINGTYPE[] = {"heap",
"segment"
};
typedef enum SegmentType { SEG_HEAP, SEG_FSM, SEG_UHEAP, SEG_INDEX_BTREE, SEG_UNDO, SEG_UNKNOWN } SegmentType;
const char* PageTypeNames[] = {"DATA", "FSM", "VM"};
#define GETHEAPSTRUCT(TUP) ((unsigned char*)(TUP) + (TUP)->t_hoff)
@ -3145,7 +3149,78 @@ static int parse_a_page(const char* buffer, int blkno, int blknum, SegmentType t
return true;
}
static BlockNumber CalculateMaxBlockNumber(BlockNumber blknum, BlockNumber start, BlockNumber number)
{
/* validate the requested block range */
if (start >= blknum) {
fprintf(stderr, "start point exceeds the total block number of relation.\n");
return InvalidBlockNumber;
} else if ((start + number) > blknum) {
fprintf(stderr, "don't have %d blocks from block %d in the relation, only %d blocks\n", number, start,
(blknum - start));
number = blknum;
} else if (number == 0) {
number = blknum;
} else {
number += start;
}
return number;
}
static int parse_page_file(const char* filename, SegmentType type, const uint32 start_point, const uint32 number_read)
{
if (type != SEG_HEAP && type != SEG_INDEX_BTREE) {
return parse_uncompressed_page_file(filename, type, start_point, number_read);
}
auto openGaussCompression = new OpenGaussCompression();
openGaussCompression->SetFilePath(filename, SegNo);
bool success = openGaussCompression->TryOpen();
if (!success) {
delete openGaussCompression;
return parse_uncompressed_page_file(filename, type, start_point, number_read);
}
BlockNumber start = start_point;
BlockNumber blknum = openGaussCompression->GetMaxBlockNumber();
BlockNumber number = CalculateMaxBlockNumber(blknum, start, number_read);
if (number == InvalidBlockNumber) {
delete openGaussCompression;
return false;
}
char compressed[BLCKSZ];
size_t compressedLen;
while (start < number) {
if (!openGaussCompression->ReadChunkOfBlock(compressed, &compressedLen, start)) {
fprintf(stderr, "read block %d failed, filename: %s: %s\n", start, openGaussCompression->GetPcdFilePath(),
strerror(errno));
delete openGaussCompression;
return false;
}
if (!parse_a_page(openGaussCompression->GetDecompressedPage(), start, blknum, type)) {
fprintf(stderr, "Error during parsing block %d/%d\n", start, blknum);
delete openGaussCompression;
return false;
}
if ((write_back && num_item) || dirty_page) {
if (dirty_page) {
openGaussCompression->MarkUncompressedDirty();
}
if (!openGaussCompression->WriteBackUncompressedData()) {
fprintf(stderr, "write back failed, filename: %s: %s\n", openGaussCompression->GetPcdFilePath(),
strerror(errno));
delete openGaussCompression;
return false;
}
}
start++;
}
delete openGaussCompression;
return true;
}
static int parse_uncompressed_page_file(const char *filename, SegmentType type, const uint32 start_point,
const uint32 number_read)
{
char buffer[BLCKSZ];
FILE* fd = NULL;
@ -3173,9 +3248,8 @@ static int parse_page_file(const char* filename, SegmentType type, const uint32
blknum = size / BLCKSZ;
/* parse */
if (start >= blknum) {
fprintf(stderr, "start point exceeds the total block number of relation.\n");
fclose(fd);
number = CalculateMaxBlockNumber(blknum, start, number);
if (number == InvalidBlockNumber) {
return false;
} else if ((start + number) > blknum) {
fprintf(stderr,

View File

@ -13,6 +13,7 @@
#include "access/gin.h"
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/nbtree.h"

View File

@ -366,7 +366,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b
Page page;
OffsetNumber maxoff;
_hash_getlock(rel, blkno, HASH_SHARE);
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy);
page = BufferGetPage(buf);
@ -393,7 +392,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b
}
_hash_relbuf(rel, buf);
_hash_droplock(rel, blkno, HASH_SHARE);
}
/*

View File

@ -761,7 +761,7 @@ else # not PGXS
endif
endif
override CPPFLAGS := $(CPPFLAGS) -I$(LIBODBC_INCLUDE_PATH) -I$(LIBOBS_INCLUDE_PATH) -I$(LIBCGROUP_INCLUDE_PATH) -I$(LIBOPENSSL_INCLUDE_PATH) -I${LIBORC_INCLUDE_PATH} -I${LIBPARQUET_INCLUDE_PATH} -I${PROTOBUF_INCLUDE_PATH} -I${BOOST_INCLUDE_PATH} -I$(LIBLLVM_INCLUDE_PATH) -I$(KERBEROS_INCLUDE_PATH) -I$(CJSON_INCLUDE_PATH) -I$(NUMA_INCLUDE_PATH) -I$(ZLIB_INCLUDE_PATH) -I$(LZ4_INCLUDE_PATH) -I$(LIBCURL_INCLUDE_PATH) -I$(DCF_INCLUDE_PATH)
override CPPFLAGS := $(CPPFLAGS) -I$(LIBODBC_INCLUDE_PATH) -I$(LIBOBS_INCLUDE_PATH) -I$(LIBCGROUP_INCLUDE_PATH) -I$(LIBOPENSSL_INCLUDE_PATH) -I${LIBORC_INCLUDE_PATH} -I${LIBPARQUET_INCLUDE_PATH} -I${PROTOBUF_INCLUDE_PATH} -I${BOOST_INCLUDE_PATH} -I$(LIBLLVM_INCLUDE_PATH) -I$(KERBEROS_INCLUDE_PATH) -I$(CJSON_INCLUDE_PATH) -I$(NUMA_INCLUDE_PATH) -I$(ZLIB_INCLUDE_PATH) -I$(LZ4_INCLUDE_PATH) -I$(LIBCURL_INCLUDE_PATH) -I$(DCF_INCLUDE_PATH) -I$(ZSTD_INCLUDE_PATH)
# GDS links to libevent
ifeq ($(enable_multiple_nodes), yes)
@ -895,6 +895,9 @@ endif
# append zlib for compression: zlib
LDFLAGS += -L$(ZLIB_LIB_PATH) -I$(ZLIB_INCLUDE_PATH)
# append zstd for compression: zstd
LDFLAGS += -L$(ZSTD_LIB_PATH) -I$(ZSTD_INCLUDE_PATH)
LDFLAGS += -L$(SECURE_LIB_PATH)
LDFLAGS += -L$(LIBOPENSSL_LIB_PATH)
LDFLAGS += -L$(LIBSTD_LIB_PATH)

View File

@ -5,6 +5,7 @@ set(TGT_rewind_SRC ${CMAKE_CURRENT_SOURCE_DIR}/datapagemap.cpp
${CMAKE_CURRENT_SOURCE_DIR}/filemap.cpp
${CMAKE_CURRENT_SOURCE_DIR}/file_ops.cpp
${CMAKE_CURRENT_SOURCE_DIR}/logging.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compressed_rewind.cpp
${CMAKE_CURRENT_SOURCE_DIR}/parsexlog.cpp
${CMAKE_CURRENT_SOURCE_DIR}/pg_rewind.cpp
)
@ -15,7 +16,8 @@ set(TGT_rewind_INC
${PROJECT_SRC_DIR}/common/port
${PROJECT_SRC_DIR}/common/interfaces/libpq
${PROJECT_SRC_DIR}/include/libpq
${LIBOPENSSL_INCLUDE_PATH}
${LIBOPENSSL_INCLUDE_PATH}
${ZSTD_INCLUDE_PATH}
)
set(rewind_DEF_OPTIONS ${MACRO_OPTIONS})

View File

@ -26,7 +26,7 @@ ifneq "$(MAKECMDGOALS)" "clean"
endif
endif
endif
OBJS = file_ops.o datapagemap.o fetch.o filemap.o logging.o parsexlog.o pg_rewind.o
OBJS = file_ops.o datapagemap.o fetch.o filemap.o logging.o parsexlog.o pg_rewind.o compressed_rewind.o
#all:gs_rewind.a

View File

@ -0,0 +1,46 @@
/* -------------------------------------------------------------------------
*
* compressed_common.h
*
* Copyright (c) 2021 Huawei Technologies Co.,Ltd.
*
* -------------------------------------------------------------------------
*/
#ifndef OPENGAUSS_SERVER_COMPRESS_COMPRESSED_COMMON_H
#define OPENGAUSS_SERVER_COMPRESS_COMPRESSED_COMMON_H
#include "utils/atomic.h"
struct RewindCompressInfo {
bool compressed = false; /* compressed table or not */
uint32 oldBlockNumber = 0;
uint32 newBlockNumber = 0;
uint8 algorithm = 0; /* compression algorithm */
uint16 chunkSize = 0; /* compressed chunk size */
};
struct CompressedPcaInfo {
char *pcaMap = NULL;
int pcaFd = -1;
char path[MAXPGPATH];
int32 chunkSize = 0;
int32 algorithm = 0;
};
#define COPY_REWIND_COMPRESS_INFO(entry, infoPointer, oldBlock, newBlock) \
(entry)->rewindCompressInfo.oldBlockNumber = 0; \
(entry)->rewindCompressInfo.newBlockNumber = 0; \
(entry)->rewindCompressInfo.compressed = false; \
(entry)->rewindCompressInfo.algorithm = 0; \
(entry)->rewindCompressInfo.chunkSize = 0; \
if ((infoPointer) != NULL && (infoPointer)->compressed) { \
(entry)->rewindCompressInfo.oldBlockNumber = (oldBlock); \
(entry)->rewindCompressInfo.newBlockNumber = (newBlock); \
(entry)->rewindCompressInfo.compressed = true; \
(entry)->rewindCompressInfo.algorithm = (infoPointer)->algorithm; \
(entry)->rewindCompressInfo.chunkSize = (infoPointer)->chunkSize; \
}
#endif // OPENGAUSS_SERVER_COMPRESS_COMPRESSED_COMMON_H

View File

@ -0,0 +1,129 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved.
*
* openGauss is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* -------------------------------------------------------------------------
*
* compressed_rewind.cpp
* Functions for fetching compressed table.
*
*
* IDENTIFICATION
* ./src/bin/pg_rewind/compressed_rewind.cpp
*
* -------------------------------------------------------------------------
*/
#include "compressed_rewind.h"
#include "libpq/libpq-fe.h"
#include "lib/string.h"
#include "logging.h"
#include "filemap.h"
#include "utils/elog.h"
#include "file_ops.h"
void FormatPathToPca(const char* path, char* dst, size_t len, bool withPrefix)
{
errno_t rc;
if (withPrefix) {
rc = snprintf_s(dst, len, len - 1, "%s/" PCA_SUFFIX, pg_data, path);
} else {
rc = snprintf_s(dst, len, len - 1, PCA_SUFFIX, path);
}
securec_check_ss_c(rc, "\0", "\0");
}
void FormatPathToPcd(const char* path, char* dst, size_t len, bool withPrefix)
{
errno_t rc;
if (withPrefix) {
rc = snprintf_s(dst, len, len - 1, "%s/" PCD_SUFFIX, pg_data, path);
} else {
rc = snprintf_s(dst, len, len - 1, PCD_SUFFIX, path);
}
securec_check_ss_c(rc, "\0", "\0");
}
template <typename T>
bool ReadCompressedInfo(T& t, off_t offset, FILE* file, char* pcaFilePath, size_t len)
{
if (fseeko(file, offset, SEEK_SET) != 0) {
pg_fatal("could not seek in file \"%s\": \"%lu\": %s\n", pcaFilePath, len, strerror(errno));
return false;
}
if (fread(&t, sizeof(t), 1, file) <= 0) {
pg_fatal("could not open file \"%s\": \"%lu\": %s\n", pcaFilePath, len, strerror(errno));
return false;
}
return true;
}
/**
* read the rewind compression metadata from the pca file header
* @param file file fp
* @param pcaFilePath file path, used for error reporting
* @param len size of the pcaFilePath buffer, used only in error messages
* @param rewindCompressInfo out parameter that receives the result
* @return success or not
*/
static bool ReadRewindCompressedInfo(FILE* file, char* pcaFilePath, size_t len, RewindCompressInfo* rewindCompressInfo)
{
off_t offset = (off_t)offsetof(PageCompressHeader, chunk_size);
if (!ReadCompressedInfo(rewindCompressInfo->chunkSize, offset, file, pcaFilePath, len)) {
return false;
}
offset = (off_t)offsetof(PageCompressHeader, algorithm);
if (!ReadCompressedInfo(rewindCompressInfo->algorithm, offset, file, pcaFilePath, len)) {
return false;
}
offset = (off_t)offsetof(PageCompressHeader, nblocks);
if (!ReadCompressedInfo(rewindCompressInfo->oldBlockNumber, offset, file, pcaFilePath, len)) {
return false;
}
rewindCompressInfo->compressed = true;
return true;
}
bool FetchSourcePca(const char* strValue, RewindCompressInfo* rewindCompressInfo)
{
size_t length = 0;
PageCompressHeader* ptr = (PageCompressHeader*)PQunescapeBytea((const unsigned char*)strValue, &length);
rewindCompressInfo->compressed = false;
if (length == sizeof(PageCompressHeader)) {
rewindCompressInfo->compressed = true;
rewindCompressInfo->algorithm = ptr->algorithm;
rewindCompressInfo->newBlockNumber = ptr->nblocks;
rewindCompressInfo->oldBlockNumber = 0;
rewindCompressInfo->chunkSize = ptr->chunk_size;
}
PQfreemem(ptr);
return rewindCompressInfo->compressed;
}
bool ProcessLocalPca(const char* tablePath, RewindCompressInfo* rewindCompressInfo)
{
rewindCompressInfo->compressed = false;
if (!isRelDataFile(tablePath)) {
return false;
}
char pcaFilePath[MAXPGPATH];
FormatPathToPca(tablePath, pcaFilePath, MAXPGPATH, true);
FILE* file = fopen(pcaFilePath, "rb");
if (file == NULL) {
if (errno == ENOENT) {
return false;
}
pg_fatal("could not open file \"%s\": %s\n", pcaFilePath, strerror(errno));
return false;
}
bool success = ReadRewindCompressedInfo(file, pcaFilePath, MAXPGPATH, rewindCompressInfo);
fclose(file);
return success;
}

View File

@ -0,0 +1,21 @@
/* -------------------------------------------------------------------------
*
* compressed_rewind.h
*
* Copyright (c) 2021 Huawei Technologies Co.,Ltd.
*
* -------------------------------------------------------------------------
*/
#ifndef OPENGAUSS_SERVER_COMPRESS_COMPRESSED_REWIND_H
#define OPENGAUSS_SERVER_COMPRESS_COMPRESSED_REWIND_H
#include "compressed_common.h"
#include "storage/page_compression.h"
#include "storage/smgr/relfilenode.h"
extern bool FetchSourcePca(const char* strValue, RewindCompressInfo* rewindCompressInfo);
extern bool ProcessLocalPca(const char* tablePath, RewindCompressInfo* rewindCompressInfo);
extern void FormatPathToPca(const char* path, char* dst, size_t len, bool withPrefix = false);
extern void FormatPathToPcd(const char* path, char* dst, size_t len, bool withPrefix = false);
#endif // OPENGAUSS_SERVER_COMPRESS_COMPRESSED_REWIND_H
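
A short sketch of how a caller might consult these helpers. It assumes the pg_rewind environment that ProcessLocalPca depends on (pg_data, isRelDataFile); ReportCompression itself is hypothetical.

#include <cstdio>
#include "compressed_rewind.h"

// Hypothetical helper: report whether a relation file has a companion _pca map.
static void ReportCompression(const char *tablePath)
{
    RewindCompressInfo info;
    if (ProcessLocalPca(tablePath, &info)) {
        // Filled from the pca header: algorithm, chunk size and block count.
        printf("%s: compressed, algorithm=%d chunkSize=%d blocks=%u\n",
               tablePath, info.algorithm, info.chunkSize, info.oldBlockNumber);
    } else {
        printf("%s: not a compressed relation\n", tablePath);
    }
}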

View File

@ -23,6 +23,7 @@
#include "libpq/libpq-fe.h"
#include "libpq/libpq-int.h"
#include "common/fe_memutils.h"
#include "compressed_rewind.h"
#include "catalog/catalog.h"
#include "catalog/pg_type.h"
@ -47,11 +48,11 @@ const uint64 MAX_FILE_SIZE = 0xFFFFFFFF;
#define MAX_PARAM_LEN 1024
static BuildErrorCode receiveFileChunks(const char* sql, FILE* file);
static BuildErrorCode execute_pagemap(datapagemap_t* pagemap, const char* path, FILE* file);
static BuildErrorCode execute_pagemap(file_entry_t* entry, FILE* file);
static char* run_simple_query(const char* sql);
static BuildErrorCode recurse_dir(const char* datadir, const char* path, process_file_callback_t callback);
static void get_slot_name_by_app_name(void);
static BuildErrorCode CheckResultSet(PGresult* pgResult);
BuildErrorCode libpqConnect(const char* connstr)
{
PGresult* res = NULL;
@ -254,10 +255,22 @@ BuildErrorCode fetchSourceFileList()
* general, so if the admin has put any custom symbolic links in the data
* directory, they won't be copied correctly.
*/
sql = "SELECT path, size, isdir, pg_tablespace_location(pg_tablespace.oid) AS link_target \n"
/* skip pca/pcd files themselves and attach each pca header to its table file row */
sql = "WITH tmp_table AS (\n"
"SELECT path, size, isdir, pg_tablespace_location(pg_tablespace.oid) AS link_target \n"
"FROM (SELECT * FROM pg_stat_file_recursive('.')) AS files \n"
"LEFT OUTER JOIN pg_tablespace ON files.path like 'pg_tblspc/%' AND oid::text = files.filename\n";
res = PQexec(conn, sql);
"LEFT OUTER JOIN pg_tablespace ON files.path ~ '^pg_tblspc/' AND oid :: text = files.filename\n"
"),compressed_address AS (SELECT path pca_path, substr(path, 0, length(path) - 4) AS table_path\n"
"FROM pg_stat_file_recursive('.') WHERE path ~ '_pca$' AND length(path) > 4)\n"
"SELECT path, size, isdir, link_target,\n"
"CASE WHEN pca_path IS NOT NULL THEN pg_read_binary_file(pca_path, 0, %d, true)\n"
"ELSE NULL END AS pchdr\n"
"FROM tmp_table LEFT JOIN compressed_address\n"
"ON tmp_table.path = compressed_address.table_path\nWHERE path !~ '_pca$' AND path !~ '_pcd$'\n";
char sqlbuf[1024];
int rc = snprintf_s(sqlbuf, sizeof(sqlbuf), sizeof(sqlbuf) - 1, sql, SIZE_OF_PAGE_COMPRESS_HEADER_DATA);
securec_check_ss_c(rc, "\0", "\0");
res = PQexec(conn, (const char*)sqlbuf);
if (PQresultStatus(res) != PGRES_TUPLES_OK) {
pg_log(PG_ERROR, "could not fetch file list: %s", PQresultErrorMessage(res));
@ -265,7 +278,7 @@ BuildErrorCode fetchSourceFileList()
}
/* sanity check the result set */
if (PQnfields(res) != 4) {
if (PQnfields(res) != 5) {
pg_fatal("unexpected result set while fetching file list\n");
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
}
@ -308,7 +321,13 @@ BuildErrorCode fetchSourceFileList()
}
}
}
process_source_file(path, type, filesize, link_target);
RewindCompressInfo rewindCompressInfo;
RewindCompressInfo *pointer = NULL;
if (!PQgetisnull(res, i, 4) && FetchSourcePca(PQgetvalue(res, i, 4), &rewindCompressInfo)) {
filesize = rewindCompressInfo.newBlockNumber * BLCKSZ;
pointer = &rewindCompressInfo;
}
process_source_file(path, type, filesize, link_target, pointer);
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
}
PQclear(res);
@ -364,7 +383,7 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file)
}
/* sanity check the result set */
if (PQnfields(res) != 4 || PQntuples(res) != 1) {
if (PQnfields(res) != 7 || PQntuples(res) != 1) {
pg_fatal("unexpected result set size while fetching remote files\n");
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
}
@ -393,6 +412,8 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file)
pg_fatal("unexpected result length while fetching remote files\n");
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
}
/* check compressed result set */
CheckResultSet(res);
/* Read result set to local variables */
errorno = memcpy_s(&chunkoff, sizeof(int32), PQgetvalue(res, 0, 1), sizeof(int32));
@ -429,17 +450,63 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file)
continue;
}
pg_log(PG_DEBUG, "received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize);
fprintf(file, "received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize);
int32 algorithm;
errorno = memcpy_s(&algorithm, sizeof(int32), PQgetvalue(res, 0, 4), sizeof(int32));
securec_check_c(errorno, "\0", "\0");
algorithm = ntohl(algorithm);
if (algorithm == 0) {
pg_log(PG_DEBUG, "received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize);
fprintf(file, "received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize);
open_target_file(filename, false);
pg_free(filename);
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
write_target_range(chunk, chunkoff, chunksize, chunkspace);
} else {
int32 chunkSize;
int errorno = memcpy_s(&chunkSize, sizeof(int32), PQgetvalue(res, 0, 5), sizeof(int32));
securec_check_c(errorno, "\0", "\0");
chunkSize = ntohl(chunkSize);
bool rebuild = *PQgetvalue(res, 0, 6) != 0;
char dst[MAXPGPATH];
/* open pca */
FormatPathToPca(filename, dst, MAXPGPATH, false);
OpenCompressedPcaFile(dst, chunkSize, algorithm, rebuild);
open_target_file(filename, false);
/* open pcd: format the path while filename is still valid */
FormatPathToPcd(filename, dst, MAXPGPATH, false);
pg_free(filename);
filename = NULL;
open_target_file(dst, false);
BlockNumber blockNumber = chunkoff;
size_t blockSize = chunkspace;
/* fetch result */
FetchCompressedFile(chunk, blockNumber, blockSize);
}
}
return BUILD_SUCCESS;
}
/**
* sanity check the compressed-table columns of the result set
* @param res result set returned by the fetch query
* @return success or not
*/
static BuildErrorCode CheckResultSet(PGresult* res)
{
#define PQ_TYPE(index, type) (PQftype(res, (index)) != (type))
if (PQ_TYPE(4, INT4OID) || PQ_TYPE(5, INT4OID) || PQ_TYPE(6, BOOLOID)) {
pg_fatal(
"FetchCompressedFile:unexpected data types: %u %u %u\n", PQftype(res, 4), PQftype(res, 5), PQftype(res, 6));
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
write_target_range(chunk, chunkoff, chunksize, chunkspace);
}
#define PQ_FORMAT(index) (PQfformat(res, 0) != 1)
if (PQ_FORMAT(4) && PQ_FORMAT(5) && PQ_FORMAT(6)) {
pg_fatal("unexpected result format while fetching remote files\n");
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
}
#define PQ_ISNULL(index) (PQgetisnull(res, 0, (index)))
if (PQ_ISNULL(4) || PQ_ISNULL(5) || PQ_ISNULL(6)) {
pg_fatal("unexpected null values in result while fetching remote files\n");
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
PQclear(res);
res = NULL;
}
return BUILD_SUCCESS;
}
@ -497,6 +564,43 @@ error:
return result;
}
static void CompressedFileCopy(const file_entry_t* entry, bool rebuild)
{
Assert(!rebuild || entry->rewindCompressInfo.oldBlockNumber == 0);
if (dry_run) {
return;
}
char linebuf[MAXPGPATH + 47];
int ret = snprintf_s(linebuf,
sizeof(linebuf),
sizeof(linebuf) - 1,
"%s\t%u\t%u\t%u\t%u\t%u\n",
entry->path,
entry->rewindCompressInfo.oldBlockNumber,
entry->rewindCompressInfo.newBlockNumber - entry->rewindCompressInfo.oldBlockNumber,
entry->rewindCompressInfo.algorithm,
entry->rewindCompressInfo.chunkSize,
rebuild);
securec_check_ss_c(ret, "\0", "\0");
if (PQputCopyData(conn, linebuf, strlen(linebuf)) != 1) {
pg_fatal("could not send COPY data: %s", PQerrorMessage(conn));
}
pg_log(PG_PROGRESS, "CompressedFileCopy:%s", linebuf);
}
static void CompressedFileRemove(const file_entry_t* entry)
{
remove_target((file_entry_t*) entry);
char* path = entry->path;
char dst[MAXPGPATH];
FormatPathToPca(path, dst, MAXPGPATH);
remove_target_file(dst, false);
FormatPathToPcd(path, dst, MAXPGPATH);
remove_target_file(dst, false);
pg_log(PG_PROGRESS, "CompressedFileRemove: %s\n", path);
}
/*
* Write a file range to a temporary table in the server.
*
@ -506,7 +610,7 @@ error:
*/
static void fetch_file_range(const char* path, unsigned int begin, unsigned int end)
{
char linebuf[MAXPGPATH + 23];
char linebuf[MAXPGPATH + 47];
int ss_c = 0;
/* Split the range into CHUNKSIZE chunks */
@ -518,12 +622,12 @@ static void fetch_file_range(const char* path, unsigned int begin, unsigned int
} else {
len = end - begin;
}
ss_c = snprintf_s(linebuf, sizeof(linebuf), sizeof(linebuf) - 1, "%s\t%u\t%u\n", path, begin, len);
ss_c = snprintf_s(
linebuf, sizeof(linebuf), sizeof(linebuf) - 1, "%s\t%u\t%u\t%u\t%u\t%u\n", path, begin, len, 0, 0, 0);
securec_check_ss_c(ss_c, "\0", "\0");
if (PQputCopyData(conn, linebuf, strlen(linebuf)) != 1)
pg_fatal("could not send COPY data: %s", PQerrorMessage(conn));
begin += len;
}
}
@ -542,7 +646,8 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file)
* First create a temporary table, and load it with the blocks that we
* need to fetch.
*/
sql = "CREATE TEMPORARY TABLE fetchchunks(path text, begin int4, len int4);";
sql = "CREATE TEMPORARY TABLE fetchchunks(path text, begin int4, len int4, "
"algorithm int4, chunksize int4, rebuild bool);";
res = PQexec(conn, sql);
if (PQresultStatus(res) != PGRES_COMMAND_OK) {
pg_fatal("could not create temporary table: %s", PQresultErrorMessage(res));
@ -571,11 +676,16 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file)
}
/* report compressed file paths at PROGRESS level so they can be verified */
if (entry->rewindCompressInfo.compressed) {
pg_log(PG_PROGRESS, "path: %s, type: %d, action: %d\n", entry->path, entry->type, entry->action);
}
pg_log(PG_DEBUG, "path: %s, type: %d, action: %d\n", entry->path, entry->type, entry->action);
fprintf(file, "path: %s, type: %d, action: %d\n", entry->path, entry->type, entry->action);
/* If this is a relation file, copy the modified blocks */
execute_pagemap(&entry->pagemap, entry->path, file);
bool compressed = entry->rewindCompressInfo.compressed;
execute_pagemap(entry, file);
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
switch (entry->action) {
@ -584,29 +694,47 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file)
break;
case FILE_ACTION_COPY:
/* Truncate the old file out of the way, if any */
open_target_file(entry->path, true);
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
fetch_file_range(entry->path, 0, entry->newsize);
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
if (compressed) {
CompressedFileCopy(entry, true);
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
} else {
/* Truncate the old file out of the way, if any */
open_target_file(entry->path, true);
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
fetch_file_range(entry->path, 0, entry->newsize);
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
}
break;
case FILE_ACTION_TRUNCATE:
truncate_target_file(entry->path, entry->newsize);
if (compressed) {
CompressedFileTruncate(entry->path, &entry->rewindCompressInfo);
} else {
truncate_target_file(entry->path, entry->newsize);
}
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
break;
case FILE_ACTION_COPY_TAIL:
fetch_file_range(entry->path, entry->oldsize, entry->newsize);
if (compressed) {
CompressedFileCopy(entry, false);
} else {
fetch_file_range(entry->path, entry->oldsize, entry->newsize);
}
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
break;
case FILE_ACTION_REMOVE:
remove_target(entry);
if (compressed) {
CompressedFileRemove(entry);
} else {
remove_target(entry);
}
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
break;
case FILE_ACTION_CREATE:
Assert(!compressed);
create_target(entry);
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
break;
@ -638,9 +766,14 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file)
* temporary table. Now, actually fetch all of those ranges.
*/
sql = "SELECT path, begin, \n"
" pg_read_binary_file(path, begin, len, true) AS chunk,\n"
" len \n"
"FROM fetchchunks\n";
" pg_read_binary_file(path, begin, len, true) AS chunk, len, algorithm, chunksize,rebuild \n"
"FROM fetchchunks where algorithm =0 \n"
"union all \n"
"select (json->>'path')::text as path, (json->>'blocknum')::int4 as begin, (json->>'data')::bytea as chunk,\n"
"(json->>'len')::int4 as len, algorithm, chunksize,rebuild \n"
"from (select row_to_json(pg_read_binary_file_blocks(path,begin,len)) json, algorithm, chunksize,rebuild \n"
"from fetchchunks where algorithm !=0) \n"
"order by path, begin;";
fprintf(file, "fetch and write file based on temporary table fetchchunks.\n");
return receiveFileChunks(sql, file);
@ -700,7 +833,7 @@ BuildErrorCode backupFileMap(filemap_t* map)
/* to be supported later */
break;
case FILE_ACTION_COPY:
case FILE_ACTION_COPY: {
/* create fake file for restore when file not exist, otherwise, backup file */
file_entry_t statbuf;
if (targetFilemapSearch(entry->path, &statbuf) < 0) {
@ -709,6 +842,7 @@ BuildErrorCode backupFileMap(filemap_t* map)
backup_target_file(entry->path, divergeXlogFileName);
}
break;
}
case FILE_ACTION_COPY_TAIL:
case FILE_ACTION_TRUNCATE:
@ -732,17 +866,60 @@ BuildErrorCode backupFileMap(filemap_t* map)
return BUILD_SUCCESS;
}
static BuildErrorCode execute_pagemap(datapagemap_t* pagemap, const char* path, FILE* file)
/**
* combine consecutive block numbers into ranges and copy them
* @param entry file entry
* @param file log file for progress output
*/
static void CompressedFileCopy(file_entry_t* entry, FILE* file)
{
datapagemap_t* pagemap = &entry->pagemap;
datapagemap_iterator_t* iter = datapagemap_iterate(pagemap);
BlockNumber blkno;
file_entry_t fileEntry;
fileEntry.path = entry->path;
fileEntry.rewindCompressInfo = entry->rewindCompressInfo;
int invalidNumber = -1;
long int before = invalidNumber;
while (datapagemap_next(iter, &blkno)) {
fprintf(file, " block %u\n", blkno);
if (before == -1) {
fileEntry.rewindCompressInfo.oldBlockNumber = blkno;
before = blkno;
} else {
if (before == blkno - 1) {
before = blkno;
} else {
fileEntry.rewindCompressInfo.newBlockNumber = before + 1;
CompressedFileCopy(&fileEntry, false);
fileEntry.rewindCompressInfo.oldBlockNumber = blkno;
before = blkno;
}
}
}
if (before != invalidNumber) {
fileEntry.rewindCompressInfo.newBlockNumber = before + 1;
CompressedFileCopy(&fileEntry, false);
}
}
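
The run-folding above is the part that is easy to get wrong, so here is a self-contained sketch of the same idea with illustrative names: consecutive block numbers from the page map are folded into half-open [start, end) ranges, one CompressedFileCopy request per range.

#include <cstdint>
#include <utility>
#include <vector>

// Fold sorted block numbers into [start, end) runs of consecutive blocks.
static std::vector<std::pair<uint32_t, uint32_t>> CoalesceBlocks(const std::vector<uint32_t> &blocks)
{
    std::vector<std::pair<uint32_t, uint32_t>> ranges;
    for (uint32_t blk : blocks) {
        if (!ranges.empty() && ranges.back().second == blk) {
            ranges.back().second = blk + 1;    // extends the current run
        } else {
            ranges.push_back({blk, blk + 1});  // starts a new run
        }
    }
    return ranges;
}
// Example: {3, 4, 5, 9, 10, 17} -> [3,6), [9,11), [17,18)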
static BuildErrorCode execute_pagemap(file_entry_t* entry, FILE* file)
{
datapagemap_iterator_t* iter = NULL;
BlockNumber blkno;
off_t offset;
datapagemap_t* pagemap = &entry->pagemap;
char* path = entry->path;
iter = datapagemap_iterate(pagemap);
while (datapagemap_next(iter, &blkno)) {
fprintf(file, " block %u\n", blkno);
offset = blkno * BLCKSZ;
fetch_file_range(path, offset, offset + BLCKSZ);
if (entry->rewindCompressInfo.compressed) {
CompressedFileCopy(entry, file);
} else {
while (datapagemap_next(iter, &blkno)) {
fprintf(file, " block %u\n", blkno);
offset = blkno * BLCKSZ;
fetch_file_range(path, offset, offset + BLCKSZ);
}
}
pg_free(iter);
iter = NULL;
@ -789,9 +966,19 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p
struct stat fst;
char fullpath[MAXPGPATH];
char path[MAXPGPATH];
const size_t MINPCANAMESIZE = 4;
if (strcmp(xlde->d_name, ".") == 0 || strcmp(xlde->d_name, "..") == 0)
continue;
/* Skip compressed page files */
size_t dirNamePath = strlen(xlde->d_name);
if (dirNamePath >= MINPCANAMESIZE) {
const char* suffix = xlde->d_name + dirNamePath - MINPCANAMESIZE;
if (strncmp(suffix, "_pca", MINPCANAMESIZE) == 0 || strncmp(suffix, "_pcd", MINPCANAMESIZE) == 0) {
continue;
}
}
ss_c = snprintf_s(fullpath, MAXPGPATH, MAXPGPATH - 1, "%s/%s", fullparentpath, xlde->d_name);
securec_check_ss_c(ss_c, "\0", "\0");
@ -822,8 +1009,15 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p
continue;
if (S_ISREG(fst.st_mode)) {
if ((uint64)fst.st_size <= MAX_FILE_SIZE) {
callback(path, FILE_TYPE_REGULAR, fst.st_size, NULL);
uint64 fileSize = (uint64)fst.st_size;
RewindCompressInfo rewindCompressInfo;
RewindCompressInfo *pointer = NULL;
if (ProcessLocalPca(path, &rewindCompressInfo)) {
fileSize = rewindCompressInfo.oldBlockNumber * BLCKSZ;
pointer = &rewindCompressInfo;
}
if (fileSize <= MAX_FILE_SIZE) {
callback(path, FILE_TYPE_REGULAR, fileSize, NULL, pointer);
if (increment_return_code != BUILD_SUCCESS) {
(void)closedir(xldir);
}
@ -832,7 +1026,7 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p
pg_log(PG_WARNING, "file size of \"%s\" is over %ld\n", fullpath, MAX_FILE_SIZE);
}
} else if (S_ISDIR(fst.st_mode)) {
callback(path, FILE_TYPE_DIRECTORY, 0, NULL);
callback(path, FILE_TYPE_DIRECTORY, 0, NULL, NULL);
if (increment_return_code != BUILD_SUCCESS) {
(void)closedir(xldir);
}
@ -857,7 +1051,7 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p
}
link_target[len] = '\0';
callback(path, FILE_TYPE_SYMLINK, 0, link_target);
callback(path, FILE_TYPE_SYMLINK, 0, link_target, NULL);
/*
* If it's a symlink within pg_tblspc, we need to recurse into it,

View File

@ -42,7 +42,9 @@ extern XLogRecPtr libpqGetCurrentXlogInsertLocation(void);
extern void libpqRequestCheckpoint(void);
typedef void (*process_file_callback_t)(const char* path, file_type_t type, size_t size, const char* link_target);
typedef void (*process_file_callback_t)(const char* path, file_type_t type, size_t oldsize, const char* link_target,
const RewindCompressInfo* rewindCompressInfo);
extern BuildErrorCode traverse_datadir(const char* datadir, process_file_callback_t callback);
extern void get_source_slotname(void);

View File

@ -25,6 +25,8 @@
#include "common/fe_memutils.h"
#include "common/build_query/build_query.h"
#include "compressed_rewind.h"
#include "storage/page_compression_impl.h"
#include "replication/replicainternal.h"
#define BLOCKSIZE (8 * 1024)
@ -36,6 +38,8 @@ static int dstfd = -1;
static char dstpath[MAXPGPATH] = "";
static bool g_isRelDataFile = false;
static CompressedPcaInfo g_compressedPcaInfo;
static void create_target_dir(const char* path);
static void remove_target_dir(const char* path);
static void create_target_symlink(const char* path, const char* slink);
@ -100,7 +104,7 @@ void close_target_file(void)
dstfd = -1;
}
void write_target_range(char* buf, off_t begin, size_t size, int space)
void write_target_range(char* buf, off_t begin, size_t size, int space, bool compressed)
{
int writeleft;
char* p = NULL;
@ -111,7 +115,7 @@ void write_target_range(char* buf, off_t begin, size_t size, int space)
if (dry_run)
return;
if (begin % BLOCKSIZE != 0) {
if (!compressed && begin % BLOCKSIZE != 0) {
(void)close(dstfd);
dstfd = -1;
pg_fatal("seek position %ld in target file \"%s\" is not in BLOCKSIZEs\n", size, dstpath);
@ -1225,3 +1229,142 @@ bool tablespaceDataIsValid(const char* path)
return true;
}
void CompressedFileTruncate(const char *path, const RewindCompressInfo *rewindCompressInfo)
{
if (dry_run) {
return;
}
uint16 chunkSize = rewindCompressInfo->chunkSize;
BlockNumber oldBlockNumber = rewindCompressInfo->oldBlockNumber;
BlockNumber newBlockNumber = rewindCompressInfo->newBlockNumber;
Assert(oldBlockNumber > newBlockNumber);
char pcaPath[MAXPGPATH];
FormatPathToPca(path, pcaPath, MAXPGPATH, true);
int pcaFd = open(pcaPath, O_RDWR | PG_BINARY, 0600);
if (pcaFd < 0) {
pg_fatal("CompressedFileTruncate: could not open file \"%s\": %s\n", pcaPath, strerror(errno));
return;
}
PageCompressHeader* map = pc_mmap(pcaFd, chunkSize, false);
if (map == MAP_FAILED) {
pg_fatal("CompressedFileTruncate: Failed to mmap file \"%s\": %s\n", pcaPath, strerror(errno));
return;
}
/* zero the chunk addresses of the truncated blocks */
for (BlockNumber blockNumber = newBlockNumber; blockNumber < oldBlockNumber; ++blockNumber) {
PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(map, chunkSize, blockNumber);
for (size_t i = 0; i < addr->allocated_chunks; ++i) {
addr->chunknos[i] = 0;
}
addr->nchunks = 0;
addr->allocated_chunks = 0;
addr->checksum = 0;
}
map->last_synced_nblocks = map->nblocks = newBlockNumber;
/* find the max used chunk number */
pc_chunk_number_t beforeUsedChunks = map->allocated_chunks;
pc_chunk_number_t max_used_chunkno = 0;
for (BlockNumber blockNumber = 0; blockNumber < newBlockNumber; ++blockNumber) {
PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(map, chunkSize, blockNumber);
for (uint8 i = 0; i < addr->allocated_chunks; i++) {
if (addr->chunknos[i] > max_used_chunkno) {
max_used_chunkno = addr->chunknos[i];
}
}
}
map->allocated_chunks = map->last_synced_allocated_chunks = max_used_chunkno;
/* truncate the pcd file */
if (beforeUsedChunks > max_used_chunkno) {
char pcdPath[MAXPGPATH];
FormatPathToPcd(path, pcdPath, MAXPGPATH, false);
truncate_target_file(pcdPath, max_used_chunkno * chunkSize);
}
pc_munmap(map);
pg_log(PG_PROGRESS, "CompressedFileTruncate: %s\n", path);
}
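
A self-contained sketch of the rescan above, with a simplified stand-in for PageCompressAddr: once the truncated blocks have their addresses zeroed, the highest chunk number still referenced by the surviving blocks decides how far the pcd file can be truncated.

#include <cstdint>
#include <vector>

// Simplified stand-in for PageCompressAddr: the chunk numbers a block owns.
struct BlockChunks {
    std::vector<uint32_t> chunknos;
};

// Highest chunk number used by blocks [0, newBlockNumber); the pcd file can
// then be truncated to maxUsed * chunkSize, as CompressedFileTruncate does.
static uint32_t MaxUsedChunk(const std::vector<BlockChunks> &map, uint32_t newBlockNumber)
{
    uint32_t maxUsed = 0;
    for (uint32_t blk = 0; blk < newBlockNumber && blk < map.size(); blk++) {
        for (uint32_t chunkno : map[blk].chunknos) {
            if (chunkno > maxUsed) {
                maxUsed = chunkno;
            }
        }
    }
    return maxUsed;
}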
void OpenCompressedPcaFile(const char* fileName, int32 chunkSize, int32 algorithm, bool rebuild)
{
if (dry_run) {
return;
}
if (g_compressedPcaInfo.pcaFd != -1 && strcmp(fileName, &g_compressedPcaInfo.path[strlen(pg_data) + 1]) == 0) {
/* already open */
return;
}
CloseCompressedPcaFile();
int rc = snprintf_s(g_compressedPcaInfo.path, sizeof(g_compressedPcaInfo.path),
sizeof(g_compressedPcaInfo.path) - 1,
"%s/%s", pg_data, fileName);
securec_check_ss_c(rc, "\0", "\0");
int mode = O_RDWR | PG_BINARY;
mode = rebuild ? (mode | O_TRUNC | O_CREAT) : mode;
g_compressedPcaInfo.pcaFd = open(g_compressedPcaInfo.path, mode, S_IRUSR | S_IWUSR);
if (g_compressedPcaInfo.pcaFd < 0) {
pg_fatal("could not open compressed pca file \"%s\": %s\n", g_compressedPcaInfo.path, strerror(errno));
return;
}
g_compressedPcaInfo.algorithm = algorithm;
g_compressedPcaInfo.chunkSize = chunkSize;
g_compressedPcaInfo.pcaMap = (char*) pc_mmap(g_compressedPcaInfo.pcaFd, chunkSize, false);
if ((void*)g_compressedPcaInfo.pcaMap == MAP_FAILED) {
pg_fatal("OpenCompressedPcaFile: Failed to mmap file \"%s\": %s\n", g_compressedPcaInfo.path, strerror(errno));
return;
}
}
void CloseCompressedPcaFile()
{
if (g_compressedPcaInfo.pcaFd == -1) {
return;
}
pc_munmap((PageCompressHeader*)g_compressedPcaInfo.pcaMap);
if (close(g_compressedPcaInfo.pcaFd) != 0) {
pg_fatal("could not close target file \"%s\": %s\n", g_compressedPcaInfo.path, gs_strerror(errno));
}
g_compressedPcaInfo.pcaFd = -1;
g_compressedPcaInfo.pcaMap = NULL;
g_compressedPcaInfo.chunkSize = 0;
g_compressedPcaInfo.algorithm = 0;
}
void FetchCompressedFile(char* buf, BlockNumber blockNumber, int32 size)
{
int32 chunkSize = g_compressedPcaInfo.chunkSize;
int needChunks = size / chunkSize;
PageCompressHeader* pcMap = (PageCompressHeader*) g_compressedPcaInfo.pcaMap;
PageCompressAddr* pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunkSize, blockNumber);
// allocate chunks for this block if needed
if (pcAddr->allocated_chunks < needChunks) {
auto chunkno = pg_atomic_fetch_add_u32(&pcMap->allocated_chunks, needChunks - pcAddr->allocated_chunks);
for (int i = pcAddr->allocated_chunks; i < needChunks; i++) {
pcAddr->chunknos[i] = ++chunkno;
}
pcAddr->allocated_chunks = needChunks;
}
for (int32 i = 0; i < needChunks; ++i) {
auto buffer_pos = buf + chunkSize * i;
off_t seekpos = (off_t) OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, pcAddr->chunknos[i]);
int32 start = i;
while (i < needChunks - 1 && pcAddr->chunknos[i + 1] == pcAddr->chunknos[i] + 1) {
i++;
}
int write_amount = chunkSize * (i - start + 1);
// write into the already-opened pcd target file (dstfd)
write_target_range(buffer_pos, seekpos, write_amount, 0, true);
}
pcAddr->nchunks = pcAddr->allocated_chunks;
pcAddr->checksum = AddrChecksum32(blockNumber, pcAddr, chunkSize);
}

View File

@ -11,10 +11,11 @@
#define FILE_OPS_H
#include "filemap.h"
#include "compressed_common.h"
extern char* pg_data;
extern void open_target_file(const char* path, bool trunc);
extern void write_target_range(char* buf, off_t begin, size_t size, int space);
extern void write_target_range(char* buf, off_t begin, size_t size, int space, bool compressed = false);
extern void close_target_file(void);
extern void truncate_target_file(const char* path, off_t newsize);
extern void create_target(file_entry_t* t);
@ -41,6 +42,9 @@ extern void delete_target_file(const char* file);
extern bool isPathInFilemap(const char* path);
extern bool tablespaceDataIsValid(const char* path);
extern void copy_file(const char* fromfile, char* tofile);
extern void CompressedFileTruncate(const char* path, const RewindCompressInfo* rewindCompressInfo);
void FetchCompressedFile(char* buf, BlockNumber begin, int32 size);
void OpenCompressedPcaFile(const char* fileName, int32 chunkSize, int32 algorithm, bool rebuild);
void CloseCompressedPcaFile();
#endif /* FILE_OPS_H */

View File

@ -19,6 +19,7 @@
#include "catalog/catalog.h"
#include "catalog/pg_tablespace.h"
#include "common/fe_memutils.h"
#include "compressed_rewind.h"
#include "storage/cu.h"
#include "storage/smgr/fd.h"
@ -147,7 +148,8 @@ void filemapInit(void)
filemaptarget = filemap_create();
}
void processTargetFileMap(const char* path, file_type_t type, size_t oldsize, const char* link_target)
void processTargetFileMap(const char* path, file_type_t type, size_t oldsize, const char* link_target,
const RewindCompressInfo* info)
{
file_entry_t* entry = NULL;
filemap_t* map = filemaptarget;
@ -163,6 +165,8 @@ void processTargetFileMap(const char* path, file_type_t type, size_t oldsize, co
entry->pagemap.bitmap = NULL;
entry->pagemap.bitmapsize = 0;
COPY_REWIND_COMPRESS_INFO(entry, info, info == NULL ? 0 : info->oldBlockNumber, 0)
if (map->last != NULL) {
map->last->next = entry;
map->last = entry;
@ -231,7 +235,7 @@ BuildErrorCode targetFilemapProcess(void)
filemap_t* map = filemaptarget;
for (i = 0; i < map->narray; i++) {
entry = map->array[i];
process_target_file(entry->path, entry->type, entry->oldsize, entry->link_target);
process_target_file(entry->path, entry->type, entry->oldsize, entry->link_target, &entry->rewindCompressInfo);
}
return BUILD_SUCCESS;
}
@ -342,7 +346,8 @@ static bool process_source_file_sanity_check(const char* path, file_type_t type)
* action needs to be taken for the file, depending on whether the file
* exists in the target and whether the size matches.
*/
void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target)
void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target,
RewindCompressInfo* info)
{
bool exists = false;
char localpath[MAXPGPATH];
@ -350,6 +355,7 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con
filemap_t* map = filemap;
file_action_t action = FILE_ACTION_NONE;
size_t oldsize = 0;
BlockNumber oldBlockNumber = 0;
file_entry_t* entry = NULL;
int ss_c = 0;
bool isreldatafile = false;
@ -500,7 +506,21 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con
* replayed.
*/
/* mod blocksize 8k to avoid half page write */
oldsize = statbuf.oldsize;
RewindCompressInfo oldRewindCompressInfo;
bool sourceCompressed = info != NULL;
bool targetCompressed = ProcessLocalPca(path, &oldRewindCompressInfo);
if (sourceCompressed && !targetCompressed) {
info->compressed = false;
action = FILE_ACTION_REMOVE;
break;
} else if (!sourceCompressed && targetCompressed) {
info = &oldRewindCompressInfo;
action = FILE_ACTION_REMOVE;
break;
} else if (sourceCompressed && targetCompressed) {
oldBlockNumber = oldRewindCompressInfo.oldBlockNumber;
oldsize = oldBlockNumber * BLCKSZ;
}
if (oldsize % BLOCKSIZE != 0) {
oldsize = oldsize - (oldsize % BLOCKSIZE);
pg_log(PG_PROGRESS, "target file size mod BLOCKSIZE not equal 0 %s %ld \n", path, statbuf.oldsize);
@ -531,6 +551,8 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con
entry->pagemap.bitmapsize = 0;
entry->isrelfile = isreldatafile;
COPY_REWIND_COMPRESS_INFO(entry, info, oldBlockNumber, info == NULL ? 0 : info->newBlockNumber)
if (map->last != NULL) {
map->last->next = entry;
map->last = entry;
@ -546,7 +568,8 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con
* marks target data directory's files that didn't exist in the source for
* deletion.
*/
void process_target_file(const char* path, file_type_t type, size_t oldsize, const char* link_target)
void process_target_file(const char* path, file_type_t type, size_t oldsize, const char* link_target,
const RewindCompressInfo* info)
{
bool exists = false;
file_entry_t key;
@ -575,7 +598,7 @@ void process_target_file(const char* path, file_type_t type, size_t oldsize, con
*/
for (int excludeIdx = 0; excludeFiles[excludeIdx] != NULL; excludeIdx++) {
if (strstr(path, excludeFiles[excludeIdx]) != NULL) {
pg_log(PG_DEBUG, "entry \"%s\" excluded from target file list", path);
pg_log(PG_DEBUG, "entry \"%s\" excluded from target file list\n", path);
return;
}
}
@ -627,6 +650,9 @@ void process_target_file(const char* path, file_type_t type, size_t oldsize, con
entry->pagemap.bitmapsize = 0;
entry->isrelfile = isRelDataFile(path);
COPY_REWIND_COMPRESS_INFO(entry, info, info == NULL ? 0 : info->oldBlockNumber, 0)
RewindCompressInfo *rewindCompressInfo = NULL;
COPY_REWIND_COMPRESS_INFO(entry, rewindCompressInfo, 0, 0)
if (map->last == NULL)
map->first = entry;
else

View File

@ -8,6 +8,7 @@
#ifndef FILEMAP_H
#define FILEMAP_H
#include "compressed_common.h"
#include "storage/smgr/relfilenode.h"
#include "storage/buf/block.h"
@ -42,6 +43,9 @@ typedef struct file_entry_t {
file_action_t action;
/* for compressed table */
RewindCompressInfo rewindCompressInfo;
/* for a regular file */
size_t oldsize;
size_t newsize;
@ -96,8 +100,10 @@ extern void print_filemap(void);
extern void print_filemap_to_file(FILE* file);
/* Functions for populating the filemap */
extern void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target);
extern void process_target_file(const char* path, file_type_t type, size_t newsize, const char* link_target);
extern void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target,
RewindCompressInfo* rewindCompressInfo = nullptr);
extern void process_target_file(const char* path, file_type_t type, size_t newsize, const char* link_target,
const RewindCompressInfo* rewindCompressInfo = nullptr);
extern void process_block_change(ForkNumber forknum, RelFileNode rnode, BlockNumber blkno);
extern void filemap_finalize(void);
extern int targetFilemapSearch(const char* path, file_entry_t* entry);

View File

@ -161,7 +161,7 @@ BuildErrorCode findCommonCheckpoint(const char* datadir, TimeLineID tli, XLogRec
pg_fatal("find max lsn fail, errmsg:%s\n", returnmsg);
return BUILD_FATAL;
}
pg_log(PG_PROGRESS, "find max lsn success, %s\n", returnmsg);
pg_log(PG_PROGRESS, "find max lsn success, %s", returnmsg);
readprivate.datadir = datadir;
readprivate.tli = tli;

View File

@ -3626,8 +3626,9 @@
AddBuiltinFunc(_0(9038), _1("gs_query_standby_cluster_barrier_id_exist"), _2(1), _3(true), _4(false), _5(gs_query_standby_cluster_barrier_id_exist), _6(16), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(1, 25), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_query_standby_cluster_barrier_id_exist"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0))
),
AddFuncGroup(
"gs_read_block_from_remote", 1,
AddBuiltinFunc(_0(4767), _1("gs_read_block_from_remote"), _2(10), _3(true), _4(false), _5(gs_read_block_from_remote), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(10, 26, 26, 26, 21, 23, 28, 23, 28, 16, 23), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_read_block_from_remote"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0))
"gs_read_block_from_remote", 2,
AddBuiltinFunc(_0(4767), _1("gs_read_block_from_remote"), _2(10), _3(true), _4(false), _5(gs_read_block_from_remote), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(10, 26, 26, 26, 21, 23, 28, 23, 28, 16, 23), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_read_block_from_remote"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)),
AddBuiltinFunc(_0(5843), _1("gs_read_block_from_remote"), _2(11), _3(true), _4(false), _5(gs_read_block_from_remote_compress), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(11, 23, 23, 23, 21, 21, 23, 28, 23, 28, 16, 23), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_read_block_from_remote_compress"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0))
),
AddFuncGroup(
"gs_respool_exception_info", 1,
@ -8082,6 +8083,10 @@
AddBuiltinFunc(_0(3827), _1("pg_read_binary_file"), _2(4), _3(true), _4(false), _5(pg_read_binary_file), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(4, 25, 20, 20, 16), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_read_binary_file"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("read bytea from a file"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)),
AddBuiltinFunc(_0(3828), _1("pg_read_binary_file"), _2(1), _3(true), _4(false), _5(pg_read_binary_file_all), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(1, 25), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_read_binary_file_all"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("read bytea from a file"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0))
),
AddFuncGroup(
"pg_read_binary_file_blocks", 1,
AddBuiltinFunc(_0(8413), _1("pg_read_binary_file_blocks"), _2(3), _3(true), _4(true), _5(pg_read_binary_file_blocks), _6(2249), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(100), _11(20), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(3, 25, 20, 20), _21(7, 25, 20, 20, 25, 23, 23, 17), _22(7, 'i', 'i', 'i', 'o', 'o', 'o', 'o'), _23(7, "input", "blocknum", "blockcount", "path", "blocknum", "len", "data"), _24(NULL), _25("pg_read_binary_file_blocks"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'))
),
AddFuncGroup(
"pg_read_file", 2,
AddBuiltinFunc(_0(2624), _1("pg_read_file"), _2(3), _3(true), _4(false), _5(pg_read_file), _6(25), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(3, 25, 20, 20), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_read_file"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("read text from a file - old version for adminpack 1.0"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)),

View File

@ -87,6 +87,7 @@
#include "pgxc/groupmgr.h"
#include "storage/buf/buf.h"
#include "storage/predicate.h"
#include "storage/page_compression.h"
#include "storage/buf/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/smgr/smgr.h"
@ -514,8 +515,9 @@ static void InitSubPartitionDef(Partition newPartition, Oid partOid, char strate
*/
Relation heap_create(const char* relname, Oid relnamespace, Oid reltablespace, Oid relid, Oid relfilenode,
Oid bucketOid, TupleDesc tupDesc, char relkind, char relpersistence, bool partitioned_relation, bool rowMovement,
bool shared_relation, bool mapped_relation, bool allow_system_table_mods, int8 row_compress, Oid ownerid,
bool skip_create_storage, TableAmType tam_type, int8 relindexsplit, StorageType storage_type, bool newcbi)
bool shared_relation, bool mapped_relation, bool allow_system_table_mods, int8 row_compress, Datum reloptions,
Oid ownerid, bool skip_create_storage, TableAmType tam_type, int8 relindexsplit, StorageType storage_type,
bool newcbi, Oid accessMethodObjectId)
{
bool create_storage = false;
Relation rel;
@ -626,9 +628,11 @@ Relation heap_create(const char* relname, Oid relnamespace, Oid reltablespace, O
relpersistence,
relkind,
row_compress,
reloptions,
tam_type,
relindexsplit,
storage_type
storage_type,
accessMethodObjectId
);
if (partitioned_relation) {
@ -2712,6 +2716,7 @@ Oid heap_create_with_catalog(const char *relname, Oid relnamespace, Oid reltable
mapped_relation,
allow_system_table_mods,
row_compress,
reloptions,
ownerid,
false,
tam,
@ -5248,7 +5253,7 @@ void dropDeltaTableOnPartition(Oid partId)
*
*/
Partition heapCreatePartition(const char* part_name, bool for_partitioned_table, Oid part_tablespace, Oid part_id,
Oid partFileNode, Oid bucketOid, Oid ownerid, StorageType storage_type, bool newcbi)
Oid partFileNode, Oid bucketOid, Oid ownerid, StorageType storage_type, bool newcbi, Datum reloptions)
{
Partition new_part_desc = NULL;
bool createStorage = false;
@ -5301,7 +5306,8 @@ Partition heapCreatePartition(const char* part_name, bool for_partitioned_table,
part_id, /* partition oid */
partFileNode, /* partition's file node, same as partition oid*/
part_tablespace,
for_partitioned_table ? HEAP_DISK : storage_type);
for_partitioned_table ? HEAP_DISK : storage_type,
reloptions);
/*
* Save newcbi as a context indicator to
@ -5805,7 +5811,9 @@ Oid heapAddRangePartition(Relation pgPartRel, Oid partTableOid, Oid partTablespa
newPartrelfileOid,
bucketOid,
ownerid,
storage_type);
storage_type,
false,
reloptions);
Assert(newPartitionOid == PartitionGetPartid(newPartition));
if (isSubpartition) {
@ -6012,7 +6020,9 @@ Oid HeapAddIntervalPartition(Relation pgPartRel, Relation rel, Oid partTableOid,
partrelfileOid,
bucketOid,
ownerid,
storage_type);
storage_type,
false,
reloptions);
pfree(partName);
Assert(newPartitionOid == PartitionGetPartid(newPartition));
@ -6100,7 +6110,8 @@ Oid HeapAddListPartition(Relation pgPartRel, Oid partTableOid, Oid partTablespac
forPartitionTable = false;
}
newListPartition = heapCreatePartition(newListPartDef->partitionName, forPartitionTable, newPartitionTableSpaceOid,
newListPartitionOid, partrelfileOid, bucketOid, ownerid, storage_type);
newListPartitionOid, partrelfileOid, bucketOid, ownerid, storage_type, false,
reloptions);
Assert(newListPartitionOid == PartitionGetPartid(newListPartition));
@ -6386,7 +6397,9 @@ Oid HeapAddHashPartition(Relation pgPartRel, Oid partTableOid, Oid partTablespac
partrelfileOid,
bucketOid,
ownerid,
storage_type);
storage_type,
false,
reloptions);
Assert(newHashPartitionOid == PartitionGetPartid(newHashPartition));
if (isSubpartition) {
@ -6561,7 +6574,9 @@ static void addNewPartitionTupleForTable(Relation pg_partition_rel, const char*
new_partition_rfoid,
InvalidOid,
ownerid,
HEAP_DISK);
HEAP_DISK,
false,
reloptions);
Assert(new_partition_oid == PartitionGetPartid(new_partition));
new_partition->pd_part->parttype = PART_OBJ_TYPE_PARTED_TABLE;

View File

@ -912,9 +912,9 @@ Oid index_create(Relation heapRelation, const char *indexRelationName, Oid index
indexRelation = heap_create(indexRelationName, namespaceId, tableSpaceId, indexRelationId, relFileNode,
RELATION_CREATE_BUCKET(heapRelation) ? heapRelation->rd_bucketoid : InvalidOid, indexTupDesc, relKind,
relpersistence, isLocalPart, false, shared_relation, mapped_relation, allow_system_table_mods,
REL_CMPRS_NOT_SUPPORT, heapRelation->rd_rel->relowner, skip_create_storage,
REL_CMPRS_NOT_SUPPORT, (Datum)reloptions, heapRelation->rd_rel->relowner, skip_create_storage,
isUstore ? TAM_USTORE : TAM_HEAP, /* XXX: Index tables are by default HEAP Table Type */
relindexsplit, storage_type, extra->crossBucket);
relindexsplit, storage_type, extra->crossBucket, accessMethodObjectId);
Assert(indexRelationId == RelationGetRelid(indexRelation));
@ -932,7 +932,6 @@ Oid index_create(Relation heapRelation, const char *indexRelationName, Oid index
* XXX should have a cleaner way to create cataloged indexes
*/
indexRelation->rd_rel->relowner = heapRelation->rd_rel->relowner;
indexRelation->rd_rel->relam = accessMethodObjectId;
indexRelation->rd_rel->relhasoids = false;
if (accessMethodObjectId == PSORT_AM_OID) {
@ -1244,7 +1243,8 @@ Oid partition_index_create(const char* partIndexName, /* the name of partition i
parentIndex->rd_bucketoid,
parentIndex->rd_rel->relowner,
RelationGetStorageType(parentIndex),
extra->crossbucket);
extra->crossbucket,
indexRelOptions);
partitionIndex->pd_part->parttype = PART_OBJ_TYPE_INDEX_PARTITION;
partitionIndex->pd_part->rangenum = 0;
partitionIndex->pd_part->parentid = parentIndexId;
@ -1282,9 +1282,13 @@ Oid partition_index_create(const char* partIndexName, /* the name of partition i
partitionIndex->pd_part->relfrozenxid = (ShortTransactionId)InvalidTransactionId;
/* insert into pg_partition */
#ifndef ENABLE_MULTIPLE_NODES
insertPartitionEntry(pg_partition_rel, partitionIndex, partitionIndex->pd_id, NULL, NULL, 0, 0, 0, indexRelOptions,
PART_OBJ_TYPE_INDEX_PARTITION);
#else
insertPartitionEntry(
pg_partition_rel, partitionIndex, partitionIndex->pd_id, NULL, NULL, 0, 0, 0, 0, PART_OBJ_TYPE_INDEX_PARTITION);
#endif
/* Make the above change visible */
CommandCounterIncrement();

View File

@ -319,17 +319,30 @@ void log_smgrcreate(RelFileNode* rnode, ForkNumber forkNum)
if (IsSegmentFileNode(*rnode)) {
return;
}
xl_smgr_create_compress xlrec;
uint size;
uint8 info = XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE;
/*
* copy the compression options into the WAL record if the relation is compressed
*/
if (rnode->opt != 0) {
xlrec.pageCompressOpts = rnode->opt;
size = sizeof(xl_smgr_create_compress);
info |= XLR_REL_COMPRESS;
} else {
size = sizeof(xl_smgr_create);
}
/*
* Make an XLOG entry reporting the file creation.
*/
xl_smgr_create xlrec;
xlrec.forkNum = forkNum;
RelFileNodeRelCopy(xlrec.rnode, *rnode);
xlrec.xlrec.forkNum = forkNum;
RelFileNodeRelCopy(xlrec.xlrec.rnode, *rnode);
XLogBeginInsert();
XLogRegisterData((char*)&xlrec, sizeof(xlrec));
XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE, rnode->bucketNode);
XLogRegisterData((char*)&xlrec, size);
XLogInsert(RM_SMGR_ID, info, rnode->bucketNode);
}
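For readers following the redo side further down, the compressed create record is assumed to simply extend the plain one, which is what allows smgr_redo() to cast XLogRecGetData() to xl_smgr_create* unconditionally and read the trailing field only when XLR_REL_COMPRESS is set. A minimal sketch of that assumed layout (the real definition lives in the storage headers, not in this hunk):
typedef struct xl_smgr_create_compress {
    xl_smgr_create xlrec;        /* plain record must come first so the common fields stay readable */
    uint2 pageCompressOpts;      /* compressed-table options, meaningful only with XLR_REL_COMPRESS */
} xl_smgr_create_compress;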
static void CStoreRelDropStorage(Relation rel, RelFileNode* rnode, Oid ownerid)
@ -691,14 +704,24 @@ void RelationTruncate(Relation rel, BlockNumber nblocks)
* Make an XLOG entry reporting the file truncation.
*/
XLogRecPtr lsn;
xl_smgr_truncate xlrec;
xl_smgr_truncate_compress xlrec;
uint size;
uint8 info = XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE;
xlrec.blkno = nblocks;
RelFileNodeRelCopy(xlrec.rnode, rel->rd_node);
xlrec.xlrec.blkno = nblocks;
if (rel->rd_node.opt != 0) {
xlrec.pageCompressOpts = rel->rd_node.opt;
size = sizeof(xl_smgr_truncate_compress);
info |= XLR_REL_COMPRESS;
} else {
size = sizeof(xl_smgr_truncate);
}
RelFileNodeRelCopy(xlrec.xlrec.rnode, rel->rd_node);
XLogBeginInsert();
XLogRegisterData((char*)&xlrec, sizeof(xlrec));
XLogRegisterData((char*)&xlrec, size);
lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE, rel->rd_node.bucketNode);
/*
@ -1213,7 +1236,7 @@ void smgr_redo(XLogReaderState* record)
{
XLogRecPtr lsn = record->EndRecPtr;
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
bool compress = XLogRecGetInfo(record) & XLR_REL_COMPRESS;
/* Backup blocks are not used in smgr records */
Assert(!XLogRecHasAnyBlockRefs(record));
@ -1222,14 +1245,14 @@ void smgr_redo(XLogReaderState* record)
RelFileNode rnode;
RelFileNodeCopy(rnode, xlrec->rnode, XLogRecGetBucketId(record));
smgr_redo_create(rnode, xlrec->forkNum, (char *)xlrec);
/* Redo column file, attid is hidden in forkNum */
rnode.opt = compress ? ((xl_smgr_create_compress*)XLogRecGetData(record))->pageCompressOpts : 0;
smgr_redo_create(rnode, xlrec->forkNum, (char *)xlrec);
/* Redo column file, attid is hidden in forkNum */
} else if (info == XLOG_SMGR_TRUNCATE) {
xl_smgr_truncate* xlrec = (xl_smgr_truncate*)XLogRecGetData(record);
RelFileNode rnode;
RelFileNodeCopy(rnode, xlrec->rnode, XLogRecGetBucketId(record));
rnode.opt = compress ? ((xl_smgr_truncate_compress*)XLogRecGetData(record))->pageCompressOpts : 0;
/*
* Forcibly create relation if it doesn't exist (which suggests that
* it was dropped somewhere later in the WAL sequence). As in

View File

@ -3695,12 +3695,21 @@ IndexStmt* transformIndexStmt(Oid relid, IndexStmt* stmt, const char* queryStrin
if (!isColStore && (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIN_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIST_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_USTORE_INDEX_TYPE))) {
/* row store only support btree/ubtree/gin/gist index */
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_USTORE_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE))) {
/* row store only support btree/ubtree/gin/gist/hash index */
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("access method \"%s\" does not support row store", stmt->accessMethod)));
}
if (0 == pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE) &&
t_thrd.proc->workingVersionNum < SUPPORT_HASH_XLOG_VERSION_NUM) {
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("access method \"%s\" does not support row store", stmt->accessMethod)));
}
if (isColStore && (!isPsortMothed && !isCBtreeMethod && !isCGinBtreeMethod)) {
/* column store support psort/cbtree/gin index */
ereport(ERROR,

View File

@ -70,6 +70,7 @@
#include "storage/custorage.h"
#include "storage/smgr/segment.h"
#include "storage/cstore/cstore_compress.h"
#include "storage/page_compression.h"
#include "vecexecutor/vecnodes.h"
#ifdef PGXC
@ -792,6 +793,7 @@ int64 calculate_relation_size(RelFileNode* rfn, BackendId backend, ForkNumber fo
relationpath = relpathbackend(*rfn, backend, forknum);
bool rowCompress = IS_COMPRESSED_RNODE((*rfn), forknum);
for (segcount = 0;; segcount++) {
struct stat fst;
@ -808,7 +810,7 @@ int64 calculate_relation_size(RelFileNode* rfn, BackendId backend, ForkNumber fo
else
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", pathname)));
}
totalsize += fst.st_size;
totalsize += rowCompress ? CalculateMainForkSize((char*)pathname, rfn, forknum) : fst.st_size;
}
pfree_ext(relationpath);

View File

@ -316,6 +316,132 @@ Datum pg_read_binary_file_all(PG_FUNCTION_ARGS)
PG_RETURN_BYTEA_P(read_binary_file(filename, 0, -1, false));
}
struct CompressAddressItemState {
uint32 blkno;
int segmentNo;
ReadBlockChunksStruct rbStruct;
FILE *pcaFile;
};
static void ReadBinaryFileBlocksFirstCall(PG_FUNCTION_ARGS, int32 startBlockNum, int32 blockCount)
{
char* path = convert_and_check_filename(PG_GETARG_TEXT_PP(0));
int segmentNo = 0;
UndoFileType undoFileType = UNDO_INVALID;
if (!is_row_data_file(path, &segmentNo, &undoFileType)) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("%s is not a relation file.", path)));
}
/* create a function context for cross-call persistence */
FuncCallContext* fctx = SRF_FIRSTCALL_INIT();
/* switch to memory context appropriate for multiple function calls */
MemoryContext mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
/* initialize file scanning code */
CompressAddressItemState* itemState = (CompressAddressItemState*)palloc(sizeof(CompressAddressItemState));
/* mmap the pca file; the mapping is saved in the per-call item state */
char pcaFilePath[MAXPGPATH];
errno_t rc = snprintf_s(pcaFilePath, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, path);
securec_check_ss(rc, "\0", "\0");
FILE* pcaFile = AllocateFile((const char*)pcaFilePath, "rb");
if (pcaFile == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", pcaFilePath)));
}
PageCompressHeader* map = pc_mmap(fileno(pcaFile), ReadChunkSize(pcaFile, pcaFilePath, MAXPGPATH), true);
if (map == MAP_FAILED) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to mmap %s: %m", pcaFilePath)));
}
if ((BlockNumber)startBlockNum + (BlockNumber)blockCount > map->nblocks) {
auto blockNum = map->nblocks;
ReleaseMap(map, pcaFilePath);
ereport(ERROR,
(ERRCODE_INVALID_PARAMETER_VALUE,
errmsg("invalid blocknum \"%d\" and block count \"%d\", the max blocknum is \"%u\"",
startBlockNum,
blockCount,
blockNum)));
}
/* construct ReadBlockChunksStruct */
char* pcdFilePath = (char*)palloc0(MAXPGPATH);
rc = snprintf_s(pcdFilePath, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, path);
securec_check_ss(rc, "\0", "\0");
FILE* fp = AllocateFile(pcdFilePath, "rb");
if (fp == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", pcdFilePath)));
}
char* pageBuffer = (char*)palloc(BLCKSZ);
itemState->pcaFile = pcaFile;
itemState->rbStruct.header = map;
itemState->rbStruct.pageBuffer = pageBuffer;
itemState->rbStruct.pageBufferLen = BLCKSZ;
itemState->rbStruct.fp = fp;
itemState->rbStruct.segmentNo = segmentNo;
itemState->rbStruct.fileName = pcdFilePath;
/*
* build tupdesc for result tuples. This must match this function's
* pg_proc entry!
*/
TupleDesc tupdesc = CreateTemplateTupleDesc(4, false, TAM_HEAP);
TupleDescInitEntry(tupdesc, (AttrNumber)1, "path", TEXTOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber)2, "blocknum", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber)3, "len", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber)4, "data", BYTEAOID, -1, 0);
fctx->tuple_desc = BlessTupleDesc(tupdesc);
itemState->blkno = startBlockNum;
fctx->max_calls = blockCount;
fctx->user_fctx = itemState;
MemoryContextSwitchTo(mctx);
}
Datum pg_read_binary_file_blocks(PG_FUNCTION_ARGS)
{
int32 startBlockNum = PG_GETARG_INT32(1);
int32 blockCount = PG_GETARG_INT32(2);
if (startBlockNum < 0 || blockCount <= 0 || startBlockNum + blockCount > RELSEG_SIZE) {
ereport(ERROR, (ERRCODE_INVALID_PARAMETER_VALUE,
errmsg("invalid blocknum \"%d\" or block count \"%d\"", startBlockNum, blockCount)));
}
/* stuff done only on the first call of the function */
if (SRF_IS_FIRSTCALL()) {
ReadBinaryFileBlocksFirstCall(fcinfo, startBlockNum, blockCount);
}
/* stuff done on every call of the function */
FuncCallContext *fctx = SRF_PERCALL_SETUP();
CompressAddressItemState *itemState = (CompressAddressItemState *)fctx->user_fctx;
if (fctx->call_cntr < fctx->max_calls) {
bytea *buf = (bytea *)palloc(BLCKSZ + VARHDRSZ);
size_t len = ReadAllChunkOfBlock(VARDATA(buf), BLCKSZ, itemState->blkno, itemState->rbStruct);
SET_VARSIZE(buf, len + VARHDRSZ);
Datum values[4];
values[0] = PG_GETARG_DATUM(0);
values[1] = Int32GetDatum(itemState->blkno);
values[2] = Int32GetDatum(len);
values[3] = PointerGetDatum(buf);
/* Build and return the result tuple. */
bool nulls[4];
securec_check(memset_s(nulls, sizeof(nulls), 0, sizeof(nulls)), "\0", "\0");
HeapTuple tuple = heap_form_tuple(fctx->tuple_desc, (Datum*)values, (bool*)nulls);
Datum result = HeapTupleGetDatum(tuple);
itemState->blkno++;
SRF_RETURN_NEXT(fctx, result);
} else {
if (itemState->rbStruct.header != NULL) {
pc_munmap(itemState->rbStruct.header);
}
FreeFile(itemState->pcaFile);
FreeFile(itemState->rbStruct.fp);
SRF_RETURN_DONE(fctx);
}
}
/*
* stat a file

View File

@ -664,3 +664,281 @@ void pglz_decompress(const PGLZ_Header* source, char* dest)
* That's it.
*/
}
/* ----------
* lz_compress -
*
* Compresses source into dest using strategy. Returns the number of
* bytes written in buffer dest, or -1 if compression fails.
* ----------
*/
int32 lz_compress(const char* source, int32 slen, char* dest)
{
unsigned char* bp = (unsigned char*) dest;
unsigned char* bstart = bp;
int hist_next = 0;
bool hist_recycle = false;
const char* dp = source;
const char* dend = source + slen;
unsigned char ctrl_dummy = 0;
unsigned char* ctrlp = &ctrl_dummy;
unsigned char ctrlb = 0;
unsigned char ctrl = 0;
bool found_match = false;
int32 match_len;
int32 match_off;
int32 good_match;
int32 good_drop;
int32 result_size;
int32 result_max;
int32 need_rate;
errno_t rc;
const PGLZ_Strategy* strategy = PGLZ_strategy_always;
/*
* Our fallback strategy is the default.
*/
if (strategy == NULL) {
strategy = PGLZ_strategy_default;
}
/*
* If the strategy forbids compression (at all or if source chunk size out
* of range), fail.
*/
if (strategy->match_size_good <= 0 || slen < strategy->min_input_size || slen > strategy->max_input_size) {
return -1;
}
/*
* Limit the match parameters to the supported range.
*/
good_match = strategy->match_size_good;
if (good_match > PGLZ_MAX_MATCH) {
good_match = PGLZ_MAX_MATCH;
} else if (good_match < 17) {
good_match = 17;
}
good_drop = strategy->match_size_drop;
if (good_drop < 0) {
good_drop = 0;
} else if (good_drop > 100) {
good_drop = 100;
}
need_rate = strategy->min_comp_rate;
if (need_rate < 0) {
need_rate = 0;
} else if (need_rate > 99) {
need_rate = 99;
}
/*
* Compute the maximum result size allowed by the strategy, namely the
* input size minus the minimum wanted compression rate. This had better
* be <= slen, else we might overrun the provided output buffer.
*/
if (slen > (INT_MAX / 100)) {
/* Approximate to avoid overflow */
result_max = (slen / 100) * (100 - need_rate);
} else {
result_max = (slen * (100 - need_rate)) / 100;
}
/*
* Initialize the history lists to empty. We do not need to zero the
* hist_entries[] array; its entries are initialized as they are used.
*/
rc = memset_s(u_sess->utils_cxt.hist_start, HIST_START_LEN, 0, HIST_START_LEN);
securec_check(rc, "\0", "\0");
/*
* Compress the source directly into the output buffer.
*/
while (dp < dend) {
/*
* If we already exceeded the maximum result size, fail.
*
* We check once per loop; since the loop body could emit as many as 4
* bytes (a control byte and 3-byte tag), PGLZ_MAX_OUTPUT() had better
* allow 4 slop bytes.
*/
if (bp - bstart >= result_max) {
return -1;
}
/*
* If we've emitted more than first_success_by bytes without finding
* anything compressible at all, fail. This lets us fall out
* reasonably quickly when looking at incompressible input (such as
* pre-compressed data).
*/
if (!found_match && bp - bstart >= strategy->first_success_by) {
return -1;
}
/*
* Try to find a match in the history
*/
if (pglz_find_match(u_sess->utils_cxt.hist_start, dp, dend, &match_len, &match_off, good_match, good_drop)) {
/*
* Create the tag and add history entries for all matched
* characters.
*/
pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off);
while (match_len--) {
pglz_hist_add(
u_sess->utils_cxt.hist_start, u_sess->utils_cxt.hist_entries, hist_next, hist_recycle, dp,
dend);
dp++; /* Do not do this ++ in the line above! */
/* The macro would do it four times - Jan. */
}
found_match = true;
} else {
/*
* No match found. Copy one literal byte.
*/
pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
pglz_hist_add(
u_sess->utils_cxt.hist_start, u_sess->utils_cxt.hist_entries, hist_next, hist_recycle, dp, dend);
dp++; /* Do not do this ++ in the line above! */
/* The macro would do it four times - Jan. */
}
}
/*
* Write out the last control byte and check that we haven't overrun the
* output size allowed by the strategy.
*/
*ctrlp = ctrlb;
result_size = bp - bstart;
if (result_size >= result_max) {
return -1;
}
/* success */
return result_size;
}
/* ----------
* lz_decompress -
*
* Decompresses source into dest. Returns the number of bytes
* decompressed in the destination buffer, and *optionally*
* checks that both the source and dest buffers have been
* fully read and written to, respectively.
* ----------
*/
int32 lz_decompress(const char* source, int32 slen, char* dest, int32 rawsize, bool check_complete)
{
const unsigned char* sp;
const unsigned char* srcend;
unsigned char* dp;
unsigned char* destend;
errno_t rc = 0;
sp = (const unsigned char*) source;
srcend = ((const unsigned char*) source) + slen;
dp = (unsigned char*) dest;
destend = dp + rawsize;
while (sp < srcend && dp < destend) {
/*
* Read one control byte and process the next 8 items (or as many as
* remain in the compressed input).
*/
unsigned char ctrl = *sp++;
int ctrlc;
for (ctrlc = 0; ctrlc < 8 && sp < srcend && dp < destend; ctrlc++) {
if (ctrl & 1) {
/*
* Set control bit means we must read a match tag. The match
* is coded with two bytes. First byte uses lower nibble to
* code length - 3. Higher nibble contains upper 4 bits of the
* offset. The next following byte contains the lower 8 bits
* of the offset. If the length is coded as 18, another
* extension tag byte tells how much longer the match really
* was (0-255).
*/
int32 len;
int32 off;
len = (sp[0] & 0x0f) + 3;
off = ((sp[0] & 0xf0) << 4) | sp[1];
sp += 2;
if (len == 18) {
len += *sp++;
}
/*
* Now we copy the bytes specified by the tag from OUTPUT to
* OUTPUT (copy len bytes from dp - off to dp). The copied
* areas could overlap; to prevent possible uncertainty, we
* copy only non-overlapping regions.
*/
len = Min(len, destend - dp);
while (off < len) {
/*---------
* When offset is smaller than length - source and
* destination regions overlap. memmove() is resolving
* this overlap in an incompatible way with pglz. Thus we
* resort to memcpy()-ing non-overlapping regions.
*
* Consider input: 112341234123412341234
* At byte 5 here ^ we have match with length 16 and
* offset 4. 11234M(len=16, off=4)
* We are decoding first period of match and rewrite match
* 112341234M(len=12, off=8)
*
* The same match is now at position 9, it points to the
* same start byte of output, but from another position:
* the offset is doubled.
*
* We iterate through this offset growth until we can
* proceed to usual memcpy(). If we would try to decode
* the match at byte 5 (len=16, off=4) by memmove() we
* would issue memmove(5, 1, 16) which would produce
* 112341234XXXXXXXXXXXX, where series of X is 12
* undefined bytes, that were at bytes [5:17].
* ---------
*/
errno_t rc = memcpy_s(dp, off + 1, dp - off, off);
securec_check(rc, "", "");
len -= off;
dp += off;
off += off;
}
rc = memcpy_s(dp, len + 1, dp - off, len);
securec_check(rc, "", "");
dp += len;
} else {
/*
* An unset control bit means LITERAL BYTE. So we just copy
* one from INPUT to OUTPUT.
*/
*dp++ = *sp++;
}
/*
* Advance the control bit
*/
ctrl >>= 1;
}
}
/*
* Check we decompressed the right amount. If we are slicing, then we
* won't necessarily be at the end of the source or dest buffers when we
* hit a stop, so we don't test them.
*/
if (check_complete && (dp != destend || sp != srcend)) {
return -1;
}
/*
* That's it.
*/
return (char*) dp - dest;
}
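Taken together, the two routines above form a straightforward round trip. The following sketch is illustrative only (buffer sizes and the Assert are assumptions, not part of this commit); the output buffer is padded because a single loop iteration of lz_compress may emit up to four bytes past the point where the size check is performed:
char raw[BLCKSZ];               /* page-sized input, filled elsewhere */
char packed[BLCKSZ + 4];        /* result_max is at most BLCKSZ, plus the 4-byte slop */
char unpacked[BLCKSZ];
int32 clen = lz_compress(raw, BLCKSZ, packed);
if (clen < 0) {
    /* the minimum compression rate was not met; keep the page uncompressed */
} else {
    int32 rlen = lz_decompress(packed, clen, unpacked, BLCKSZ, true);
    Assert(rlen == BLCKSZ);     /* check_complete verifies both buffers were fully consumed */
}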

View File

@ -59,6 +59,7 @@
#include "rewrite/rewriteDefine.h"
#include "rewrite/rewriteHandler.h"
#include "storage/lmgr.h"
#include "storage/page_compression.h"
#include "storage/smgr/smgr.h"
#include "storage/smgr/segment.h"
#include "catalog/storage.h"
@ -326,6 +327,7 @@ Partition PartitionBuildDesc(Oid targetPartId, StorageType storage_type, bool in
return partition;
}
void PartitionInitPhysicalAddr(Partition partition)
{
partition->pd_node.spcNode = ConvertToRelfilenodeTblspcOid(partition->pd_part->reltablespace);
@ -350,6 +352,12 @@ void PartitionInitPhysicalAddr(Partition partition)
partition->pd_id)));
}
}
partition->pd_node.opt = 0;
if (partition->rd_options) {
SetupPageCompressForRelation(&partition->pd_node, &((StdRdOptions*)(partition->rd_options))->compress,
PartitionGetPartitionName(partition));
}
}
/*
@ -441,7 +449,7 @@ void PartitionClose(Partition partition)
}
Partition PartitionBuildLocalPartition(const char *relname, Oid partid, Oid partfilenode, Oid parttablespace,
StorageType storage_type)
StorageType storage_type, Datum reloptions)
{
Partition part;
MemoryContext oldcxt;
@ -490,6 +498,11 @@ Partition PartitionBuildLocalPartition(const char *relname, Oid partid, Oid part
if (partfilenode != InvalidOid) {
PartitionInitPhysicalAddr(part);
/* compressed option was set by PartitionInitPhysicalAddr if part->rd_options != NULL */
if (part->rd_options == NULL && reloptions) {
StdRdOptions* options = (StdRdOptions*)default_reloptions(reloptions, false, RELOPT_KIND_HEAP);
SetupPageCompressForRelation(&part->pd_node, &options->compress, PartitionGetPartitionName(part));
}
}
if (storage_type == SEGMENT_PAGE) {

View File

@ -176,6 +176,7 @@
#include "rewrite/rewriteDefine.h"
#include "rewrite/rewriteRlsPolicy.h"
#include "storage/lmgr.h"
#include "storage/page_compression.h"
#include "storage/smgr/smgr.h"
#include "storage/smgr/segment.h"
#include "threadpool/threadpool.h"
@ -1284,7 +1285,6 @@ static void IndexSupportInitialize(Relation relation, oidvector* indclass, Strat
static OpClassCacheEnt* LookupOpclassInfo(Relation relation, Oid operatorClassOid, StrategyNumber numSupport);
static void RelationCacheInitFileRemoveInDir(const char* tblspcpath);
static void unlink_initfile(const char* initfilename);
/*
* ScanPgRelation
*
@ -2499,6 +2499,12 @@ void RelationInitPhysicalAddr(Relation relation)
if (!RelationIsPartitioned(relation) && relation->storage_type == SEGMENT_PAGE) {
relation->rd_node.bucketNode = SegmentBktId;
}
// setup page compression options
relation->rd_node.opt = 0;
if (relation->rd_options && REL_SUPPORT_COMPRESSED(relation)) {
SetupPageCompressForRelation(&relation->rd_node, &((StdRdOptions*)(relation->rd_options))->compress, RelationGetRelationName(relation));
}
}
static void IndexRelationInitKeyNums(Relation relation)
@ -4335,8 +4341,9 @@ void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, SubTrans
* and enter it into the relcache.
*/
Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, TupleDesc tupDesc, Oid relid,
Oid relfilenode, Oid reltablespace, bool shared_relation, bool mapped_relation, char relpersistence, char relkind,
int8 row_compress, TableAmType tam_type, int8 relindexsplit, StorageType storage_type)
Oid relfilenode, Oid reltablespace, bool shared_relation, bool mapped_relation, char relpersistence,
char relkind, int8 row_compress, Datum reloptions, TableAmType tam_type, int8 relindexsplit,
StorageType storage_type, Oid accessMethodObjectId)
{
Relation rel;
MemoryContext oldcxt;
@ -4452,6 +4459,7 @@ Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, Tuple
rel->rd_rel->relowner = BOOTSTRAP_SUPERUSERID;
rel->rd_rel->parttype = PARTTYPE_NON_PARTITIONED_RELATION;
rel->rd_rel->relrowmovement = false;
rel->rd_rel->relam = accessMethodObjectId;
/* set up persistence and relcache fields dependent on it */
rel->rd_rel->relpersistence = relpersistence;
@ -4508,6 +4516,13 @@ Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, Tuple
RelationInitPhysicalAddr(rel);
/* compressed option was set by RelationInitPhysicalAddr if rel->rd_options != NULL */
if (rel->rd_options == NULL && reloptions && SUPPORT_COMPRESSED(relkind, rel->rd_rel->relam)) {
StdRdOptions *options = (StdRdOptions *) default_reloptions(reloptions, false, RELOPT_KIND_HEAP);
SetupPageCompressForRelation(&rel->rd_node, &options->compress, RelationGetRelationName(rel));
}
/* materialized view not initially scannable */
if (relkind == RELKIND_MATVIEW)
rel->rd_isscannable = false;
@ -8106,6 +8121,45 @@ void GetTdeInfoFromRel(Relation rel, TdeInfo *tde_info)
}
}
void SetupPageCompressForRelation(RelFileNode* node, PageCompressOpts* compress_options, const char* relationName)
{
uint1 algorithm = compress_options->compressType;
if (algorithm == COMPRESS_TYPE_NONE) {
node->opt = 0;
} else {
if (!SUPPORT_PAGE_COMPRESSION) {
ereport(ERROR, (errmsg("unsupported page compression on this platform")));
}
uint1 compressLevel;
bool symbol = false;
if (compress_options->compressLevel >= 0) {
symbol = true;
compressLevel = compress_options->compressLevel;
} else {
symbol = false;
compressLevel = -compress_options->compressLevel;
}
bool success = false;
uint1 chunkSize = ConvertChunkSize(compress_options->compressChunkSize, &success);
if (!success) {
ereport(ERROR, (errmsg("invalid compress_chunk_size %d , must be one of %d, %d, %d or %d for %s",
compress_options->compressChunkSize, BLCKSZ / 16, BLCKSZ / 8, BLCKSZ / 4, BLCKSZ / 2,
relationName)));
}
uint1 preallocChunks;
if (compress_options->compressPreallocChunks >= BLCKSZ / compress_options->compressChunkSize) {
ereport(ERROR, (errmsg("invalid compress_prealloc_chunks %d , must be less than %d for %s",
compress_options->compressPreallocChunks,
BLCKSZ / compress_options->compressChunkSize, relationName)));
} else {
preallocChunks = (uint1)(compress_options->compressPreallocChunks);
}
node->opt = 0;
SET_COMPRESS_OPTION((*node), compress_options->compressByteConvert, compress_options->compressDiffConvert,
preallocChunks, symbol, compressLevel, algorithm, chunkSize);
}
}
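To make the checks above concrete for the default 8 KB block size: compress_chunk_size must be one of 512, 1024, 2048 or 4096, and compress_prealloc_chunks must stay below BLCKSZ divided by the chunk size. The struct literal below is purely illustrative (the values are assumptions; the field names are the ones used by the reloptions added later in this commit):
PageCompressOpts opt = {0};
opt.compressType = 2;               /* zstd, per the reloption description "none, pglz or zstd" */
opt.compressLevel = 3;
opt.compressChunkSize = 1024;       /* one of BLCKSZ/16, /8, /4, /2 */
opt.compressPreallocChunks = 2;     /* passes the check: 2 < 8192 / 1024 */
opt.compressByteConvert = true;
opt.compressDiffConvert = true;     /* only valid together with compress_byte_convert */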
char RelationGetRelReplident(Relation r)
{
bool isNull = false;

View File

@ -59,7 +59,7 @@ bool open_join_children = true;
bool will_shutdown = false;
/* hard-wired binary version number */
const uint32 GRAND_VERSION_NUM = 92602;
const uint32 GRAND_VERSION_NUM = 92603;
const uint32 PREDPUSH_SAME_LEVEL_VERSION_NUM = 92522;
const uint32 UPSERT_WHERE_VERSION_NUM = 92514;
@ -108,6 +108,7 @@ const uint32 V5R2C00_START_VERSION_NUM = 92350;
const uint32 V5R2C00_BACKEND_VERSION_NUM = 92412;
const uint32 ANALYZER_HOOK_VERSION_NUM = 92592;
const uint32 SUPPORT_HASH_XLOG_VERSION_NUM = 92603;
/* This variable indicates wheather the instance is in progress of upgrade as a whole */
uint32 volatile WorkingGrandVersionNum = GRAND_VERSION_NUM;

View File

@ -971,6 +971,7 @@ const char* const config_group_names[] = {
/* INSTRUMENTS_OPTIONS */
gettext_noop("Instruments Options"),
gettext_noop("Column Encryption"),
gettext_noop("Compress Options"),
#ifdef PGXC
/* DATA_NODES */
gettext_noop("Datanodes and Connection Pooling"),

View File

@ -114,6 +114,7 @@ bool gs_memory_enjection(void)
}
#endif
/*
* check if the node is on heavy memory status now?
* if strict is true, we'll do some pre-checks first.
@ -907,6 +908,36 @@ int MemoryProtectFunctions::gs_posix_memalign(void** memptr, Size alignment, Siz
return ENOMEM; /* insufficient memory */
}
/**
* reserve memory for mmap of compressed table
* @tparam type only MEM_SHRD is supported
* @param sz reserved size(bytes)
* @param needProtect
* @return success or not
*/
template <MemType type>
bool MemoryProtectFunctions::gs_memprot_reserve(Size sz, bool needProtect)
{
if (type != MEM_SHRD) {
return false;
}
return memTracker_ReserveMem<type>(sz, needProtect);
}
/**
* release the memory allocated by gs_memprot_reserve
* @tparam type MEM_SHRD is supported only
* @param sz free size(bytes)
*/
template <MemType type>
void MemoryProtectFunctions::gs_memprot_release(Size sz)
{
if (type != MEM_SHRD) {
return;
}
memTracker_ReleaseMem<type>(sz);
}
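The two templates above are intended to be used as a pair around the lifetime of a compressed-table mmap. A minimal sketch under that assumption (the size value is hypothetical; the real callers compute it from the pca file layout):
Size pcaMapSize = (Size)64 * 1024;   /* hypothetical size of a pca mapping */
if (MemoryProtectFunctions::gs_memprot_reserve<MEM_SHRD>(pcaMapSize, false)) {
    /* ... pc_mmap() and pc_munmap() of the address file run while the reservation is held ... */
    MemoryProtectFunctions::gs_memprot_release<MEM_SHRD>(pcaMapSize);
}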
/* thread level initialization */
void gs_memprot_thread_init(void)
{

View File

@ -452,9 +452,6 @@ static void ResourceOwnerReleaseInternal(
MemoryContextDelete(memContext);
ResourceOwnerForgetGMemContext(t_thrd.utils_cxt.TopTransactionResourceOwner, memContext);
}
/* Clean up index scans too */
ReleaseResources_hash();
}
t_thrd.utils_cxt.CurrentResourceOwner = save;

View File

@ -110,6 +110,7 @@
#include <limits.h>
#include "access/nbtree.h"
#include "access/hash.h"
#include "access/tableam.h"
#include "access/ustore/knl_utuple.h"
#include "access/tableam.h"
@ -415,6 +416,7 @@ struct Tuplesortstate {
* These variables are specific to the IndexTuple case; they are set by
* tuplesort_begin_index_xxx and used only by the IndexTuple routines.
*/
Relation heapRel; /* table the index is being built on */
Relation indexRel; /* index being built */
/* These are specific to the index_btree subcase: */
@ -422,7 +424,9 @@ struct Tuplesortstate {
bool enforceUnique; /* complain if we find duplicate tuples */
/* These are specific to the index_hash subcase: */
uint32 hash_mask; /* mask for sortable part of hash code */
uint32 high_mask; /* masks for sortable part of hash code */
uint32 low_mask;
uint32 max_buckets;
/*
* These variables are specific to the Datum case; they are set by
@ -970,7 +974,8 @@ Tuplesortstate* tuplesort_begin_index_btree(
}
Tuplesortstate* tuplesort_begin_index_hash(
Relation indexRel, uint32 hash_mask, int workMem, bool randomAccess, int maxMem)
Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask,
uint32 max_buckets, int workMem, bool randomAccess, int maxMem)
{
Tuplesortstate* state = tuplesort_begin_common(workMem, randomAccess);
MemoryContext oldcontext;
@ -980,11 +985,12 @@ Tuplesortstate* tuplesort_begin_index_hash(
#ifdef TRACE_SORT
if (u_sess->attr.attr_common.trace_sort) {
elog(LOG,
"begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c, maxMem = %d",
hash_mask,
workMem,
randomAccess ? 't' : 'f',
maxMem);
"begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
"max_buckets = 0x%x, workMem = %d, randomAccess = %c",
high_mask,
low_mask,
max_buckets,
workMem, randomAccess ? 't' : 'f');
}
#endif
@ -999,9 +1005,12 @@ Tuplesortstate* tuplesort_begin_index_hash(
#endif
state->reversedirection = reversedirection_index_hash;
state->heapRel = heapRel;
state->indexRel = indexRel;
state->hash_mask = hash_mask;
state->high_mask = high_mask;
state->low_mask = low_mask;
state->max_buckets = max_buckets;
state->maxMem = maxMem * 1024L;
(void)MemoryContextSwitchTo(oldcontext);
@ -3810,8 +3819,8 @@ static int comparetup_index_btree(const SortTuple* a, const SortTuple* b, Tuples
static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tuplesortstate* state)
{
uint32 hash1;
uint32 hash2;
Bucket bucket1;
Bucket bucket2;
IndexTuple tuple1;
IndexTuple tuple2;
@ -3820,13 +3829,17 @@ static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tupleso
* that the first column of the index tuple is the hash key.
*/
Assert(!a->isnull1);
hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
state->max_buckets, state->high_mask,
state->low_mask);
Assert(!b->isnull1);
hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
state->max_buckets, state->high_mask,
state->low_mask);
if (hash1 > hash2) {
if (bucket1 > bucket2) {
return 1;
} else if (hash1 < hash2) {
} else if (bucket1 < bucket2) {
return -1;
}
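The bucket mapping used here follows the standard hash-index rule: mask the hash code with the larger mask, and if that lands beyond the highest bucket that currently exists, fall back to the smaller mask. A sketch of that logic, mirroring upstream PostgreSQL's _hash_hashkey2bucket (which is not part of this hunk):
static Bucket hashkey_to_bucket(uint32 hashkey, Bucket maxbucket, uint32 highmask, uint32 lowmask)
{
    Bucket bucket = hashkey & highmask;
    if (bucket > maxbucket)
        bucket = bucket & lowmask;      /* wrap back into the range that has already been split */
    return bucket;
}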

View File

@ -696,6 +696,7 @@ ifneq ($(with_openeuler_os), yes)
cp -d $(with_3rd)/$(BINARYPATH)/event/$(LIB_SUPPORT_LLT)/lib/libevent* '$(DESTDIR)$(libdir)/'
cp $(SECUREDYNAMICLIB_HOME)/libsecurec* '$(DESTDIR)$(libdir)/'
ifneq (, $(findstring __USE_NUMA, $(CFLAGS)))
cp $(ZSTD_LIB_PATH)/libzstd* '$(DESTDIR)$(libdir)/'
cp $(NUMA_LIB_PATH)/* '$(DESTDIR)$(libdir)/'
endif
ifeq ($(enable_mot), yes)

View File

@ -234,6 +234,7 @@ Boot_CreateStmt:
mapped_relation,
true,
REL_CMPRS_NOT_SUPPORT,
(Datum)0,
BOOTSTRAP_SUPERUSERID,
false,
TAM_HEAP,

View File

@ -246,6 +246,8 @@ int GetRemoteConnInfo(char* remoteAddress, char* remoteReadConnInfo, int len)
* @IN spcnode: tablespace id
* @IN dbnode: database id
* @IN relnode: relfilenode
* @IN bucketnode: bucketnode
* @IN opt: compressed table options
* @IN/OUT forknum: forknum
* @IN/OUT blocknum: block number
* @IN/OUT blocksize: block size
@ -284,7 +286,7 @@ extern int RemoteGetPage(char* remoteAddress, RepairBlockKey *key, uint32 blocks
tnRet = snprintf_s(sqlCommands, MAX_PATH_LEN, MAX_PATH_LEN - 1,
"SELECT gs_read_block_from_remote(%u, %u, %u, %d, %d, '%lu', %u, '%lu', false, %d);",
key->relfilenode.spcNode, key->relfilenode.dbNode, key->relfilenode.relNode,
key->relfilenode.bucketNode, key->forknum, key->blocknum, blocksize, lsn, timeout);
key->relfilenode.bucketNode, key->relfilenode.opt, key->forknum, key->blocknum, blocksize, lsn, timeout);
}
securec_check_ss(tnRet, "", "");

View File

@ -1054,6 +1054,14 @@ Oid DefineIndex(Oid relationId, IndexStmt* stmt, Oid indexRelationId, bool is_al
}
}
TableCreateSupport indexCreateSupport{false,false,false,false,false,false};
ListCell* cell = NULL;
foreach (cell, stmt->options) {
DefElem* defElem = (DefElem*)lfirst(cell);
SetOneOfCompressOption(defElem->defname, &indexCreateSupport);
}
CheckCompressOption(&indexCreateSupport);
/*
* Parse AM-specific options, convert to text array form, validate.
*/

View File

@ -125,6 +125,7 @@
#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/lock/lock.h"
#include "storage/page_compression.h"
#include "storage/predicate.h"
#include "storage/remote_read.h"
#include "storage/smgr/segment.h"
@ -1090,10 +1091,10 @@ static bool isOrientationSet(List* options, bool* isCUFormat, bool isDfsTbl)
* @Param [IN] relkind: table's kind(ordinary table or other database object).
* @return: option with defalut options.
*/
static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 relcmprs, Oid relnamespace)
static List* AddDefaultOptionsIfNeed(List* options, const char relkind, CreateStmt* stmt, Oid relnamespace)
{
List* res = options;
int8 relcmprs = stmt->row_compress;
ListCell* cell = NULL;
bool isCStore = false;
bool isTsStore = false;
@ -1102,6 +1103,7 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel
bool isUstore = false;
bool assignedStorageType = false;
TableCreateSupport tableCreateSupport{false,false,false,false,false,false};
(void)isOrientationSet(options, NULL, false);
foreach (cell, options) {
DefElem* def = (DefElem*)lfirst(cell);
@ -1131,6 +1133,8 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel
ereport(ERROR,
(errcode(ERRCODE_INVALID_OPTION),
errmsg("It is not allowed to assign version option for non-dfs table.")));
} else {
SetOneOfCompressOption(def->defname, &tableCreateSupport);
}
if (pg_strcasecmp(def->defname, "orientation") == 0 && pg_strcasecmp(defGetString(def), ORIENTATION_ORC) == 0) {
@ -1156,6 +1160,15 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel
res = lappend(options, def);
}
bool noSupportTable = isCStore || isTsStore || relkind != RELKIND_RELATION ||
stmt->relation->relpersistence == RELPERSISTENCE_UNLOGGED ||
stmt->relation->relpersistence == RELPERSISTENCE_TEMP ||
stmt->relation->relpersistence == RELPERSISTENCE_GLOBAL_TEMP;
if (noSupportTable && tableCreateSupport.compressType) {
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION), errmsg("only row orientation table support compresstype.")));
}
CheckCompressOption(&tableCreateSupport);
if (isUstore && !isCStore && !hasCompression) {
DefElem* def = makeDefElem("compression", (Node *)makeString(COMPRESSION_NO));
res = lappend(options, def);
@ -1191,7 +1204,7 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel
DefElem *def1 = makeDefElem("orientation", (Node *)makeString(ORIENTATION_ROW));
res = lcons(def1, options);
}
if (!hasCompression) {
if (!hasCompression && !tableCreateSupport.compressType) {
DefElem *def2 = makeDefElem("compression", (Node *)rowCmprOpt);
res = lappend(options, def2);
}
@ -2124,7 +2137,7 @@ Oid DefineRelation(CreateStmt* stmt, char relkind, Oid ownerId, bool isCTAS)
/* Add default options for relation if need. */
if (!dfsTablespace) {
if (!u_sess->attr.attr_common.IsInplaceUpgrade) {
stmt->options = AddDefaultOptionsIfNeed(stmt->options, relkind, stmt->row_compress, namespaceId);
stmt->options = AddDefaultOptionsIfNeed(stmt->options, relkind, stmt, namespaceId);
}
} else {
checkObjectCreatedinHDFSTblspc(stmt, relkind);
@ -2364,10 +2377,13 @@ Oid DefineRelation(CreateStmt* stmt, char relkind, Oid ownerId, bool isCTAS)
ereport(LOG, (errmodule(MOD_TIMESERIES), errmsg("use implicit distribution column method.")));
}
} else if (pg_strcasecmp(storeChar, TABLE_ACCESS_METHOD_USTORE) == 0) {
if (pg_strcasecmp(COMPRESSION_NO, StdRdOptionsGetStringData(std_opt, compression, COMPRESSION_NO)) != 0 ||
auto compression = StdRdOptionsGetStringData(std_opt, compression, COMPRESSION_NO);
auto orientation = StdRdOptionsGetStringData(std_opt, orientation, ORIENTATION_ROW);
if ((pg_strcasecmp(COMPRESSION_NO, compression) != 0 &&
pg_strcasecmp(ORIENTATION_COLUMN, orientation) == 0) ||
IsCompressedByCmprsInPgclass((RelCompressType)stmt->row_compress)) {
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("UStore tables do not support compression.")));
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("UStore tables do not support compression.")));
}
ForbidToSetOptionsForRowTbl(stmt->options);
ForbidToSetOptionsForUstoreTbl(stmt->options);
@ -14428,6 +14444,67 @@ static void ATExecSetRelOptionsToast(Oid toastid, List* defList, AlterTableType
heap_close(pgclass, RowExclusiveLock);
}
/**
* Check that an ALTER TABLE does not modify compression parameters in unsupported ways.
*/
void static CheckSupportModifyCompression(Relation rel, bytea* relOoption, List* defList)
{
if (!relOoption) {
return;
}
if (!REL_SUPPORT_COMPRESSED(rel) || rel->rd_node.opt == 0) {
ForbidUserToSetCompressedOptions(defList);
return;
}
PageCompressOpts* newCompressOpt = &(((StdRdOptions*)relOoption)->compress);
RelFileCompressOption current;
TransCompressOptions(rel->rd_node, &current);
if (newCompressOpt) {
if (newCompressOpt->compressType != (int)current.compressAlgorithm) {
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("change compresstype OPTION is not supported")));
}
if ((int)current.compressAlgorithm != COMPRESS_TYPE_NONE &&
newCompressOpt->compressChunkSize != CHUNK_SIZE_LIST[current.compressChunkSize]) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("change compress_chunk_size OPTION is not supported")));
}
if (!newCompressOpt->compressByteConvert && newCompressOpt->compressDiffConvert) {
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION),
errmsg("compress_diff_convert should be used with compress_byte_convert.")));
}
if (current.compressAlgorithm == COMPRESS_TYPE_PGLZ) {
ListCell *opt = NULL;
foreach (opt, defList) {
DefElem *def = (DefElem *)lfirst(opt);
if (pg_strcasecmp(def->defname, "compress_level") == 0) {
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION),
errmsg("compress_level should be used with ZSTD algorithm.")));
}
}
}
} else {
if ((int)current.compressAlgorithm != COMPRESS_TYPE_NONE) {
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("change compresstype OPTION is not supported")));
}
}
/*
* forbid modify partition CompressOption
*/
if (HEAP_IS_PARTITIONED(rel)) {
if ((int)current.compressLevel != newCompressOpt->compressLevel) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("change partition compressLevel OPTION is not supported")));
}
if ((int)current.compressPreallocChunks != newCompressOpt->compressPreallocChunks) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("change partition compress_prealloc_chunks OPTION is not supported")));
}
}
}
/*
* Set, reset, or replace reloptions.
*/
@ -14567,6 +14644,7 @@ static void ATExecSetRelOptions(Relation rel, List* defList, AlterTableType oper
}
/* Validate */
bytea* relOpt = NULL;
switch (rel->rd_rel->relkind) {
case RELKIND_RELATION: {
/* this options only can be used when define a new relation.
@ -14575,6 +14653,7 @@ static void ATExecSetRelOptions(Relation rel, List* defList, AlterTableType oper
ForbidUserToSetDefinedOptions(defList);
bytea* heapRelOpt = heap_reloptions(rel->rd_rel->relkind, newOptions, true);
relOpt = heapRelOpt;
const char* algo = RelationGetAlgo(rel);
newRelHasUids = StdRdOptionsHasUids(heapRelOpt, RELKIND_RELATION);
if (rel->rd_rel->relhasoids && newRelHasUids) {
@ -14617,18 +14696,21 @@ static void ATExecSetRelOptions(Relation rel, List* defList, AlterTableType oper
break;
}
case RELKIND_INDEX:
case RELKIND_GLOBAL_INDEX:
case RELKIND_GLOBAL_INDEX: {
ForbidUserToSetDefinedIndexOptions(defList);
Assert(oldRelHasUids == false);
(void)index_reloptions(rel->rd_am->amoptions, newOptions, true);
relOpt = index_reloptions(rel->rd_am->amoptions, newOptions, true);
break;
}
default:
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("\"%s\" is not a table, view, materialized view, index, or TOAST table", RelationGetRelationName(rel))));
break;
}
CheckSupportModifyCompression(rel, relOpt, defList);
/*
* All we need do here is update the pg_class row; the new options will be
* propagated into relcaches during post-commit cache inval.
@ -22257,6 +22339,11 @@ static void checkCompressForExchange(Relation partTableRel, Relation ordTableRel
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("tables in ALTER TABLE EXCHANGE PARTITION must have the same type of compress")));
}
if (partTableRel->rd_node.opt != ordTableRel->rd_node.opt) {
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("tables in ALTER TABLE EXCHANGE PARTITION must have the same type of compress")));
}
}
// Description : Check number, type of column
@ -24317,9 +24404,16 @@ static char* GenTemporaryPartitionName(Relation partTableRel, int sequence)
return pstrdup(tmpName);
}
#ifndef ENABLE_MULTIPLE_NODES
static Oid GetNewPartitionOid(Relation pgPartRel, Relation partTableRel, Node *partDef, Oid bucketOid,
bool *isTimestamptz, StorageType stype, Datum new_reloptions)
{
#else
static Oid GetNewPartitionOid(Relation pgPartRel, Relation partTableRel, Node *partDef,
Oid bucketOid, bool *isTimestamptz, StorageType stype)
{
Datum new_reloptions = (Datum)0;
#endif
Oid newPartOid = InvalidOid;
switch (nodeTag(partDef)) {
case T_RangePartitionDefState:
@ -24412,9 +24506,13 @@ static Oid AddTemporaryPartition(Relation partTableRel, Node* partDef)
}
/* Temporary tables do not use segment-page */
#ifndef ENABLE_MULTIPLE_NODES
newPartOid = GetNewPartitionOid(pgPartRel, partTableRel, partDef, bucketOid,
isTimestamptz, RelationGetStorageType(partTableRel), (Datum)new_reloptions);
isTimestamptz, RelationGetStorageType(partTableRel), new_reloptions);
#else
newPartOid = GetNewPartitionOid(
pgPartRel, partTableRel, partDef, bucketOid, isTimestamptz, RelationGetStorageType(partTableRel));
#endif
// We must bump the command counter to make the newly-created
// partition tuple visible for opening.
CommandCounterIncrement();
@ -24736,6 +24834,7 @@ static void fastAddPartition(Relation partTableRel, List* destPartDefList, List*
pgPartRel = relation_open(PartitionRelationId, RowExclusiveLock);
#ifndef ENABLE_MULTIPLE_NODES
bool isNull = false;
HeapTuple tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(partTableRel->rd_id));
Datum relOptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, &isNull);
@ -24743,6 +24842,7 @@ static void fastAddPartition(Relation partTableRel, List* destPartDefList, List*
Datum newRelOptions = transformRelOptions((Datum)0, oldRelOptions, NULL, NULL, false, false);
ReleaseSysCache(tuple);
list_free_ext(oldRelOptions);
#endif
foreach (cell, destPartDefList) {
RangePartitionDefState* partDef = (RangePartitionDefState*)lfirst(cell);
@ -24753,7 +24853,11 @@ static void fastAddPartition(Relation partTableRel, List* destPartDefList, List*
bucketOid,
partDef,
partTableRel->rd_rel->relowner,
#ifndef ENABLE_MULTIPLE_NODES
(Datum)newRelOptions,
#else
(Datum)0,
#endif
isTimestamptz,
RelationGetStorageType(partTableRel),
AccessExclusiveLock);

View File

@ -301,10 +301,12 @@ void PrepForRead(char* path, int64 blocknum, bool is_segment, RelFileNode *relno
char* bucketNodestr = strstr(path, "_b");
if (NULL != bucketNodestr) {
bucketNodestr += 2; /* delete first two chars: _b */
flag = StrToInt32(bucketNodestr, &(relfilenode.rnode.node.bucketNode));
int _bucketNode;
flag = StrToInt32(bucketNodestr, &_bucketNode); // carrottodo
if (!flag) {
ereport(ERROR, (errmsg("Can not covert %s to int32 type. \n", bucketNodestr)));
}
relfilenode.rnode.node.bucketNode = (int2)_bucketNode;
rc = strncpy_s(pathFirstpart, MAXFNAMELEN, path, strlen(path) - strlen(bucketNodestr));
securec_check(rc, "\0", "\0");
}
@ -852,13 +854,14 @@ Datum gs_read_segment_block_from_remote(PG_FUNCTION_ARGS)
uint32 dbNode = PG_GETARG_UINT32(1);
uint32 relNode = PG_GETARG_UINT32(2);
int16 bucketNode = PG_GETARG_INT16(3);
int32 forkNum = PG_GETARG_INT32(4);
uint64 blockNum = (uint64)PG_GETARG_TRANSACTIONID(5);
uint32 blockSize = PG_GETARG_UINT32(6);
uint64 lsn = (uint64)PG_GETARG_TRANSACTIONID(7);
uint32 seg_relNode = PG_GETARG_UINT32(8);
uint32 seg_block = PG_GETARG_UINT32(9);
int32 timeout = PG_GETARG_INT32(10);
uint16 opt = PG_GETARG_INT16(4);
int32 forkNum = PG_GETARG_INT32(5);
uint64 blockNum = (uint64)PG_GETARG_TRANSACTIONID(6);
uint32 blockSize = PG_GETARG_UINT32(7);
uint64 lsn = (uint64)PG_GETARG_TRANSACTIONID(8);
uint32 seg_relNode = PG_GETARG_UINT32(9);
uint32 seg_block = PG_GETARG_UINT32(10);
int32 timeout = PG_GETARG_INT32(11);
XLogPhyBlock pblk = {
.relNode = seg_relNode,
@ -871,6 +874,7 @@ Datum gs_read_segment_block_from_remote(PG_FUNCTION_ARGS)
key.relfilenode.dbNode = dbNode;
key.relfilenode.relNode = relNode;
key.relfilenode.bucketNode = bucketNode;
key.relfilenode.opt = opt;
key.forknum = forkNum;
key.blocknum = blockNum;

View File

@ -626,7 +626,7 @@ static uint32 ckpt_qsort_dirty_page_for_flush(bool *is_new_relfilenode, uint32 f
item->bucketNode = buf_desc->tag.rnode.bucketNode;
item->forkNum = buf_desc->tag.forkNum;
item->blockNum = buf_desc->tag.blockNum;
if(IsSegmentFileNode(buf_desc->tag.rnode)) {
if(IsSegmentFileNode(buf_desc->tag.rnode) || buf_desc->tag.rnode.opt != 0) {
*is_new_relfilenode = true;
}
}

View File

@ -4521,6 +4521,12 @@ const char* pgstat_get_wait_io(WaitEventIO w)
case WAIT_EVENT_LOGCTRL_SLEEP:
event_name = "LOGCTRL_SLEEP";
break;
case WAIT_EVENT_COMPRESS_ADDRESS_FILE_FLUSH:
event_name = "PCA_FLUSH";
break;
case WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC:
event_name = "PCA_SYNC";
break;
/* no default case, so that compiler will warn */
case IO_EVENT_NUM:
break;

View File

@ -2156,6 +2156,8 @@ int PostmasterMain(int argc, char* argv[])
ngroup_info_hash_create();
/*init Role id hash table*/
InitRoleIdHashTable();
/* initialize the lock array used for compressed-page address (pcmap) mmaps */
RealInitialMMapLockArray();
/* init unique sql */
InitUniqueSQL();
/* init hypo index */

View File

@ -113,6 +113,10 @@ static relopt_bool boolRelOpts[] = {
{{ "crossbucket", "Enables cross bucket index creation in this index relation", RELOPT_KIND_BTREE}, false },
{{ "enable_tde", "enable table's level transparent data encryption", RELOPT_KIND_HEAP }, false },
{{ "hasuids", "Enables uids in this relation", RELOPT_KIND_HEAP }, false },
{{ "compress_byte_convert", "Whether do byte convert in compression", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE},
false },
{{ "compress_diff_convert", "Whether do diiffer convert in compression", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE},
false },
/* list terminator */
{{NULL}}
};
@ -233,6 +237,16 @@ static relopt_int intRelOpts[] = {
},
0, 1, 32
},
{{ "compress_level", "Level of page compression.", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, 0, -31, 31},
{{ "compresstype", "compress type (none, pglz or zstd).", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, 0, 0, 2},
{{ "compress_chunk_size", "Size of chunk to store compressed page.", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE},
BLCKSZ / 2,
BLCKSZ / 16,
BLCKSZ / 2},
{{ "compress_prealloc_chunks", "Number of prealloced chunks for each block.", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE},
0,
0,
7},
/* list terminator */
{{NULL}}
};
@ -1948,7 +1962,20 @@ bytea *default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
{ "cmk_id", RELOPT_TYPE_STRING, offsetof(StdRdOptions, cmk_id)},
{ "encrypt_algo", RELOPT_TYPE_STRING, offsetof(StdRdOptions, encrypt_algo)},
{ "enable_tde", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, enable_tde)},
{ "hasuids", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, hasuids) }
{ "hasuids", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, hasuids) },
{ "compresstype", RELOPT_TYPE_INT,
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressType)},
{ "compress_level", RELOPT_TYPE_INT,
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressLevel)},
{ "compress_chunk_size", RELOPT_TYPE_INT,
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressChunkSize)},
{"compress_prealloc_chunks", RELOPT_TYPE_INT,
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressPreallocChunks)},
{ "compress_byte_convert", RELOPT_TYPE_BOOL,
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressByteConvert)},
{ "compress_diff_convert", RELOPT_TYPE_BOOL,
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressDiffConvert)},
};
options = parseRelOptions(reloptions, validate, kind, &numoptions);
@ -2594,6 +2621,25 @@ void ForbidUserToSetDefinedOptions(List *options)
}
}
/*
* @Description: compression parameters cannot be set by an ALTER TABLE statement if the table is an uncompressed table;
* this function does the checking work.
* @Param[IN] options: input user options
* @See also:
*/
void ForbidUserToSetCompressedOptions(List *options)
{
static const char *unSupportOptions[] = {"compresstype", "compress_chunk_size", "compress_prealloc_chunks",
"compress_level", "compress_byte_convert", "compress_diff_convert"};
int firstInvalidOpt = -1;
if (FindInvalidOption(options, unSupportOptions, lengthof(unSupportOptions), &firstInvalidOpt)) {
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
(errmsg("Un-support feature"), errdetail("Option \"%s\" doesn't allow ALTER on uncompressed table",
unSupportOptions[firstInvalidOpt]))));
}
}
/*
* @Description: forbid to change inner option
* inner options only can be used by system itself.
@ -2888,3 +2934,33 @@ bool is_cstore_option(char relkind, Datum reloptions)
pfree_ext(std_opt);
return result;
}
void SetOneOfCompressOption(const char* defname, TableCreateSupport* tableCreateSupport)
{
if (pg_strcasecmp(defname, "compresstype") == 0) {
tableCreateSupport->compressType = true;
} else if (pg_strcasecmp(defname, "compress_chunk_size") == 0) {
tableCreateSupport->compressChunkSize = true;
} else if (pg_strcasecmp(defname, "compress_prealloc_chunks") == 0) {
tableCreateSupport->compressPreAllocChunks = true;
} else if (pg_strcasecmp(defname, "compress_level") == 0) {
tableCreateSupport->compressLevel = true;
} else if (pg_strcasecmp(defname, "compress_byte_convert") == 0) {
tableCreateSupport->compressByteConvert = true;
} else if (pg_strcasecmp(defname, "compress_diff_convert") == 0) {
tableCreateSupport->compressDiffConvert = true;
}
}
void CheckCompressOption(TableCreateSupport *tableCreateSupport)
{
if (!tableCreateSupport->compressType && HasCompressOption(tableCreateSupport)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION),
errmsg("compress_chunk_size/compress_prealloc_chunks/compress_level/compress_byte_convert/"
"compress_diff_convert should be used with compresstype.")));
}
if (!tableCreateSupport->compressByteConvert && tableCreateSupport->compressDiffConvert) {
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION),
errmsg("compress_diff_convert should be used with compress_byte_convert.")));
}
}

View File

@ -9,7 +9,7 @@ ifneq "$(MAKECMDGOALS)" "clean"
endif
endif
endif
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
hashsearch.o hashsort.o hashutil.o
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o\
hashsort.o hashutil.o hash_xlog.o
include $(top_srcdir)/src/gausskernel/common.mk

View File

@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to
move primary bucket pages around after they are created.
Primary bucket pages (henceforth just "bucket pages") are allocated in
power-of-2 groups, called "split points" in the code. Buckets 0 and 1
are created when the index is initialized. At the first split, buckets 2
and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated;
when bucket 8 is needed, buckets 8-15 are allocated; etc. All the bucket
pages of a power-of-2 group appear consecutively in the index. This
addressing scheme allows the physical location of a bucket page to be
computed from the bucket number relatively easily, using only a small
amount of control information. We take the log2() of the bucket number
to determine which split point S the bucket belongs to, and then simply
add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
metapage) to compute the physical address. hashm_spares[S] can be
interpreted as the total number of overflow pages that have been allocated
before the bucket pages of splitpoint S. hashm_spares[0] is always 0,
so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
block numbers 1 and 2, just after the meta page. We always have
hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow
pages appearing between the bucket page groups of splitpoints N and N+1.
power-of-2 groups, called "split points" in the code. That means at every new
splitpoint we double the existing number of buckets. Allocating huge chunks
of bucket pages all at once isn't optimal, since it could take a very long
time to consume them. To avoid this exponential growth of index size, we break
up the allocation of buckets at a splitpoint into 4 equal phases. If (2 ^ x)
is the total number of buckets to be allocated at a splitpoint (from now on we
call this a splitpoint group), then we allocate one quarter (2 ^ (x - 2)) of
the total buckets in each phase of the splitpoint group. The next quarter is
allocated only once the buckets of the previous phase have been consumed. For
the initial splitpoint groups < 10 we allocate all of their buckets in a
single phase, since the number of buckets in those groups is small. For
groups >= 10 the allocation process is distributed
among four equal phases. At group 10 we allocate (2 ^ 9) buckets in 4
different phases {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, the numbers in curly braces
indicate the number of buckets allocated within each phase of splitpoint group
10. And, for splitpoint group 11 and 12 allocation phases will be
{2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9} respectively. We
can see that at each splitpoint group we double the total number of buckets
from the previous group, but we do so incrementally, one phase at a time. The bucket pages
allocated within one phase of a splitpoint group will appear consecutively in
the index. This addressing scheme allows the physical location of a bucket
page to be computed from the bucket number relatively easily, using only a
small amount of control information. In the function _hash_spareindex, for a
given bucket number we first compute the splitpoint group it belongs to and
then the phase within that group to which the bucket belongs. Adding them
gives the global splitpoint phase number S for the bucket; we then simply add
"hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
metapage) to the given bucket number to compute its physical address.
hashm_spares[S] can be interpreted as the total number
of overflow pages that have been allocated before the bucket pages of
splitpoint phase S. The hashm_spares[0] is always 0, so that buckets 0 and 1
always appear at block numbers 1 and 2, just after the meta page. We always
have hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow pages
appearing between the bucket page groups of splitpoint phases N and N+1.
(Note: the above describes what happens when filling an initially minimally
sized hash index. In practice, we try to estimate the required index size
and allocate a suitable number of splitpoints immediately, to avoid
sized hash index. In practice, we try to estimate the required index size and
allocate a suitable number of splitpoint phases immediately, to avoid
expensive re-splitting during initial index build.)
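To make the group/phase arithmetic above concrete, here is a small standalone C sketch of the mapping from a bucket number to its global splitpoint phase, following the description in this section. The names (PHASES_PER_GROUP, GROUPS_WITH_ONE_PHASE, ceil_log2, splitpoint_phase) are illustrative; the sketch mirrors, but is not copied from, _hash_spareindex, and the exact off-by-one conventions of the real code and the BUCKET_TO_BLKNO macro may differ slightly.

    #include <stdint.h>

    #define PHASES_PER_GROUP       4    /* groups >= 10 are filled in 4 phases */
    #define GROUPS_WITH_ONE_PHASE  10   /* small groups are allocated in one go */

    /* smallest x with 2^x >= n, i.e. ceil(log2(n)) */
    static uint32_t ceil_log2(uint32_t n)
    {
        uint32_t x = 0;
        while (((uint32_t)1 << x) < n)
            x++;
        return x;
    }

    /* global splitpoint phase that bucket number b belongs to */
    static uint32_t splitpoint_phase(uint32_t b)
    {
        uint32_t group = ceil_log2(b + 1);   /* splitpoint group of bucket b */
        uint32_t phases;
        uint32_t buckets_per_phase;

        if (group < GROUPS_WITH_ONE_PHASE)
            return group;                    /* one phase per early group */

        /* phases contributed by all earlier groups */
        phases = GROUPS_WITH_ONE_PHASE +
                 (group - GROUPS_WITH_ONE_PHASE) * PHASES_PER_GROUP;

        /* group x adds 2^(x-1) new buckets, split into 4 equal quarters */
        buckets_per_phase = ((uint32_t)1 << (group - 1)) / PHASES_PER_GROUP;
        phases += (b - ((uint32_t)1 << (group - 1))) / buckets_per_phase;

        return phases;
    }

With the phase number S in hand, the physical block of bucket b is then obtained as described above by adding "hashm_spares[S] + 1" to b.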
When S splitpoints exist altogether, the array entries hashm_spares[0]
through hashm_spares[S] are valid; hashm_spares[S] records the current
total number of overflow pages. New overflow pages are created as needed
at the end of the index, and recorded by incrementing hashm_spares[S].
When it is time to create a new splitpoint's worth of bucket pages, we
When it is time to create a new splitpoint phase's worth of bucket pages, we
copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
stored in the hashm_ovflpoint field of the meta page). This has the
effect of reserving the correct number of bucket pages at the end of the
@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during
an index extension we crash after allocating filesystem space and before
updating the metapage. Note that on filesystems that allow "holes" in
files, it's entirely likely that pages before the logical EOF are not yet
allocated: when we allocate a new splitpoint's worth of bucket pages, we
allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
physically zero the last such page to force the EOF up, and the first such
page will be used immediately, but the intervening pages are not written
until needed.
@ -126,61 +142,98 @@ the initially created buckets.
Lock Definitions
----------------
We use both lmgr locks ("heavyweight" locks) and buffer context locks
(LWLocks) to control access to a hash index. lmgr locks are needed for
long-term locking since there is a (small) risk of deadlock, which we must
be able to detect. Buffer context locks are used for short-term access
control to individual pages of the index.
Concurrency control for hash indexes is provided using buffer content
locks, buffer pins, and cleanup locks. Here as elsewhere in PostgreSQL,
cleanup lock means that we hold an exclusive lock on the buffer and have
observed at some point after acquiring the lock that we hold the only pin
on that buffer. For hash indexes, a cleanup lock on a primary bucket page
represents the right to perform an arbitrary reorganization of the entire
bucket. Therefore, scans retain a pin on the primary bucket page for the
bucket they are currently scanning. Splitting a bucket requires a cleanup
lock on both the old and new primary bucket pages. VACUUM therefore takes
a cleanup lock on every bucket page in order to remove tuples. It can also
remove tuples copied to a new bucket by any previous split operation, because
the cleanup lock taken on the primary bucket page guarantees that no scans
which started prior to the most recent split can still be in progress. After
cleaning each page individually, it attempts to take a cleanup lock on the
primary bucket page in order to "squeeze" the bucket down to the minimum
possible number of pages.
We define the following lmgr locks for a hash index:
To avoid deadlocks, we must be consistent about the lock order in which we
lock the buckets for operations that require locks on two different buckets.
We choose to always lock the lower-numbered bucket first. The metapage is
only ever locked after all bucket locks have been taken.
LockPage(rel, 0) represents the right to modify the hash-code-to-bucket
mapping. A process attempting to enlarge the hash table by splitting a
bucket must exclusive-lock this lock before modifying the metapage data
representing the mapping. Processes intending to access a particular
bucket must share-lock this lock until they have acquired lock on the
correct target bucket.
LockPage(rel, page), where page is the page number of a hash bucket page,
represents the right to split or compact an individual bucket. A process
splitting a bucket must exclusive-lock both old and new halves of the
bucket until it is done. A process doing VACUUM must exclusive-lock the
bucket it is currently purging tuples from. Processes doing scans or
insertions must share-lock the bucket they are scanning or inserting into.
(It is okay to allow concurrent scans and insertions.)
Metapage Caching
----------------
The lmgr lock IDs corresponding to overflow pages are currently unused.
These are available for possible future refinements.
Both scanning the index and inserting tuples require locating the bucket
where a given tuple ought to be located. To do this, we need the bucket
count, highmask, and lowmask from the metapage; however, it's undesirable
for performance reasons to have to lock and pin the metapage for
every such operation. Instead, we retain a cached copy of the metapage
in each backend's relcache entry. This will produce the correct
bucket mapping as long as the target bucket hasn't been split since the
last cache refresh.
Note that these lock definitions are conceptually distinct from any sort
of lock on the pages whose numbers they share. A process must also obtain
read or write buffer lock on the metapage or bucket page before accessing
said page.
To guard against the possibility that such a split has occurred, the
primary page of each bucket chain stores the number of buckets that
existed as of the time the bucket was last split, or if never split as
of the time it was created, in the space normally used for the
previous block number (that is, hasho_prevblkno). This doesn't cost
anything because the primary bucket page is always the first page in
the chain, and the previous block number is therefore always, in
reality, InvalidBlockNumber.
Processes performing hash index scans must hold share lock on the bucket
they are scanning throughout the scan. This seems to be essential, since
there is no reasonable way for a scan to cope with its bucket being split
underneath it. This creates a possibility of deadlock external to the
hash index code, since a process holding one of these locks could block
waiting for an unrelated lock held by another process. If that process
then does something that requires exclusive lock on the bucket, we have
deadlock. Therefore the bucket locks must be lmgr locks so that deadlock
can be detected and recovered from. This also forces the page-zero lock
to be an lmgr lock, because as we'll see below it is held while attempting
to acquire a bucket lock, and so it could also participate in a deadlock.
After computing the ostensibly-correct bucket number based on our cached
copy of the metapage, we lock the corresponding primary bucket page and
check whether the bucket count stored in hasho_prevblkno is greater than
the number of buckets stored in our cached copy of the metapage. If
so, the bucket has certainly been split, because the count must originally
have been less than the number of buckets that existed at that time and
can't have increased except due to a split. If not, the bucket can't have
been split, because a split would have created a new bucket with a higher
bucket number than any we'd seen previously. In the latter case, we've
locked the correct bucket and can proceed; in the former case, we must
release the lock on this bucket, lock the metapage, update our cache,
unlock the metapage, and retry.
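A hedged sketch of the check-and-retry loop just described; the helper names (_hash_hashkey2bucket, BUCKET_TO_BLKNO, _hash_getbuf, _hash_relbuf, _hash_getcachedmetap) all appear elsewhere in this commit, but the surrounding control flow and the elided variable declarations are illustrative only.

    /* illustrative fragment: map a hash key to its bucket via the cached metapage */
    for (;;) {
        bucket = _hash_hashkey2bucket(hashkey,
                                      cachedmetap->hashm_maxbucket,
                                      cachedmetap->hashm_highmask,
                                      cachedmetap->hashm_lowmask);
        blkno = BUCKET_TO_BLKNO(cachedmetap, bucket);
        buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
        opaque = (HashPageOpaque)PageGetSpecialPointer(BufferGetPage(buf));

        /* hasho_prevblkno holds the bucket count as of this bucket's last split */
        if (opaque->hasho_prevblkno <= cachedmetap->hashm_maxbucket)
            break;              /* cache was fresh enough; bucket is correct */

        /* stale cache: release this bucket, refresh from the metapage, retry */
        _hash_relbuf(rel, buf);
        cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
    }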
Processes must obtain read (share) buffer context lock on any hash index
page while reading it, and write (exclusive) lock while modifying it.
To prevent deadlock we enforce these coding rules: no buffer lock may be
held long term (across index AM calls), nor may any buffer lock be held
while waiting for an lmgr lock, nor may more than one buffer lock
be held at a time by any one process. (The third restriction is probably
stronger than necessary, but it makes the proof of no deadlock obvious.)
Needing to retry occasionally might seem expensive, but the number of times
any given bucket can be split is limited to a few dozen no matter how
many times the hash index is accessed, because the total number of
buckets is limited to less than 2^32. On the other hand, the number of
times we access a bucket is unbounded and will be several orders of
magnitude larger even in unsympathetic cases.
(The metapage cache is new in v10. Older hash indexes had the primary
bucket page's hasho_prevblkno initialized to InvalidBuffer.)
Pseudocode Algorithms
---------------------
Various flags that are used in hash index operations are described below:
The bucket-being-split and bucket-being-populated flags indicate that a split
operation is in progress for a bucket. During a split operation, the
bucket-being-split flag is set on the old bucket and the bucket-being-populated
flag is set on the new bucket. These flags are cleared once the split operation
is finished.
The split-cleanup flag indicates that a bucket which has been recently split
still contains tuples that were also copied to the new bucket; it essentially
marks the split as incomplete. Once we're certain that no scans which
started before the new bucket was fully populated are still in progress, we
can remove the copies from the old bucket and clear the flag. We insist that
this flag must be clear before splitting a bucket; thus, a bucket can't be
split again until the previous split is totally complete.
The moved-by-split flag on a tuple indicates that the tuple was moved from the
old to the new bucket. Concurrent scans will skip such tuples until the split
operation is finished. Once a tuple is marked as moved-by-split, it remains so
forever, but that does no harm; we intentionally never clear it, since doing
so would require an additional I/O that is not necessary.
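The bucket-level flags above are ordinary bits in the bucket page's special space (the moved-by-split flag lives on the index tuple itself), tested with simple mask macros. A minimal illustrative sketch follows; the bit positions are assumptions for illustration, though H_BUCKET_BEING_SPLIT and H_NEEDS_SPLIT_CLEANUP are used by the code later in this commit.

    /* hypothetical flag bits kept in hasho_flag of the bucket page's special space */
    #define LH_BUCKET_BEING_POPULATED      (1 << 4)
    #define LH_BUCKET_BEING_SPLIT          (1 << 5)
    #define LH_BUCKET_NEEDS_SPLIT_CLEANUP  (1 << 6)

    #define H_BUCKET_BEING_POPULATED(opaque) \
        (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
    #define H_BUCKET_BEING_SPLIT(opaque) \
        (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
    #define H_NEEDS_SPLIT_CLEANUP(opaque) \
        (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)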
The operations we need to support are: readers scanning the index for
entries of a particular hash code (which by definition are all in the same
bucket); insertion of a new tuple into the correct bucket; enlarging the
@ -195,57 +248,75 @@ track of available overflow pages.
The reader algorithm is:
share-lock page 0 (to prevent active split)
read/sharelock meta page
compute bucket number for target hash key
release meta page
share-lock bucket page (to prevent split/compact of this bucket)
release page 0 share-lock
lock the primary bucket page of the target bucket
if the target bucket is still being populated by a split:
release the buffer content lock on current bucket page
pin and acquire the buffer content lock on old bucket in shared mode
release the buffer content lock on old bucket, but not pin
retake the buffer content lock on new bucket
arrange to scan the old bucket normally and the new bucket for
tuples which are not moved-by-split
-- then, per read request:
read/sharelock current page of bucket
step to next page if necessary (no chaining of locks)
reacquire content lock on current page
step to next page if necessary (no chaining of content locks, but keep
the pin on the primary bucket throughout the scan; we also maintain
a pin on the page currently being scanned)
get tuple
release current page
release content lock
-- at scan shutdown:
release bucket share-lock
release all pins still held
By holding the page-zero lock until lock on the target bucket is obtained,
the reader ensures that the target bucket calculation is valid (otherwise
the bucket might be split before the reader arrives at it, and the target
entries might go into the new bucket). Holding the bucket sharelock for
the remainder of the scan prevents the reader's current-tuple pointer from
being invalidated by splits or compactions. Notice that the reader's lock
does not prevent other buckets from being split or compacted.
Holding the buffer pin on the primary bucket page for the whole scan prevents
the reader's current-tuple pointer from being invalidated by splits or
compactions. (Of course, other buckets can still be split or compacted.)
To keep concurrency reasonably good, we require readers to cope with
concurrent insertions, which means that they have to be able to re-find
their current scan position after re-acquiring the page sharelock. Since
deletion is not possible while a reader holds the bucket sharelock, and
we assume that heap tuple TIDs are unique, this can be implemented by
their current scan position after re-acquiring the buffer content lock on
page. Since deletion is not possible while a reader holds the pin on bucket,
and we assume that heap tuple TIDs are unique, this can be implemented by
searching for the same heap tuple TID previously returned. Insertion does
not move index entries across pages, so the previously-returned index entry
should always be on the same page, at the same or higher offset number,
as it was before.
To allow for scans during a bucket split, if at the start of the scan the
bucket is marked as bucket-being-populated, the scan reads all the tuples in
that bucket except for those that are marked as moved-by-split. Once it
finishes the scan of all the tuples in the current bucket, it scans the old
bucket from which this bucket was formed by the split.
The insertion algorithm is rather similar:
share-lock page 0 (to prevent active split)
read/sharelock meta page
compute bucket number for target hash key
release meta page
share-lock bucket page (to prevent split/compact of this bucket)
release page 0 share-lock
-- (so far same as reader)
read/exclusive-lock current page of bucket
if full, release, read/exclusive-lock next page; repeat as needed
lock the primary bucket page of the target bucket
-- (so far same as reader, except for acquisition of buffer content lock in
exclusive mode on primary bucket page)
if the bucket-being-split flag is set for a bucket and pin count on it is
one, then finish the split
release the buffer content lock on current bucket
get the "new" bucket which was being populated by the split
scan the new bucket and form the hash table of TIDs
conditionally get the cleanup lock on old and new buckets
if we get the lock on both the buckets
finish the split using algorithm mentioned below for split
release the pin on old bucket and restart the insert from beginning.
if current page is full, first check if this page contains any dead tuples.
if yes, remove dead tuples from the current page and again check for the
availability of the space. If enough space found, insert the tuple else
release lock but not pin, read/exclusive-lock
next page; repeat as needed
>> see below if no space in any page of bucket
take buffer content lock in exclusive mode on metapage
insert tuple at appropriate place in page
write/release current page
release bucket share-lock
read/exclusive-lock meta page
mark current page dirty
increment tuple count, decide if split needed
write/release meta page
done if no split needed, else enter Split algorithm below
mark meta page dirty
write WAL for insertion of tuple
release the buffer content lock on metapage
release buffer content lock on current page
if current page is not a bucket page, release the pin on bucket page
if split is needed, enter Split algorithm below
release the pin on metapage
To speed searches, the index entries within any individual index page are
kept sorted by hash code; the insertion code must take care to insert new
@ -254,11 +325,13 @@ bucket that is being actively scanned, because readers can cope with this
as explained above. We only need the short-term buffer locks to ensure
that readers do not see a partially-updated page.
It is clearly impossible for readers and inserters to deadlock, and in
fact this algorithm allows them a very high degree of concurrency.
(The exclusive metapage lock taken to update the tuple count is stronger
than necessary, since readers do not care about the tuple count, but the
lock is held for such a short time that this is probably not an issue.)
To avoid deadlock between readers and inserters, whenever there is a need
to lock multiple buckets, we always take them in the order suggested in Lock
Definitions above. This algorithm allows them a very high degree of
concurrency. (The exclusive metapage lock taken to update the tuple count
is stronger than necessary, since readers do not care about the tuple count,
but the lock is held for such a short time that this is probably not an
issue.)
When an inserter cannot find space in any existing page of a bucket, it
must obtain an overflow page and add that page to the bucket's chain.
@ -269,82 +342,95 @@ index is overfull (has a higher-than-wanted ratio of tuples to buckets).
The algorithm attempts, but does not necessarily succeed, to split one
existing bucket in two, thereby lowering the fill ratio:
exclusive-lock page 0 (assert the right to begin a split)
read/exclusive-lock meta page
check split still needed
if split not needed anymore, drop locks and exit
decide which bucket to split
Attempt to X-lock old bucket number (definitely could fail)
Attempt to X-lock new bucket number (shouldn't fail, but...)
if above fail, drop locks and exit
update meta page to reflect new number of buckets
write/release meta page
release X-lock on page 0
-- now, accesses to all other buckets can proceed.
Perform actual split of bucket, moving tuples as needed
>> see below about acquiring needed extra space
Release X-locks of old and new buckets
pin meta page and take buffer content lock in exclusive mode
check split still needed
if split not needed anymore, drop buffer content lock and pin and exit
decide which bucket to split
try to take a cleanup lock on that bucket; if fail, give up
if that bucket is still being split or has split-cleanup work:
try to finish the split and the cleanup work
if that succeeds, start over; if it fails, give up
mark the old and new buckets indicating split is in progress
mark both old and new buckets as dirty
write WAL for allocation of new page for split
copy the tuples that belong to the new bucket from the old bucket, marking
them as moved-by-split
write WAL record for moving tuples to new page once the new page is full
or all the pages of old bucket are finished
release lock but not pin for primary bucket page of old bucket,
read/shared-lock next page; repeat as needed
clear the bucket-being-split and bucket-being-populated flags
mark the old bucket indicating split-cleanup
write WAL for changing the flags on both old and new buckets
Note the page zero and metapage locks are not held while the actual tuple
rearrangement is performed, so accesses to other buckets can proceed in
parallel; in fact, it's possible for multiple bucket splits to proceed
in parallel.
Split's attempt to X-lock the old bucket number could fail if another
process holds S-lock on it. We do not want to wait if that happens, first
because we don't want to wait while holding the metapage exclusive-lock,
and second because it could very easily result in deadlock. (The other
process might be out of the hash AM altogether, and could do something
that blocks on another lock this process holds; so even if the hash
algorithm itself is deadlock-free, a user-induced deadlock could occur.)
So, this is a conditional LockAcquire operation, and if it fails we just
abandon the attempt to split. This is all right since the index is
overfull but perfectly functional. Every subsequent inserter will try to
split, and eventually one will succeed. If multiple inserters failed to
split, the index might still be overfull, but eventually, the index will
The split operation's attempt to acquire cleanup-lock on the old bucket number
could fail if another process holds any lock or pin on it. We do not want to
wait if that happens, because we don't want to wait while holding the metapage
exclusive-lock. So, this is a conditional LWLockAcquire operation, and if
it fails we just abandon the attempt to split. This is all right since the
index is overfull but perfectly functional. Every subsequent inserter will
try to split, and eventually one will succeed. If multiple inserters failed
to split, the index might still be overfull, but eventually, the index will
not be overfull and split attempts will stop. (We could make a successful
splitter loop to see if the index is still overfull, but it seems better to
distribute the split overhead across successive insertions.)
A problem is that if a split fails partway through (eg due to insufficient
disk space) the index is left corrupt. The probability of that could be
made quite low if we grab a free page or two before we update the meta
page, but the only real solution is to treat a split as a WAL-loggable,
must-complete action. I'm not planning to teach hash about WAL in this
go-round.
If a split fails partway through (e.g. due to insufficient disk space or an
interrupt), the index will not be corrupted. Instead, we'll retry the split
every time a tuple is inserted into the old bucket prior to inserting the new
tuple; eventually, we should succeed. The fact that a split is left
unfinished doesn't prevent subsequent buckets from being split, but we won't
try to split the bucket again until the prior split is finished. In other
words, a bucket can be in the middle of being split for some time, but it can't
be in the middle of two splits at the same time.
The fourth operation is garbage collection (bulk deletion):
next bucket := 0
read/sharelock meta page
pin metapage and take buffer content lock in exclusive mode
fetch current max bucket number
release meta page
release meta page buffer content lock and pin
while next bucket <= max bucket do
Acquire X lock on target bucket
Scan and remove tuples, compact free space as needed
Release X lock
acquire cleanup lock on primary bucket page
loop:
scan and remove tuples
mark the target page dirty
write WAL for deleting tuples from target page
if this is the last bucket page, break out of loop
pin and x-lock next page
release prior lock and pin (except keep pin on primary bucket page)
if the page we have locked is not the primary bucket page:
release lock and take exclusive lock on primary bucket page
if there are no other pins on the primary bucket page:
squeeze the bucket to remove free space
release the pin on primary bucket page
next bucket ++
end loop
exclusive-lock meta page
pin metapage and take buffer content lock in exclusive mode
check if number of buckets changed
if so, release lock and return to for-each-bucket loop
if so, release content lock and pin and return to for-each-bucket loop
else update metapage tuple count
write/release meta page
mark meta page dirty and write WAL for update of metapage
release buffer content lock and pin
Note that this is designed to allow concurrent splits. If a split occurs,
tuples relocated into the new bucket will be visited twice by the scan,
but that does no harm. (We must however be careful about the statistics
Note that this is designed to allow concurrent splits and scans. If a split
occurs, tuples relocated into the new bucket will be visited twice by the
scan, but that does no harm. Because we release the lock on a bucket page
during the cleanup scan of a bucket, a concurrent scan can start on that
bucket, and this ordering ensures that the scan always stays behind cleanup.
Scans must stay behind cleanup: a scan that returns multiple tuples from the
same bucket page always expects the next valid TID to be greater than or equal
to the current TID, so if cleanup ran ahead of the scan and removed or moved
the TIDs still required to complete the scan, the scan might miss tuples.
This holds true for backward scans
as well (backward scans first traverse each bucket starting from first bucket
to last overflow page in the chain). We must be careful about the statistics
reported by the VACUUM operation. What we can do is count the number of
tuples scanned, and believe this in preference to the stored tuple count
if the stored tuple count and number of buckets did *not* change at any
time during the scan. This provides a way of correcting the stored tuple
count if it gets out of sync for some reason. But if a split or insertion
does occur concurrently, the scan count is untrustworthy; instead,
subtract the number of tuples deleted from the stored tuple count and
use that.)
The exclusive lock request could deadlock in some strange scenarios, but
we can just error out without any great harm being done.
tuples scanned, and believe this in preference to the stored tuple count if
the stored tuple count and number of buckets did *not* change at any time
during the scan. This provides a way of correcting the stored tuple count if
it gets out of sync for some reason. But if a split or insertion does occur
concurrently, the scan count is untrustworthy; instead, subtract the number of
tuples deleted from the stored tuple count and use that.
Free Space Management
@ -360,25 +446,23 @@ overflow page to the free pool.
Obtaining an overflow page:
read/exclusive-lock meta page
take metapage content lock in exclusive mode
determine next bitmap page number; if none, exit loop
release meta page lock
read/exclusive-lock bitmap page
release meta page content lock
pin bitmap page and take content lock in exclusive mode
search for a free page (zero bit in bitmap)
if found:
set bit in bitmap
write/release bitmap page
read/exclusive-lock meta page
mark bitmap page dirty
take metapage buffer content lock in exclusive mode
if first-free-bit value did not change,
update it and write meta page
release meta page
return page number
update it and mark meta page dirty
else (not found):
release bitmap page
release bitmap page buffer content lock
loop back to try next bitmap page, if any
-- here when we have checked all bitmap pages; we hold meta excl. lock
extend index to add another overflow page; update meta information
write/release meta page
mark meta page dirty
return page number
It is slightly annoying to release and reacquire the metapage lock
@ -398,12 +482,17 @@ like this:
-- having determined that no space is free in the target bucket:
remember last page of bucket, drop write lock on it
call free-page-acquire routine
re-write-lock last page of bucket
if it is not last anymore, step to the last page
update (former) last page to point to new page
execute free-page-acquire (obtaining an overflow page) mechanism
described above
update (former) last page to point to the new page and mark buffer dirty
write-lock and initialize new page, with back link to former last page
write and release former last page
write WAL for addition of overflow page
release the locks on meta page and bitmap page acquired in
free-page-acquire algorithm
release the lock on former last page
release the lock on new overflow page
insert tuple into new page
-- etc.
@ -418,27 +507,27 @@ free page; there can be no other process holding lock on it.
Bucket splitting uses a similar algorithm if it has to extend the new
bucket, but it need not worry about concurrent extension since it has
exclusive lock on the new bucket.
buffer content lock in exclusive mode on the new bucket.
Freeing an overflow page is done by garbage collection and by bucket
splitting (the old bucket may contain no-longer-needed overflow pages).
In both cases, the process holds exclusive lock on the containing bucket,
so need not worry about other accessors of pages in the bucket. The
algorithm is:
Freeing an overflow page requires the process to hold buffer content lock in
exclusive mode on the containing bucket, so it need not worry about other
accessors of pages in the bucket. The algorithm is:
delink overflow page from bucket chain
(this requires read/update/write/release of fore and aft siblings)
read/share-lock meta page
pin meta page and take buffer content lock in shared mode
determine which bitmap page contains the free space bit for page
release meta page
read/exclusive-lock bitmap page
release meta page buffer content lock
pin bitmap page and take buffer content lock in exclusive mode
retake meta page buffer content lock in exclusive mode
move (insert) tuples that belong to the overflow page being freed
update bitmap bit
write/release bitmap page
if page number is less than what we saw as first-free-bit in meta:
read/exclusive-lock meta page
mark bitmap page dirty
if page number is still less than first-free-bit,
update first-free-bit field and write meta page
release meta page
update first-free-bit field and mark meta page dirty
write WAL for delinking overflow page operation
release buffer content lock and pin
release meta page buffer content lock and pin
We have to do it this way because we must clear the bitmap bit before
changing the first-free-bit field (hashm_firstfree). It is possible that
@ -448,21 +537,96 @@ page acquirer will scan more bitmap bits than he needs to. What must be
avoided is having first-free-bit greater than the actual first free bit,
because then that free page would never be found by searchers.
All the freespace operations should be called while holding no buffer
locks. Since they need no lmgr locks, deadlock is not possible.
The reason for moving tuples from the overflow page while delinking the latter
is to make the whole thing an atomic operation. Not doing so could lead to
spurious reads on a standby; basically, the user might see the same tuple twice.
WAL Considerations
------------------
The hash index operations like create index, insert, delete, bucket split,
allocate overflow page, and squeeze in themselves don't guarantee hash index
consistency after a crash. To provide robustness, we write WAL for each of
these operations.
CREATE INDEX writes multiple WAL records. First, we write a record to cover
the initialization of the metapage, followed by one for each new bucket
created, followed by one for the initial bitmap page. It's not important for
index creation to appear atomic, because the index isn't yet visible to any
other transaction, and the creating transaction will roll back in the event of
a crash. It would be difficult to cover the whole operation with a single
write-ahead log record anyway, because we can log only a fixed number of
pages, as given by XLR_MAX_BLOCK_ID (32), with current XLog machinery.
Ordinary item insertions (that don't force a page split or need a new overflow
page) are single WAL entries. They touch a single bucket page and the
metapage. The metapage is updated during replay as it is updated during
original operation.
If an insertion causes the addition of an overflow page, there will be one
WAL entry for the new overflow page and a second entry for the insert itself.
If an insertion causes a bucket split, there will be one WAL entry for insert
itself, followed by a WAL entry for allocating a new bucket, followed by a WAL
entry for each overflow bucket page in the new bucket to which the tuples are
moved from old bucket, followed by a WAL entry to indicate that split is
complete for both old and new buckets. A split operation which requires
overflow pages to complete the operation will need to write a WAL record for
each new allocation of an overflow page.
As splitting involves multiple atomic actions, it's possible that the system
crashes while moving tuples from the bucket pages of the old bucket to the new
bucket. In such a case, after recovery, the old and new buckets will be
marked with bucket-being-split and bucket-being-populated flags respectively
which indicates that split is in progress for those buckets. The reader
algorithm works correctly, as it will scan both the old and new buckets when
the split is in progress as explained in the reader algorithm section above.
We finish the split at next insert or split operation on the old bucket as
explained in insert and split algorithm above. It could be done during
searches, too, but it seems best not to put any extra updates in what would
otherwise be a read-only operation (updating is not possible in hot standby
mode anyway). It would seem natural to complete the split in VACUUM, but since
splitting a bucket might require allocating a new page, it might fail if you
run out of disk space. That would be bad during VACUUM - the reason for
running VACUUM in the first place might be that you run out of disk space,
and now VACUUM won't finish because you're out of disk space. In contrast,
an insertion can require enlarging the physical file anyway.
Deletion of tuples from a bucket is performed for two reasons: to remove dead
tuples, and to remove tuples that were moved by a bucket split. A WAL entry
is made for each bucket page from which tuples are removed, and then another
WAL entry is made when we clear the needs-split-cleanup flag. If dead tuples
are removed, a separate WAL entry is made to update the metapage.
As deletion involves multiple atomic operations, it is quite possible that the
system crashes (a) after removing tuples from some of the bucket pages, (b)
before clearing the garbage flag, or (c) before updating the metapage. If the
system crashes before completing (b), it will again try to clean the bucket
during next vacuum or insert after recovery which can have some performance
impact, but it will work fine. If the system crashes before completing (c),
after recovery there could be some additional splits until the next vacuum
updates the metapage, but the other operations like insert, delete and scan
will work correctly. We can fix this problem by actually updating the
metapage based on delete operation during replay, but it's not clear whether
it's worth the complication.
A squeeze operation moves tuples from pages later in a bucket's chain to pages
earlier in the chain, and writes a WAL record when either the page to which it
is writing tuples becomes full or the page from which it is removing tuples
becomes empty.
As a squeeze operation involves multiple atomic operations, it is quite
possible that the system crashes before completing the operation on the entire
bucket. After recovery, the operations will work correctly, but the index will
remain bloated, which can impact the performance of read and insert operations
until the next vacuum squeezes the bucket completely.
Other Notes
-----------
All the shenanigans with locking prevent a split occurring while *another*
process is stopped in a given bucket. They do not ensure that one of
our *own* backend's scans is not stopped in the bucket, because lmgr
doesn't consider a process's own locks to conflict. So the Split
algorithm must check for that case separately before deciding it can go
ahead with the split. VACUUM does not have this problem since nothing
else can be happening within the vacuuming backend.
Should we instead try to fix the state of any conflicting local scan?
Seems mighty ugly --- got to move the held bucket S-lock as well as lots
of other messiness. For now, just punt and don't split.
Cleanup locks prevent a split from occurring while *another* process is stopped
in a given bucket. They also ensure that one of our *own* backend's scans is
not stopped in the bucket.

View File

@ -3,8 +3,8 @@
* hash.cpp
* Implementation of Margo Seltzer's Hashing package for postgres.
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@ -20,6 +20,8 @@
#include "knl/knl_variable.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xloginsert.h"
#include "access/tableam.h"
#include "access/relscan.h"
#include "catalog/index.h"
@ -34,6 +36,7 @@
typedef struct {
HSpool *spool; /* NULL if not using spooling */
double indtuples; /* # tuples accepted into index */
Relation heapRel; /* heap relation descriptor */
} HashBuildState;
static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, const bool *isnull, bool tupleIsAlive,
@ -52,6 +55,7 @@ Datum hashbuild(PG_FUNCTION_ARGS)
double reltuples;
double allvisfrac;
uint32 num_buckets;
long sort_threshold;
HashBuildState buildstate;
/*
@ -66,7 +70,7 @@ Datum hashbuild(PG_FUNCTION_ARGS)
estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac, NULL);
/* Initialize the hash index metadata page and initial buckets */
num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);
num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
/*
* If we just insert the tuples into the index in scan order, then
* (assuming their hash codes are pretty random) there will be no locality
@ -74,25 +78,38 @@ Datum hashbuild(PG_FUNCTION_ARGS)
* then we'll thrash horribly. To prevent that scenario, we can sort the
* tuples by (expected) bucket number. However, such a sort is useless
* overhead when the index does fit in RAM. We choose to sort if the
* initial index size exceeds NBuffers.
* initial index size exceeds maintenance_work_mem, or the number of
* buffers usable for the index, whichever is less. (Limiting by the
* number of buffers should reduce thrashing between PG buffers and kernel
* buffers, which seems useful even if no physical I/O results. Limiting
* by maintenance_work_mem is useful to allow easy testing of the sort
* code path, and may be useful to DBAs as an additional control knob.)
*
* NOTE: this test will need adjustment if a bucket is ever different from
* one page.
* one page. Also, "initial index size" accounting does not include the
* metapage, nor the first bitmap page.
*/
if (num_buckets >= (uint32)g_instance.attr.attr_storage.NBuffers)
buildstate.spool = _h_spoolinit(index, num_buckets, &indexInfo->ii_desc);
sort_threshold = (u_sess->attr.attr_memory.maintenance_work_mem * 1024L) / BLCKSZ;
if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
sort_threshold = Min(sort_threshold, g_instance.attr.attr_storage.NBuffers);
else
sort_threshold = Min(sort_threshold, u_sess->storage_cxt.NLocBuffer);
if (num_buckets >= (uint32)sort_threshold)
buildstate.spool = _h_spoolinit(heap, index, num_buckets, &indexInfo->ii_desc);
else
buildstate.spool = NULL;
/* prepare to build the index */
buildstate.indtuples = 0;
buildstate.heapRel = heap;
/* do the heap scan */
reltuples = tableam_index_build_scan(heap, index, indexInfo, true, hashbuildCallback, (void*)&buildstate, NULL);
if (buildstate.spool != NULL) {
/* sort the tuples and insert them into the index */
_h_indexbuild(buildstate.spool);
_h_indexbuild(buildstate.spool, buildstate.heapRel);
_h_spooldestroy(buildstate.spool);
}
@ -114,7 +131,7 @@ Datum hashbuildempty(PG_FUNCTION_ARGS)
{
Relation index = (Relation)PG_GETARG_POINTER(0);
_hash_metapinit(index, 0, INIT_FORKNUM);
_hash_init(index, 0, INIT_FORKNUM);
PG_RETURN_VOID();
}
@ -126,21 +143,24 @@ static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, con
void *state)
{
HashBuildState *buildstate = (HashBuildState *)state;
Datum index_values[1];
bool index_isnull[1];
IndexTuple itup;
/* Hash indexes don't index nulls, see notes in hashinsert */
if (isnull[0]) {
/* convert data to a hash key; on failure, do not insert anything */
if (!_hash_convert_tuple(index,
values, isnull,
index_values, index_isnull))
return;
}
/* Either spool the tuple for sorting, or just put it into the index */
if (buildstate->spool != NULL) {
_h_spool(buildstate->spool, &htup->t_self, values, isnull);
_h_spool(buildstate->spool, &htup->t_self, index_values, index_isnull);
} else {
/* form an index tuple and point it at the heap tuple */
itup = _hash_form_tuple(index, values, isnull);
itup = index_form_tuple(RelationGetDescr(index), index_values, index_isnull);
itup->t_tid = htup->t_self;
_hash_doinsert(index, itup);
_hash_doinsert(index, itup, buildstate->heapRel);
pfree(itup);
}
@ -159,30 +179,22 @@ Datum hashinsert(PG_FUNCTION_ARGS)
Datum *values = (Datum *)PG_GETARG_POINTER(1);
bool *isnull = (bool *)PG_GETARG_POINTER(2);
ItemPointer ht_ctid = (ItemPointer)PG_GETARG_POINTER(3);
#ifdef NOT_USED
Relation heapRel = (Relation)PG_GETARG_POINTER(4);
IndexUniqueCheck checkUnique = (IndexUniqueCheck)PG_GETARG_INT32(5);
#endif
Datum index_values[1];
bool index_isnull[1];
IndexTuple itup;
/*
* If the single index key is null, we don't insert it into the index.
* Hash tables support scans on '='. Relational algebra says that A = B
* returns null if either A or B is null. This means that no
* qualification used in an index scan could ever return true on a null
* attribute. It also means that indices can't be used by ISNULL or
* NOTNULL scans, but that's an artifact of the strategy map architecture
* chosen in 1986, not of the way nulls are handled here.
*/
if (isnull[0])
PG_RETURN_BOOL(false);
/* convert data to a hash key; on failure, do not insert anything */
if (!_hash_convert_tuple(rel,
values, isnull,
index_values, index_isnull))
return false;
/* generate an index tuple */
itup = _hash_form_tuple(rel, values, isnull);
/* form an index tuple and point it at the heap tuple */
itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
itup->t_tid = *ht_ctid;
_hash_doinsert(rel, itup);
_hash_doinsert(rel, itup, heapRel);
pfree(itup);
@ -212,7 +224,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
* Reacquire the read lock here.
*/
if (BufferIsValid(so->hashso_curbuf))
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
/*
* If we've already initialized this scan, we can just advance it in the
@ -224,16 +236,21 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
/*
* An insertion into the current index page could have happened while
* we didn't have read lock on it. Re-find our position by looking
* for the TID we previously returned. (Because we hold share lock on
* the bucket, no deletions or splits could have occurred; therefore
* we can expect that the TID still exists in the current index page,
* at an offset >= where we were.)
* for the TID we previously returned. (Because we hold a pin on the
* primary bucket page, no deletions or splits could have occurred;
* therefore we can expect that the TID still exists in the current
* index page, at an offset >= where we were.)
*/
OffsetNumber maxoffnum;
buf = so->hashso_curbuf;
Assert(BufferIsValid(buf));
page = BufferGetPage(buf);
/*
* We don't need test for old snapshot here as the current buffer is
* pinned, so vacuum can't clean the page.
*/
maxoffnum = PageGetMaxOffsetNumber(page);
for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) {
IndexTuple itup;
@ -253,14 +270,22 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
*/
if (scan->kill_prior_tuple) {
/*
* Yes, so mark it by setting the LP_DEAD state in the item flags.
* Yes, so remember it for later. (We'll deal with all such tuples
* at once right after leaving the index page or at end of scan.)
* In case if caller reverses the indexscan direction it is quite
* possible that the same item might get entered multiple times.
* But, we don't detect that; instead, we just forget any excess
* entries.
*/
ItemIdMarkDead(PageGetItemId(page, offnum));
if (so->killedItems == NULL)
so->killedItems = (HashScanPosItem *)palloc(MaxIndexTuplesPerPage * sizeof(HashScanPosItem));
/*
* Since this can be redone later if needed, mark as a hint.
*/
MarkBufferDirtyHint(buf, true);
if (so->numKilled < MaxIndexTuplesPerPage) {
so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
so->killedItems[so->numKilled].indexOffset =
ItemPointerGetOffsetNumber(&(so->hashso_curpos));
so->numKilled++;
}
}
/*
@ -285,7 +310,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
/* Release read lock on current buffer, but keep it pinned */
if (BufferIsValid(so->hashso_curbuf))
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
/* Return current heap TID on success */
scan->xs_ctup.t_self = so->hashso_heappos;
@ -360,17 +385,20 @@ Datum hashbeginscan(PG_FUNCTION_ARGS)
scan = RelationGetIndexScan(rel, nkeys, norderbys);
so = (HashScanOpaque)palloc(sizeof(HashScanOpaqueData));
so->hashso_bucket_valid = false;
so->hashso_bucket_blkno = 0;
so->hashso_curbuf = InvalidBuffer;
so->hashso_bucket_buf = InvalidBuffer;
so->hashso_split_bucket_buf = InvalidBuffer;
/* set position invalid (this will cause _hash_first call) */
ItemPointerSetInvalid(&(so->hashso_curpos));
ItemPointerSetInvalid(&(so->hashso_heappos));
scan->opaque = so;
so->hashso_buc_populated = false;
so->hashso_buc_split = false;
/* register scan in case we change pages it's using */
_hash_regscan(scan);
so->killedItems = NULL;
so->numKilled = 0;
scan->opaque = so;
PG_RETURN_POINTER(scan);
}
@ -388,14 +416,13 @@ Datum hashrescan(PG_FUNCTION_ARGS)
Relation rel = scan->indexRelation;
/* release any pin we still hold */
if (BufferIsValid(so->hashso_curbuf))
_hash_dropbuf(rel, so->hashso_curbuf);
so->hashso_curbuf = InvalidBuffer;
if (so->numKilled > 0) {
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
_hash_kill_items(scan);
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
}
/* release lock on bucket, too */
if (so->hashso_bucket_blkno)
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
so->hashso_bucket_blkno = 0;
_hash_dropscanbuf(rel, so);
/* set position invalid (this will cause _hash_first call) */
ItemPointerSetInvalid(&(so->hashso_curpos));
@ -407,10 +434,11 @@ Datum hashrescan(PG_FUNCTION_ARGS)
rc = memmove_s(scan->keyData, (unsigned)scan->numberOfKeys * sizeof(ScanKeyData), scankey,
(unsigned)scan->numberOfKeys * sizeof(ScanKeyData));
securec_check(rc, "", "");
so->hashso_bucket_valid = false;
}
so->hashso_buc_populated = false;
so->hashso_buc_split = false;
PG_RETURN_VOID();
}
@ -423,18 +451,20 @@ Datum hashendscan(PG_FUNCTION_ARGS)
HashScanOpaque so = (HashScanOpaque)scan->opaque;
Relation rel = scan->indexRelation;
/* don't need scan registered anymore */
_hash_dropscan(scan);
/*
* Before leaving current page, deal with any killed items. Also, ensure
* that we acquire lock on current page before calling _hash_kill_items.
*/
if (so->numKilled > 0) {
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
_hash_kill_items(scan);
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
}
/* release any pin we still hold */
if (BufferIsValid(so->hashso_curbuf))
_hash_dropbuf(rel, so->hashso_curbuf);
so->hashso_curbuf = InvalidBuffer;
_hash_dropscanbuf(rel, so);
/* release lock on bucket, too */
if (so->hashso_bucket_blkno)
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
so->hashso_bucket_blkno = 0;
if (so->killedItems != NULL)
pfree(so->killedItems);
pfree(so);
scan->opaque = NULL;
@ -465,6 +495,9 @@ Datum hashrestrpos(PG_FUNCTION_ARGS)
* The set of target tuples is specified via a callback routine that tells
* whether any given heap tuple (identified by ItemPointer) is being deleted.
*
* This function also deletes the tuples that are moved by split to other
* bucket.
*
* Result: a palloc'd struct containing statistical info for VACUUM displays.
*/
Datum hashbulkdelete(PG_FUNCTION_ARGS)
@ -480,29 +513,24 @@ Datum hashbulkdelete(PG_FUNCTION_ARGS)
Bucket orig_maxbucket;
Bucket cur_maxbucket;
Bucket cur_bucket;
Buffer metabuf;
Buffer metabuf = InvalidBuffer;
HashMetaPage metap;
HashMetaPageData local_metapage;
errno_t rc;
HashMetaPage cachedmetap;
tuples_removed = 0;
num_index_tuples = 0;
/*
* Read the metapage to fetch original bucket and tuple counts. Also, we
* keep a copy of the last-seen metapage so that we can use its
* hashm_spares[] values to compute bucket page addresses. This is a bit
* hokey but perfectly safe, since the interesting entries in the spares
* array cannot change under us; and it beats rereading the metapage for
* each bucket.
* We need a copy of the metapage so that we can use its hashm_spares[]
* values to compute bucket page addresses, but a cached copy should be
* good enough. (If not, we'll detect that further down and refresh the
* cache as necessary.)
*/
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
orig_maxbucket = metap->hashm_maxbucket;
orig_ntuples = metap->hashm_ntuples;
rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage));
securec_check(rc, "", "");
_hash_relbuf(rel, metabuf);
cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
Assert(cachedmetap != NULL);
orig_maxbucket = cachedmetap->hashm_maxbucket;
orig_ntuples = cachedmetap->hashm_ntuples;
/* Scan the buckets that we know exist */
cur_bucket = 0;
@ -512,90 +540,85 @@ loop_top:
while (cur_bucket <= cur_maxbucket) {
BlockNumber bucket_blkno;
BlockNumber blkno;
bool bucket_dirty = false;
Buffer bucket_buf;
Buffer buf;
HashPageOpaque bucket_opaque;
Page page;
bool split_cleanup = false;
/* Get address of bucket's start page */
bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
/* Exclusive-lock the bucket so we can shrink it */
_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);
/* Shouldn't have any active scans locally, either */
if (_hash_has_active_scan(rel, cur_bucket))
ereport(ERROR,
(errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), (errmsg("hash index has active scan during VACUUM."))));
/* Scan each page in bucket */
blkno = bucket_blkno;
while (BlockNumberIsValid(blkno)) {
Buffer buf;
Page page;
HashPageOpaque opaque;
OffsetNumber offno;
OffsetNumber maxoffno;
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
vacuum_delay_point();
/*
* We need to acquire a cleanup lock on the primary bucket page to wait
* out concurrent scans before deleting the dead tuples.
*/
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
LockBufferForCleanup(buf);
_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy);
page = BufferGetPage(buf);
opaque = (HashPageOpaque)PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == cur_bucket);
page = BufferGetPage(buf);
bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
/* Scan each tuple in page */
maxoffno = PageGetMaxOffsetNumber(page);
for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
IndexTuple itup;
ItemPointer htup;
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offno));
htup = &(itup->t_tid);
if (callback(htup, callback_state, InvalidOid, InvalidBktId)) {
/* mark the item for deletion */
deletable[ndeletable++] = offno;
tuples_removed += 1;
} else
num_index_tuples += 1;
}
/*
* If the bucket contains tuples that are moved by split, then we need
* to delete such tuples. We can't delete such tuples if the split
* operation on bucket is not finished as those are needed by scans.
*/
if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) {
split_cleanup = true;
/*
* Apply deletions and write page if needed, advance to next page.
* This bucket might have been split since we last held a lock on
* the metapage. If so, hashm_maxbucket, hashm_highmask and
* hashm_lowmask might be old enough to cause us to fail to remove
* tuples left behind by the most recent split. To prevent that,
* now that the primary page of the target bucket has been locked
* (and thus can't be further split), check whether we need to
* update our cached metapage data.
*/
blkno = opaque->hasho_nextblkno;
if (ndeletable > 0) {
PageIndexMultiDelete(page, deletable, ndeletable);
_hash_wrtbuf(rel, buf);
bucket_dirty = true;
} else
_hash_relbuf(rel, buf);
Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) {
cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
Assert(cachedmetap != NULL);
}
}
/* If we deleted anything, try to compact free space */
if (bucket_dirty)
_hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy);
bucket_buf = buf;
/* Release bucket lock */
_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);
hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
cachedmetap->hashm_maxbucket,
cachedmetap->hashm_highmask,
cachedmetap->hashm_lowmask, &tuples_removed,
&num_index_tuples, split_cleanup,
callback, callback_state);
_hash_dropbuf(rel, bucket_buf);
/* Advance to next bucket */
cur_bucket++;
}
if (BufferIsInvalid(metabuf))
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
/* Write-lock metapage and check for split since we started */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
if (cur_maxbucket != metap->hashm_maxbucket) {
/* There's been a split, so process the additional bucket(s) */
cur_maxbucket = metap->hashm_maxbucket;
rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage));
securec_check(rc, "", "");
_hash_relbuf(rel, metabuf);
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
Assert(cachedmetap != NULL);
cur_maxbucket = cachedmetap->hashm_maxbucket;
goto loop_top;
}
/* Okay, we're really done. Update tuple count in metapage. */
START_CRIT_SECTION();
if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) {
/*
* No one has split or inserted anything since start of scan, so
@ -616,7 +639,27 @@ loop_top:
num_index_tuples = metap->hashm_ntuples;
}
_hash_wrtbuf(rel, metabuf);
MarkBufferDirty(metabuf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
xl_hash_update_meta_page xlrec;
XLogRecPtr recptr;
xlrec.ntuples = metap->hashm_ntuples;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
PageSetLSN(BufferGetPage(metabuf), recptr);
}
END_CRIT_SECTION();
_hash_relbuf(rel, metabuf);
/* return statistics */
if (stats == NULL)
@ -652,9 +695,244 @@ Datum hashvacuumcleanup(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(stats);
}
void hash_redo(XLogReaderState *record)
/*
* Helper function to perform deletion of index entries from a bucket.
*
* This function expects that the caller has acquired a cleanup lock on the
* primary bucket page, and will return with a write lock again held on the
* primary bucket page. The lock won't necessarily be held continuously,
* though, because we'll release it when visiting overflow pages.
*
* It would be very bad if this function cleaned a page while some other
* backend was in the midst of scanning it, because hashgettuple assumes
* that the next valid TID will be greater than or equal to the current
* valid TID. There can't be any concurrent scans in progress when we first
* enter this function because of the cleanup lock we hold on the primary
* bucket page, but as soon as we release that lock, there might be. We
* handle that by conspiring to prevent those scans from passing our cleanup
* scan. To do that, we lock the next page in the bucket chain before
* releasing the lock on the previous page. (This type of lock chaining is
* not ideal, so we might want to look for a better solution at some point.)
*
* We need to retain a pin on the primary bucket to ensure that no concurrent
* split can start.
*/
void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
uint32 maxbucket, uint32 highmask, uint32 lowmask,
double *tuples_removed, double *num_index_tuples,
bool split_cleanup,
IndexBulkDeleteCallback callback, void *callback_state)
{
BlockNumber blkno;
Buffer buf;
Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
bool bucket_dirty = false;
blkno = bucket_blkno;
buf = bucket_buf;
if (split_cleanup)
new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
lowmask, maxbucket);
/* Scan each page in bucket */
for (;;) {
HashPageOpaque opaque;
OffsetNumber offno;
OffsetNumber maxoffno;
Buffer next_buf;
Page page;
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
bool retain_pin = false;
bool clear_dead_marking = false;
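/* Allow the cost-based vacuum delay to throttle us between pages. */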
vacuum_delay_point();
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
/* Scan each tuple in page */
maxoffno = PageGetMaxOffsetNumber(page);
for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
ItemPointer htup;
IndexTuple itup;
Bucket bucket;
bool kill_tuple = false;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno));
htup = &(itup->t_tid);
/*
* To remove the dead tuples, we strictly want to rely on results
* of callback function. refer btvacuumpage for detailed reason.
*/
if (callback && callback(htup, callback_state, InvalidOid, InvalidBktId)) {
kill_tuple = true;
if (tuples_removed)
*tuples_removed += 1;
} else if (split_cleanup) {
/* delete the tuples that are moved by split. */
bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
maxbucket, highmask, lowmask);
/* mark the item for deletion */
if (bucket != cur_bucket) {
/*
* We expect tuples to either belong to current bucket or
* new_bucket. This is ensured because we don't allow
* further splits from bucket that contains garbage. See
* comments in _hash_expandtable.
*/
Assert(bucket == new_bucket);
kill_tuple = true;
}
}
if (kill_tuple) {
/* mark the item for deletion */
deletable[ndeletable++] = offno;
} else {
/* we're keeping it, so count it */
if (num_index_tuples)
*num_index_tuples += 1;
}
}
/* retain the pin on primary bucket page till end of bucket scan */
if (blkno == bucket_blkno)
retain_pin = true;
else
retain_pin = false;
blkno = opaque->hasho_nextblkno;
/*
* Apply deletions, advance to next page and write page if needed.
*/
if (ndeletable > 0) {
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
PageIndexMultiDelete(page, deletable, ndeletable);
bucket_dirty = true;
/*
* Let us mark the page as clean if vacuum removes the DEAD tuples
* from an index page. We do this by clearing
* LH_PAGE_HAS_DEAD_TUPLES flag.
*/
if (tuples_removed && *tuples_removed > 0 && H_HAS_DEAD_TUPLES(opaque)) {
opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
clear_dead_marking = true;
}
MarkBufferDirty(buf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
xl_hash_delete xlrec;
XLogRecPtr recptr;
xlrec.clear_dead_marking = clear_dead_marking;
xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
/*
* bucket buffer needs to be registered to ensure that we can
* acquire a cleanup lock on it during replay.
*/
if (!xlrec.is_primary_bucket_page) {
XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
}
XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber));
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
if (!xlrec.is_primary_bucket_page) {
PageSetLSN(BufferGetPage(bucket_buf), recptr);
}
PageSetLSN(BufferGetPage(buf), recptr);
}
END_CRIT_SECTION();
}
/* bail out if there are no more pages to scan. */
if (!BlockNumberIsValid(blkno))
break;
next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
/*
* release the lock on previous page after acquiring the lock on next
* page
*/
if (retain_pin)
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
else
_hash_relbuf(rel, buf);
buf = next_buf;
}
/*
* lock the bucket page to clear the garbage flag and squeeze the bucket.
* if the current buffer is same as bucket buffer, then we already have
* lock on bucket page.
*/
if (buf != bucket_buf) {
_hash_relbuf(rel, buf);
LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
}
/*
* Clear the garbage flag from bucket after deleting the tuples that are
* moved by split. We purposefully clear the flag before squeeze bucket,
* so that after restart, vacuum shouldn't again try to delete the moved
* by split tuples.
*/
if (split_cleanup) {
HashPageOpaque bucket_opaque;
Page page;
page = BufferGetPage(bucket_buf);
bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
MarkBufferDirty(bucket_buf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
XLogRecPtr recptr;
XLogBeginInsert();
XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
}
/*
* If we have deleted anything, try to compact free space. For squeezing
* the bucket, we must have a cleanup lock, else it can impact the
* ordering of tuples for a scan that has started before it.
*/
if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy);
else
LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}
Datum hashmerge(PG_FUNCTION_ARGS)

View File

@ -0,0 +1,861 @@
/* -------------------------------------------------------------------------
*
* hash_xlog.cpp
* WAL replay logic for hash index.
*
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/gausskernel/storage/access/hash/hash_xlog.cpp
*
* -------------------------------------------------------------------------
*/
#include "access/xlogproc.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "access/transam.h"
#include "access/xlogproc.h"
#include "storage/procarray.h"
#include "miscadmin.h"
/*
* replay a hash index meta page
*/
static void hash_xlog_init_meta_page(XLogReaderState *record)
{
RedoBufferInfo metabuf;
ForkNumber forknum;
/* create the index' metapage */
XLogInitBufferForRedo(record, 0, &metabuf);
Assert(BufferIsValid(metabuf.buf));
HashRedoInitMetaPageOperatorPage(&metabuf, XLogRecGetData(record));
MarkBufferDirty(metabuf.buf);
/*
* Force the on-disk state of init forks to always be in sync with the
* state in shared buffers. See XLogReadBufferForRedoExtended. We need
* special handling for init forks as create index operations don't log a
* full page image of the metapage.
*/
XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
if (forknum == INIT_FORKNUM)
FlushOneBuffer(metabuf.buf);
/* all done */
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay a hash index bitmap page
*/
static void hash_xlog_init_bitmap_page(XLogReaderState *record)
{
RedoBufferInfo bitmapbuf;
RedoBufferInfo metabuf;
ForkNumber forknum;
/*
* Initialize bitmap page
*/
XLogInitBufferForRedo(record, 0, &bitmapbuf);
HashRedoInitBitmapPageOperatorBitmapPage(&bitmapbuf, XLogRecGetData(record));
MarkBufferDirty(bitmapbuf.buf);
/*
* Force the on-disk state of init forks to always be in sync with the
* state in shared buffers. See XLogReadBufferForRedoExtended. We need
* special handling for init forks as create index operations don't log a
* full page image of the metapage.
*/
XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
if (forknum == INIT_FORKNUM)
FlushOneBuffer(bitmapbuf.buf);
UnlockReleaseBuffer(bitmapbuf.buf);
/* add the new bitmap page to the metapage's list of bitmaps */
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
/*
* Note: in normal operation, we'd update the metapage while still
* holding lock on the bitmap page. But during replay it's not
* necessary to hold that lock, since nobody can see it yet; the
* creating transaction hasn't yet committed.
*/
HashRedoInitBitmapPageOperatorMetaPage(&metabuf);
MarkBufferDirty(metabuf.buf);
XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL);
if (forknum == INIT_FORKNUM)
FlushOneBuffer(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay a hash index insert without split
*/
static void hash_xlog_insert(XLogReaderState *record)
{
RedoBufferInfo buffer;
RedoBufferInfo metabuf;
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) {
Size datalen;
char *datapos = XLogRecGetBlockData(record, 0, &datalen);
HashRedoInsertOperatorPage(&buffer, XLogRecGetData(record), datapos, datalen);
MarkBufferDirty(buffer.buf);
}
if (BufferIsValid(buffer.buf))
UnlockReleaseBuffer(buffer.buf);
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
/*
* Note: in normal operation, we'd update the metapage while still
* holding lock on the page we inserted into. But during replay it's
* not necessary to hold that lock, since no other index updates can
* be happening concurrently.
*/
HashRedoInsertOperatorMetaPage(&metabuf);
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay addition of overflow page for hash index
*/
static void hash_xlog_add_ovfl_page(XLogReaderState* record)
{
RedoBufferInfo leftbuf;
RedoBufferInfo ovflbuf;
RedoBufferInfo metabuf;
BlockNumber leftblk;
BlockNumber rightblk;
char *data = NULL;
Size datalen;
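/*
 * Block references in this record: 0 is the new overflow page, 1 is the page
 * it is linked after, 2 (if present) is the bitmap page that tracks it,
 * 3 (if present) is a newly added bitmap page, and 4 is the metapage.
 */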
XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk);
XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk);
XLogInitBufferForRedo(record, 0, &ovflbuf);
Assert(BufferIsValid(ovflbuf.buf));
data = XLogRecGetBlockData(record, 0, &datalen);
HashRedoAddOvflPageOperatorOvflPage(&ovflbuf, leftblk, data, datalen);
MarkBufferDirty(ovflbuf.buf);
if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) {
HashRedoAddOvflPageOperatorLeftPage(&leftbuf, rightblk);
MarkBufferDirty(leftbuf.buf);
}
if (BufferIsValid(leftbuf.buf))
UnlockReleaseBuffer(leftbuf.buf);
UnlockReleaseBuffer(ovflbuf.buf);
/*
* Note: in normal operation, we'd update the bitmap and meta page while
* still holding lock on the overflow pages. But during replay it's not
* necessary to hold those locks, since no other index updates can be
* happening concurrently.
*/
if (XLogRecHasBlockRef(record, 2)) {
RedoBufferInfo mapbuffer;
if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) {
data = XLogRecGetBlockData(record, 2, &datalen);
HashRedoAddOvflPageOperatorMapPage(&mapbuffer, data);
MarkBufferDirty(mapbuffer.buf);
}
if (BufferIsValid(mapbuffer.buf))
UnlockReleaseBuffer(mapbuffer.buf);
}
if (XLogRecHasBlockRef(record, 3)) {
RedoBufferInfo newmapbuf;
XLogInitBufferForRedo(record, 3, &newmapbuf);
HashRedoAddOvflPageOperatorNewmapPage(&newmapbuf, XLogRecGetData(record));
MarkBufferDirty(newmapbuf.buf);
UnlockReleaseBuffer(newmapbuf.buf);
}
if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) {
data = XLogRecGetBlockData(record, 4, &datalen);
HashRedoAddOvflPageOperatorMetaPage(&metabuf, XLogRecGetData(record), data, datalen);
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay allocation of page for split operation
*/
static void hash_xlog_split_allocate_page(XLogReaderState *record)
{
RedoBufferInfo oldbuf;
RedoBufferInfo newbuf;
RedoBufferInfo metabuf;
Size datalen PG_USED_FOR_ASSERTS_ONLY;
char *data = NULL;
XLogRedoAction action;
/*
* To be consistent with normal operation, here we take cleanup locks on
* both the old and new buckets even though there can't be any concurrent
* inserts.
*/
/* replay the record for old bucket */
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf);
/*
* Note that we still update the page even if it was restored from a full
* page image, because the special space is not included in the image.
*/
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
HashRedoSplitAllocatePageOperatorObukPage(&oldbuf, XLogRecGetData(record));
MarkBufferDirty(oldbuf.buf);
}
/* replay the record for new bucket */
XLogInitBufferForRedo(record, 1, &newbuf);
HashRedoSplitAllocatePageOperatorNbukPage(&newbuf, XLogRecGetData(record));
if (!IsBufferCleanupOK(newbuf.buf))
elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock");
MarkBufferDirty(newbuf.buf);
/*
 * We can release the lock on the old bucket early as well, but we do it here
 * to be consistent with normal operation.
*/
if (BufferIsValid(oldbuf.buf))
UnlockReleaseBuffer(oldbuf.buf);
if (BufferIsValid(newbuf.buf))
UnlockReleaseBuffer(newbuf.buf);
/*
* Note: in normal operation, we'd update the meta page while still
* holding lock on the old and new bucket pages. But during replay it's
* not necessary to hold those locks, since no other bucket splits can be
* happening concurrently.
*/
/* replay the record for metapage changes */
if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) {
data = XLogRecGetBlockData(record, 2, &datalen);
HashRedoSplitAllocatePageOperatorMetaPage(&metabuf, XLogRecGetData(record), data);
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay of split operation
*/
static void hash_xlog_split_page(XLogReaderState *record)
{
RedoBufferInfo buf;
if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED)
elog(ERROR, "Hash split record did not contain a full-page image");
if (BufferIsValid(buf.buf))
UnlockReleaseBuffer(buf.buf);
}
/*
* replay completion of split operation
*/
static void hash_xlog_split_complete(XLogReaderState *record)
{
RedoBufferInfo oldbuf;
RedoBufferInfo newbuf;
XLogRedoAction action;
/* replay the record for old bucket */
action = XLogReadBufferForRedo(record, 0, &oldbuf);
/*
* Note that we still update the page even if it was restored from a full
* page image, because the bucket flag is not included in the image.
*/
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
HashRedoSplitCompleteOperatorObukPage(&oldbuf, XLogRecGetData(record));
MarkBufferDirty(oldbuf.buf);
}
if (BufferIsValid(oldbuf.buf))
UnlockReleaseBuffer(oldbuf.buf);
/* replay the record for new bucket */
action = XLogReadBufferForRedo(record, 1, &newbuf);
/*
* Note that we still update the page even if it was restored from a full
* page image, because the bucket flag is not included in the image.
*/
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
HashRedoSplitCompleteOperatorNbukPage(&newbuf, XLogRecGetData(record));
MarkBufferDirty(newbuf.buf);
}
if (BufferIsValid(newbuf.buf))
UnlockReleaseBuffer(newbuf.buf);
}
/*
* replay move of page contents for squeeze operation of hash index
*/
static void hash_xlog_move_page_contents(XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record);
RedoBufferInfo bucketbuf;
RedoBufferInfo writebuf;
RedoBufferInfo deletebuf;
XLogRedoAction action;
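/*
 * Pre-set to InvalidBuffer so the BufferIsValid() checks at the end stay
 * safe even when a branch below leaves one of these buffers untouched.
 */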
bucketbuf.buf = InvalidBuffer;
writebuf.buf = InvalidBuffer;
deletebuf.buf = InvalidBuffer;
/*
* Ensure we have a cleanup lock on primary bucket page before we start
* with the actual replay operation. This is to ensure that neither a
* scan can start nor a scan can be already-in-progress during the replay
* of this operation. If we allow scans during this operation, then they
* can miss some records or show the same record multiple times.
*/
if (xldata->is_prim_bucket_same_wrt) {
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
} else {
/*
* we don't care for return value as the purpose of reading bucketbuf
* is to ensure a cleanup lock on primary bucket page.
*/
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
PageSetLSN(bucketbuf.pageinfo.page, lsn);
action = XLogReadBufferForRedo(record, 1, &writebuf);
}
/* replay the record for adding entries in overflow buffer */
if (action == BLK_NEEDS_REDO) {
char *data = NULL;
Size datalen;
data = XLogRecGetBlockData(record, 1, &datalen);
HashXlogMoveAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen);
MarkBufferDirty(writebuf.buf);
}
/* replay the record for deleting entries from overflow buffer */
if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) {
char *ptr = NULL;
Size len;
ptr = XLogRecGetBlockData(record, 2, &len);
HashXlogMoveDeleteOvflPageOperatorPage(&deletebuf, (void *)ptr, len);
MarkBufferDirty(deletebuf.buf);
}
/*
* Replay is complete, now we can release the buffers. We release locks at
* end of replay operation to ensure that we hold lock on primary bucket
* page till end of operation. We can optimize by releasing the lock on
* write buffer as soon as the operation for same is complete, if it is
* not same as primary bucket page, but that doesn't seem to be worth
* complicating the code.
*/
if (BufferIsValid(deletebuf.buf))
UnlockReleaseBuffer(deletebuf.buf);
if (BufferIsValid(writebuf.buf))
UnlockReleaseBuffer(writebuf.buf);
if (BufferIsValid(bucketbuf.buf))
UnlockReleaseBuffer(bucketbuf.buf);
}
/*
* replay squeeze page operation of hash index
*/
static void hash_xlog_squeeze_page(XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record);
RedoBufferInfo bucketbuf;
RedoBufferInfo writebuf;
RedoBufferInfo ovflbuf;
RedoBufferInfo prevbuf;
RedoBufferInfo mapbuf;
XLogRedoAction action;
bucketbuf.buf = InvalidBuffer;
prevbuf.buf = InvalidBuffer;
/*
* Ensure we have a cleanup lock on primary bucket page before we start
* with the actual replay operation. This is to ensure that neither a
* scan can start nor a scan can be already-in-progress during the replay
* of this operation. If we allow scans during this operation, then they
* can miss some records or show the same record multiple times.
*/
if (xldata->is_prim_bucket_same_wrt) {
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
} else {
/*
* we don't care for return value as the purpose of reading bucketbuf
* is to ensure a cleanup lock on primary bucket page.
*/
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
PageSetLSN(bucketbuf.pageinfo.page, lsn);
action = XLogReadBufferForRedo(record, 1, &writebuf);
}
/* replay the record for adding entries in overflow buffer */
if (action == BLK_NEEDS_REDO) {
char *data = NULL;
Size datalen;
data = XLogRecGetBlockData(record, 1, &datalen);
HashXlogSqueezeAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen);
MarkBufferDirty(writebuf.buf);
}
/* replay the record for initializing overflow buffer */
if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) {
HashXlogSqueezeInitOvflbufOperatorPage(&ovflbuf, XLogRecGetData(record));
MarkBufferDirty(ovflbuf.buf);
}
if (BufferIsValid(ovflbuf.buf))
UnlockReleaseBuffer(ovflbuf.buf);
/* replay the record for page previous to the freed overflow page */
if (!xldata->is_prev_bucket_same_wrt &&
XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) {
HashXlogSqueezeUpdatePrevPageOperatorPage(&prevbuf, XLogRecGetData(record));
MarkBufferDirty(prevbuf.buf);
}
if (BufferIsValid(prevbuf.buf))
UnlockReleaseBuffer(prevbuf.buf);
/* replay the record for page next to the freed overflow page */
if (XLogRecHasBlockRef(record, 4)) {
RedoBufferInfo nextbuf;
if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) {
HashXlogSqueezeUpdateNextPageOperatorPage(&nextbuf, XLogRecGetData(record));
MarkBufferDirty(nextbuf.buf);
}
if (BufferIsValid(nextbuf.buf))
UnlockReleaseBuffer(nextbuf.buf);
}
if (BufferIsValid(writebuf.buf))
UnlockReleaseBuffer(writebuf.buf);
if (BufferIsValid(bucketbuf.buf))
UnlockReleaseBuffer(bucketbuf.buf);
/*
* Note: in normal operation, we'd update the bitmap and meta page while
* still holding lock on the primary bucket page and overflow pages. But
* during replay it's not necessary to hold those locks, since no other
* index updates can be happening concurrently.
*/
/* replay the record for bitmap page */
if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) {
char *data = NULL;
Size datalen;
data = XLogRecGetBlockData(record, 5, &datalen);
HashXlogSqueezeUpdateBitmapOperatorPage(&mapbuf, (void *)data);
MarkBufferDirty(mapbuf.buf);
}
if (BufferIsValid(mapbuf.buf))
UnlockReleaseBuffer(mapbuf.buf);
/* replay the record for meta page */
if (XLogRecHasBlockRef(record, 6)) {
RedoBufferInfo metabuf;
if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) {
char *data = NULL;
Size datalen;
data = XLogRecGetBlockData(record, 6, &datalen);
HashXlogSqueezeUpdateMateOperatorPage(&metabuf, (void *)data);
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
}
/*
* replay delete operation of hash index
*/
static void hash_xlog_delete(XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record);
RedoBufferInfo bucketbuf;
RedoBufferInfo deletebuf;
XLogRedoAction action;
bucketbuf.buf = InvalidBuffer;
/*
* Ensure we have a cleanup lock on primary bucket page before we start
* with the actual replay operation. This is to ensure that neither a
* scan can start nor a scan can be already-in-progress during the replay
* of this operation. If we allow scans during this operation, then they
* can miss some records or show the same record multiple times.
*/
if (xldata->is_primary_bucket_page) {
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf);
} else {
/*
* we don't care for return value as the purpose of reading bucketbuf
* is to ensure a cleanup lock on primary bucket page.
*/
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
PageSetLSN(bucketbuf.pageinfo.page, lsn);
action = XLogReadBufferForRedo(record, 1, &deletebuf);
}
/* replay the record for deleting entries in bucket page */
if (action == BLK_NEEDS_REDO) {
char *ptr = NULL;
Size len;
ptr = XLogRecGetBlockData(record, 1, &len);
HashXlogDeleteBlockOperatorPage(&deletebuf, XLogRecGetData(record), (void *)ptr, len);
MarkBufferDirty(deletebuf.buf);
}
if (BufferIsValid(deletebuf.buf))
UnlockReleaseBuffer(deletebuf.buf);
if (BufferIsValid(bucketbuf.buf))
UnlockReleaseBuffer(bucketbuf.buf);
}
/*
* replay split cleanup flag operation for primary bucket page.
*/
static void hash_xlog_split_cleanup(XLogReaderState *record)
{
RedoBufferInfo buffer;
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) {
HashXlogSplitCleanupOperatorPage(&buffer);
MarkBufferDirty(buffer.buf);
}
if (BufferIsValid(buffer.buf))
UnlockReleaseBuffer(buffer.buf);
}
/*
* replay for update meta page
*/
static void hash_xlog_update_meta_page(XLogReaderState *record)
{
RedoBufferInfo metabuf;
if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) {
HashXlogUpdateMetaOperatorPage(&metabuf, XLogRecGetData(record));
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* Get the latestRemovedXid from the heap pages pointed at by the index
* tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
* on which this function is based.
*/
static TransactionId hash_xlog_vacuum_get_latestRemovedXid(XLogReaderState *record)
{
xl_hash_vacuum_one_page *xlrec;
OffsetNumber *unused = NULL;
Buffer ibuffer;
Buffer hbuffer;
Page ipage;
Page hpage;
RelFileNode rnode;
BlockNumber blkno;
ItemId iitemid;
ItemId hitemid;
IndexTuple itup;
BlockNumber hblkno;
OffsetNumber hoffnum;
TransactionId latestRemovedXid = InvalidTransactionId;
int i;
xlrec = (xl_hash_vacuum_one_page *) XLogRecGetData(record);
/*
* If there's nothing running on the standby we don't need to derive a
* full latestRemovedXid value, so use a fast path out of here. This
* returns InvalidTransactionId, and so will conflict with all HS
 * transactions; but since we just worked out that that's zero backends,
* it's OK.
*
* XXX There is a race condition here, which is that a new backend might
* start just after we look. If so, it cannot need to conflict, but this
* coding will result in throwing a conflict anyway.
*/
if (CountDBBackends(InvalidOid) == 0)
return latestRemovedXid;
/*
* Check if WAL replay has reached a consistent database state. If not, we
* must PANIC. See the definition of
* btree_xlog_delete_get_latestRemovedXid for more details.
*/
if (!t_thrd.xlog_cxt.reachedConsistency)
elog(PANIC, "hash_xlog_vacuum_get_latestRemovedXid: cannot operate with inconsistent data");
/*
* Get index page. If the DB is consistent, this should not fail, nor
* should any of the heap page fetches below. If one does, we return
* InvalidTransactionId to cancel all HS transactions. That's probably
* overkill, but it's safe, and certainly better than panicking here.
*/
XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
if (!BufferIsValid(ibuffer))
return InvalidTransactionId;
LockBuffer(ibuffer, HASH_READ);
ipage = (Page) BufferGetPage(ibuffer);
/*
* Loop through the deleted index items to obtain the TransactionId from
* the heap items they point to.
*/
unused = (OffsetNumber *) ((char *) xlrec + SizeOfHashVacuumOnePage);
for (i = 0; i < xlrec->ntuples; i++) {
/*
* Identify the index tuple about to be deleted.
*/
iitemid = PageGetItemId(ipage, unused[i]);
itup = (IndexTuple) PageGetItem(ipage, iitemid);
/*
* Locate the heap page that the index tuple points at
*/
hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL, NULL);
if (!BufferIsValid(hbuffer)) {
UnlockReleaseBuffer(ibuffer);
return InvalidTransactionId;
}
LockBuffer(hbuffer, HASH_READ);
hpage = (Page) BufferGetPage(hbuffer);
/*
* Look up the heap tuple header that the index tuple points at by
* using the heap node supplied with the xlrec. We can't use
* heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
* Note that we are not looking at tuple data here, just headers.
*/
hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
hitemid = PageGetItemId(hpage, hoffnum);
/*
* Follow any redirections until we find something useful.
*/
while (ItemIdIsRedirected(hitemid)) {
hoffnum = ItemIdGetRedirect(hitemid);
hitemid = PageGetItemId(hpage, hoffnum);
CHECK_FOR_INTERRUPTS();
}
/*
* If the heap item has storage, then read the header and use that to
* set latestRemovedXid.
*
* Some LP_DEAD items may not be accessible, so we ignore them.
*/
if (ItemIdHasStorage(hitemid)) {
HeapTupleData tuple;
tuple.t_data = (HeapTupleHeader) PageGetItem(hpage, hitemid);
HeapTupleCopyBaseFromPage(&tuple, &hpage);
HeapTupleHeaderAdvanceLatestRemovedXid(&tuple, &latestRemovedXid);
} else if (ItemIdIsDead(hitemid)) {
/*
* Conjecture: if hitemid is dead then it had xids before the xids
* marked on LP_NORMAL items. So we just ignore this item and move
* onto the next, for the purposes of calculating
* latestRemovedxids.
*/
} else
Assert(!ItemIdIsUsed(hitemid));
UnlockReleaseBuffer(hbuffer);
}
UnlockReleaseBuffer(ibuffer);
/*
* If all heap tuples were LP_DEAD then we will be returning
* InvalidTransactionId here, which avoids conflicts. This matches
* existing logic which assumes that LP_DEAD tuples must already be older
* than the latestRemovedXid on the cleanup record that set them as
* LP_DEAD, hence must already have generated a conflict.
*/
return latestRemovedXid;
}
/*
* replay delete operation in hash index to remove
* tuples marked as DEAD during index tuple insertion.
*/
static void hash_xlog_vacuum_one_page(XLogReaderState *record)
{
RedoBufferInfo buffer;
RedoBufferInfo metabuf;
XLogRedoAction action;
/*
* If we have any conflict processing to do, it must happen before we
* update the page.
*
* Hash index records that are marked as LP_DEAD and being removed during
* hash index tuple insertion can conflict with standby queries. You might
* think that vacuum records would conflict as well, but we've handled
* that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
* cleaned by the vacuum of the heap and so we can resolve any conflicts
* just once when that arrives. After that we know that no conflicts
* exist from individual hash index vacuum records on that index.
*/
if (InHotStandby) {
TransactionId latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record);
RelFileNode rnode;
XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
}
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer);
if (action == BLK_NEEDS_REDO) {
Size len;
len = XLogRecGetDataLen(record);
HashXlogVacuumOnePageOperatorPage(&buffer, XLogRecGetData(record), len);
MarkBufferDirty(buffer.buf);
}
if (BufferIsValid(buffer.buf))
UnlockReleaseBuffer(buffer.buf);
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
HashXlogVacuumMateOperatorPage(&metabuf, XLogRecGetData(record));
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
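/*
 * Redo entry point for the hash index resource manager: dispatch each WAL
 * record to the matching replay routine above.
 */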
void hash_redo(XLogReaderState *record)
{
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
switch (info) {
case XLOG_HASH_INIT_META_PAGE:
hash_xlog_init_meta_page(record);
break;
case XLOG_HASH_INIT_BITMAP_PAGE:
hash_xlog_init_bitmap_page(record);
break;
case XLOG_HASH_INSERT:
hash_xlog_insert(record);
break;
case XLOG_HASH_ADD_OVFL_PAGE:
hash_xlog_add_ovfl_page(record);
break;
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
hash_xlog_split_allocate_page(record);
break;
case XLOG_HASH_SPLIT_PAGE:
hash_xlog_split_page(record);
break;
case XLOG_HASH_SPLIT_COMPLETE:
hash_xlog_split_complete(record);
break;
case XLOG_HASH_MOVE_PAGE_CONTENTS:
hash_xlog_move_page_contents(record);
break;
case XLOG_HASH_SQUEEZE_PAGE:
hash_xlog_squeeze_page(record);
break;
case XLOG_HASH_DELETE:
hash_xlog_delete(record);
break;
case XLOG_HASH_SPLIT_CLEANUP:
hash_xlog_split_cleanup(record);
break;
case XLOG_HASH_UPDATE_META_PAGE:
hash_xlog_update_meta_page(record);
break;
case XLOG_HASH_VACUUM_ONE_PAGE:
hash_xlog_vacuum_one_page(record);
break;
default:
elog(PANIC, "hash_redo: unknown op code %u", info);
}
}
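/*
 * Report whether this WAL record is a hash-index delete record
 * (XLOG_HASH_DELETE), i.e. one emitted by hashbucketcleanup when it removes
 * tuples from a bucket page.
 */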
bool IsHashVacuumPages(XLogReaderState *record)
{
uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
if (XLogRecGetRmid(record) == RM_HASH_ID) {
if (info == XLOG_HASH_DELETE) {
return true;
}
}
return false;
}

View File

@ -3,8 +3,8 @@
* hashinsert.cpp
* Item insertion in hash tables for Postgres.
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@ -17,21 +17,30 @@
#include "knl/knl_variable.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "storage/lock/lwlock.h"
#include "storage/buf/buf_internals.h"
static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode);
/*
* _hash_doinsert() -- Handle insertion of a single index tuple.
*
* This routine is called by the public interface routines, hashbuild
* and hashinsert. By here, itup is completely filled in.
* This routine is called by the public interface routines, hashbuild
* and hashinsert. By here, itup is completely filled in.
*/
void _hash_doinsert(Relation rel, IndexTuple itup)
void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
Buffer buf;
Buffer bucket_buf;
Buffer metabuf;
HashMetaPage metap;
BlockNumber blkno;
HashMetaPage usedmetap = NULL;
Page metapage;
Page page;
HashPageOpaque pageopaque;
@ -39,7 +48,7 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
bool do_expand = false;
uint32 hashkey;
Bucket bucket;
OffsetNumber itup_off;
/*
* Get the hash key for the item (it's stored in the index tuple itself).
*/
@ -49,16 +58,16 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
itemsz = IndexTupleDSize(*itup);
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
/*
* Acquire shared split lock so we can compute the target bucket safely
* (see README).
*/
_hash_getlock(rel, 0, HASH_SHARE);
/* Read the metapage */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
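/* We jump back here after completing an unfinished split, to retry the insertion. */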
restart_insert:
/*
* Read the metapage. We don't lock it yet; HashMaxItemSize() will
* examine pd_pagesize_version, but that can't change so we can examine it
* without a lock.
*/
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
metapage = BufferGetPage(metabuf);
metap = HashPageGetMeta(metapage);
/*
* Check whether the item can fit on a hash page at all. (Eventually, we
@ -73,87 +82,154 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
(unsigned long)HashMaxItemSize(metapage)),
errhint("Values larger than a buffer page cannot be indexed.")));
/*
* Compute the target bucket number, and convert to block number.
*/
bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask);
/* Lock the primary bucket page for the target bucket. */
buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE, &usedmetap);
Assert(usedmetap != NULL);
blkno = BUCKET_TO_BLKNO(metap, bucket);
/* remember the primary bucket buffer to release the pin on it at end. */
bucket_buf = buf;
/* release lock on metapage, but keep pin since we'll need it again */
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
/*
* Acquire share lock on target bucket; then we can release split lock.
*/
_hash_getlock(rel, blkno, HASH_SHARE);
_hash_droplock(rel, 0, HASH_SHARE);
/* Fetch the primary bucket page for the bucket */
buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
page = BufferGetPage(buf);
pageopaque = (HashPageOpaque)PageGetSpecialPointer(page);
Assert(pageopaque->hasho_bucket == bucket);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
bucket = pageopaque->hasho_bucket;
/*
* If this bucket is in the process of being split, try to finish the
* split before inserting, because that might create room for the
* insertion to proceed without allocating an additional overflow page.
* It's only interesting to finish the split if we're trying to insert
* into the bucket from which we're removing tuples (the "old" bucket),
* not if we're trying to insert into the bucket into which tuples are
* being moved (the "new" bucket).
*/
if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf)) {
/* release the lock on bucket buffer, before completing the split. */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
_hash_finish_split(rel, metabuf, buf, bucket,
usedmetap->hashm_maxbucket,
usedmetap->hashm_highmask,
usedmetap->hashm_lowmask);
/* release the pin on old and meta buffer. retry for insert. */
_hash_dropbuf(rel, buf);
_hash_dropbuf(rel, metabuf);
goto restart_insert;
}
/* Do the insertion */
while (PageGetFreeSpace(page) < itemsz) {
BlockNumber nextblkno;
/*
* Check if current page has any DEAD tuples. If yes, delete these
* tuples and see if we can get a space for the new item to be
* inserted before moving to the next page in the bucket chain.
*/
if (H_HAS_DEAD_TUPLES(pageopaque)) {
if (IsBufferCleanupOK(buf)) {
_hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);
if (PageGetFreeSpace(page) >= itemsz)
break; /* OK, now we have enough space */
}
}
/*
* no space on this page; check for an overflow page
*/
BlockNumber nextblkno = pageopaque->hasho_nextblkno;
nextblkno = pageopaque->hasho_nextblkno;
if (BlockNumberIsValid(nextblkno)) {
/*
* ovfl page exists; go get it. if it doesn't have room, we'll
* find out next pass through the loop test above.
* find out next pass through the loop test above. we always
* release both the lock and pin if this is an overflow page, but
* only the lock if this is the primary bucket page, since the pin
* on the primary bucket must be retained throughout the scan.
*/
_hash_relbuf(rel, buf);
if (buf != bucket_buf)
_hash_relbuf(rel, buf);
else
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
page = BufferGetPage(buf);
} else {
/*
* we're at the end of the bucket chain and we haven't found a
* page with enough room. allocate a new overflow page.
*
* release our write lock without modifying buffer
*/
_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
/* release our write lock without modifying buffer */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
/* chain to a new overflow page */
buf = _hash_addovflpage(rel, metabuf, buf);
buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
page = BufferGetPage(buf);
/* should fit now, given test above */
Assert(PageGetFreeSpace(page) >= itemsz);
}
pageopaque = (HashPageOpaque)PageGetSpecialPointer(page);
Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
Assert(pageopaque->hasho_bucket == bucket);
}
/* found page with enough space, so add the item here */
(void)_hash_pgaddtup(rel, buf, itemsz, itup);
/* write and release the modified page */
_hash_wrtbuf(rel, buf);
/* We can drop the bucket lock now */
_hash_droplock(rel, blkno, HASH_SHARE);
/*
* Write-lock the metapage so we can increment the tuple count. After
* incrementing it, check to see if it's time for a split.
*/
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
/* Do the update. No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
/* found page with enough space, so add the item here */
itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
MarkBufferDirty(buf);
/* metapage operations */
metap = HashPageGetMeta(metapage);
metap->hashm_ntuples += 1;
/* Make sure this stays in sync with _hash_expandtable() */
do_expand = metap->hashm_ntuples > (double)metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
/* Write out the metapage and drop lock, but keep pin */
_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
MarkBufferDirty(metabuf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
xl_hash_insert xlrec;
XLogRecPtr recptr;
xlrec.offnum = itup_off;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHashInsert);
XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);
PageSetLSN(BufferGetPage(buf), recptr);
PageSetLSN(BufferGetPage(metabuf), recptr);
}
END_CRIT_SECTION();
/* drop lock on metapage, but keep pin */
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
/*
* Release the modified page and ensure to release the pin on primary
* page.
*/
_hash_relbuf(rel, buf);
if (buf != bucket_buf)
_hash_dropbuf(rel, bucket_buf);
/* Attempt to split if a split is needed */
if (do_expand)
@ -192,3 +268,130 @@ OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple
return itup_off;
}
/*
* _hash_pgaddmultitup() -- add a tuple vector to a particular page in the index.
*
* This routine has same requirements for locking and tuple ordering as
* _hash_pgaddtup().
*
* Returns the offset number array at which the tuples were inserted.
*/
void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, OffsetNumber *itup_offsets, uint16 nitups)
{
OffsetNumber itup_off;
Page page;
uint32 hashkey;
int i;
_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
page = BufferGetPage(buf);
for (i = 0; i < nitups; i++) {
Size itemsize;
itemsize = IndexTupleDSize(*itups[i]);
itemsize = MAXALIGN(itemsize);
/* Find where to insert the tuple (preserving page's hashkey ordering) */
hashkey = _hash_get_indextuple_hashkey(itups[i]);
itup_off = _hash_binsearch(page, hashkey);
itup_offsets[i] = itup_off;
if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel));
}
}
/*
* _hash_vacuum_one_page - vacuum just one index page.
*
* Try to remove LP_DEAD items from the given page. We must acquire cleanup
* lock on the page being modified before calling this function.
*/
static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode)
{
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
OffsetNumber offnum;
OffsetNumber maxoff;
Page page = BufferGetPage(buf);
HashPageOpaque pageopaque;
HashMetaPage metap;
/* Scan each tuple in page to see if it is marked as LP_DEAD */
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) {
ItemId itemId = PageGetItemId(page, offnum);
if (ItemIdIsDead(itemId))
deletable[ndeletable++] = offnum;
}
if (ndeletable > 0) {
/*
* Write-lock the meta page so that we can decrement tuple count.
*/
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
PageIndexMultiDelete(page, deletable, ndeletable);
/*
* Mark the page as not containing any LP_DEAD items. This is not
* certainly true (there might be some that have recently been marked,
* but weren't included in our target-item list), but it will almost
* always be true and it doesn't seem worth an additional page scan to
* check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
* anyway.
*/
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
metap = HashPageGetMeta(BufferGetPage(metabuf));
metap->hashm_ntuples -= ndeletable;
MarkBufferDirty(buf);
MarkBufferDirty(metabuf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
xl_hash_vacuum_one_page xlrec;
XLogRecPtr recptr;
xlrec.hnode = hnode;
xlrec.ntuples = ndeletable;
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);
/*
* We need the target-offsets array whether or not we store the
* whole buffer, to allow us to find the latestRemovedXid on a
* standby server.
*/
XLogRegisterData((char *) deletable,
ndeletable * sizeof(OffsetNumber));
XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);
PageSetLSN(BufferGetPage(buf), recptr);
PageSetLSN(BufferGetPage(metabuf), recptr);
}
END_CRIT_SECTION();
/*
* Releasing write lock on meta page as we have updated the tuple
* count.
*/
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,138 +0,0 @@
/* -------------------------------------------------------------------------
*
* hashscan.cpp
* manage scans on hash tables
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/gausskernel/storage/access/hash/hashscan.cpp
*
* -------------------------------------------------------------------------
*/
#include "postgres.h"
#include "knl/knl_variable.h"
#include "access/hash.h"
#include "access/relscan.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "utils/resowner.h"
/*
* We track all of a backend's active scans on hash indexes using a list
* of HashScanListData structs, which are allocated in t_thrd.top_mem_cxt.
* It's okay to use a long-lived context because we rely on the ResourceOwner
* mechanism to clean up unused entries after transaction or subtransaction
* abort. We can't safely keep the entries in the executor's per-query
* context, because that might be already freed before we get a chance to
* clean up the list. (XXX seems like there should be a better way to
* manage this...)
*/
typedef struct HashScanListData {
IndexScanDesc hashsl_scan;
ResourceOwner hashsl_owner;
struct HashScanListData *hashsl_next;
} HashScanListData;
typedef HashScanListData *HashScanList;
/*
* ReleaseResources_hash() --- clean up hash subsystem resources.
*
* This is here because it needs to touch this module's static var HashScans.
*/
void ReleaseResources_hash(void)
{
HashScanList l = NULL;
HashScanList prev = NULL;
HashScanList next = NULL;
/*
* Release all HashScanList items belonging to the current ResourceOwner.
* Note that we do not release the underlying IndexScanDesc; that's in
* executor memory and will go away on its own (in fact quite possibly has
* gone away already, so we mustn't try to touch it here).
*
* Note: this should be a no-op during normal query shutdown. However, in
* an abort situation ExecutorEnd is not called and so there may be open
* index scans to clean up.
*/
prev = NULL;
for (l = u_sess->exec_cxt.HashScans; l != NULL; l = next) {
next = l->hashsl_next;
if (l->hashsl_owner == t_thrd.utils_cxt.CurrentResourceOwner) {
if (prev == NULL)
u_sess->exec_cxt.HashScans = next;
else
prev->hashsl_next = next;
pfree(l);
/* prev does not change */
} else
prev = l;
}
}
/*
* _hash_regscan() -- register a new scan.
*/
void _hash_regscan(IndexScanDesc scan)
{
HashScanList new_el;
new_el = (HashScanList)MemoryContextAlloc(
SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE), sizeof(HashScanListData));
new_el->hashsl_scan = scan;
new_el->hashsl_owner = t_thrd.utils_cxt.CurrentResourceOwner;
new_el->hashsl_next = u_sess->exec_cxt.HashScans;
u_sess->exec_cxt.HashScans = new_el;
}
/*
* _hash_dropscan() -- drop a scan from the scan list
*/
void _hash_dropscan(IndexScanDesc scan)
{
HashScanList chk = NULL;
HashScanList last = NULL;
last = NULL;
for (chk = u_sess->exec_cxt.HashScans; chk != NULL && chk->hashsl_scan != scan; chk = chk->hashsl_next)
last = chk;
if (chk == NULL)
ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("hash scan list trashed")));
if (last == NULL)
u_sess->exec_cxt.HashScans = chk->hashsl_next;
else
last->hashsl_next = chk->hashsl_next;
pfree(chk);
}
/*
* Is there an active scan in this bucket?
*/
bool _hash_has_active_scan(Relation rel, Bucket bucket)
{
Oid relid = RelationGetRelid(rel);
HashScanList l = NULL;
for (l = u_sess->exec_cxt.HashScans; l != NULL; l = l->hashsl_next) {
if (relid == l->hashsl_scan->indexRelation->rd_id) {
HashScanOpaque so = (HashScanOpaque)l->hashsl_scan->opaque;
if (so->hashso_bucket_valid && so->hashso_bucket == bucket)
return true;
}
}
return false;
}

View File

@ -3,8 +3,8 @@
* hashsearch.cpp
* search code for openGauss hash tables
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@ -64,40 +64,131 @@ bool _hash_next(IndexScanDesc scan, ScanDirection dir)
}
/*
* Advance to next page in a bucket, if any.
* Advance to next page in a bucket, if any. If we are scanning the bucket
* being populated during split operation then this function advances to the
* bucket being split after the last bucket page of bucket being populated.
*/
static void _hash_readnext(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
static void _hash_readnext(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep)
{
BlockNumber blkno;
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque)scan->opaque;
bool block_found = false;
blkno = (*opaquep)->hasho_nextblkno;
_hash_relbuf(rel, *bufp);
/*
* Retain the pin on primary bucket page till the end of scan. Refer the
* comments in _hash_first to know the reason of retaining pin.
*/
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
else
_hash_relbuf(rel, *bufp);
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
if (BlockNumberIsValid(blkno)) {
*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
block_found = true;
} else if (so->hashso_buc_populated && !so->hashso_buc_split) {
/*
* end of bucket, scan bucket being split if there was a split in
* progress at the start of scan.
*/
*bufp = so->hashso_split_bucket_buf;
/*
* buffer for bucket being split must be valid as we acquire the pin
* on it before the start of scan and retain it till end of scan.
*/
Assert(BufferIsValid(*bufp));
LockBuffer(*bufp, BUFFER_LOCK_SHARE);
/*
* setting hashso_buc_split to true indicates that we are scanning
* bucket being split.
*/
so->hashso_buc_split = true;
block_found = true;
}
if (block_found) {
*pagep = BufferGetPage(*bufp);
*opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
}
}
/*
* Advance to previous page in a bucket, if any.
* Advance to previous page in a bucket, if any. If the current scan has
* started during split operation then this function advances to bucket
* being populated after the first bucket page of bucket being split.
*/
static void _hash_readprev(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
static void _hash_readprev(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep)
{
BlockNumber blkno;
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque) scan->opaque;
bool haveprevblk;
blkno = (*opaquep)->hasho_prevblkno;
_hash_relbuf(rel, *bufp);
/*
* Retain the pin on primary bucket page till the end of scan. Refer the
* comments in _hash_first to know the reason of retaining pin.
*/
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) {
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
haveprevblk = false;
} else {
_hash_relbuf(rel, *bufp);
haveprevblk = true;
}
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
if (BlockNumberIsValid(blkno)) {
*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
if (haveprevblk) {
Assert(BlockNumberIsValid(blkno));
*bufp = _hash_getbuf(rel, blkno, HASH_READ,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
*pagep = BufferGetPage(*bufp);
*opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
/*
* We always maintain the pin on bucket page for whole scan operation,
* so releasing the additional pin we have acquired here.
*/
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
_hash_dropbuf(rel, *bufp);
} else if (so->hashso_buc_populated && so->hashso_buc_split) {
/*
* end of bucket, scan bucket being populated if there was a split in
* progress at the start of scan.
*/
*bufp = so->hashso_bucket_buf;
/*
* buffer for bucket being populated must be valid as we acquire the
* pin on it before the start of scan and retain it till end of scan.
*/
Assert(BufferIsValid(*bufp));
LockBuffer(*bufp, BUFFER_LOCK_SHARE);
*pagep = BufferGetPage(*bufp);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
/* move to the end of bucket chain */
while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
_hash_readnext(scan, bufp, pagep, opaquep);
/*
* setting hashso_buc_split to false indicates that we are scanning
* bucket being populated.
*/
so->hashso_buc_split = false;
}
}
@ -117,12 +208,9 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
ScanKey cur;
uint32 hashkey;
Bucket bucket;
BlockNumber blkno;
Buffer buf;
Buffer metabuf;
Page page;
HashPageOpaque opaque;
HashMetaPage metap;
IndexTuple itup;
ItemPointer current;
OffsetNumber offnum;
@ -174,48 +262,71 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
so->hashso_sk_hash = hashkey;
/*
* Acquire shared split lock so we can compute the target bucket safely
* (see README).
*/
_hash_getlock(rel, 0, HASH_SHARE);
/* Read the metapage */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
/*
* Compute the target bucket number, and convert to block number.
*/
bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask);
blkno = BUCKET_TO_BLKNO(metap, bucket);
/* done with the metapage */
_hash_relbuf(rel, metabuf);
/*
* Acquire share lock on target bucket; then we can release split lock.
*/
_hash_getlock(rel, blkno, HASH_SHARE);
_hash_droplock(rel, 0, HASH_SHARE);
/* Update scan opaque state to show we have lock on the bucket */
so->hashso_bucket = bucket;
so->hashso_bucket_valid = true;
so->hashso_bucket_blkno = blkno;
/* Fetch the primary bucket page for the bucket */
buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
page = BufferGetPage(buf);
opaque = (HashPageOpaque)PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == bucket);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
bucket = opaque->hasho_bucket;
so->hashso_bucket_buf = buf;
/*
* If a bucket split is in progress, then while scanning the bucket being
* populated, we need to skip tuples that were copied from bucket being
* split. We also need to maintain a pin on the bucket being split to
* ensure that split-cleanup work done by vacuum doesn't remove tuples
* from it till this scan is done. We need to maintain a pin on the
* bucket being populated to ensure that vacuum doesn't squeeze that
* bucket till this scan is complete; otherwise, the ordering of tuples
* can't be maintained during forward and backward scans. Here, we have
* to be cautious about locking order: first, acquire the lock on bucket
* being split; then, release the lock on it but not the pin; then,
* acquire a lock on bucket being populated and again re-verify whether
* the bucket split is still in progress. Acquiring the lock on bucket
* being split first ensures that the vacuum waits for this scan to
* finish.
*/
if (H_BUCKET_BEING_POPULATED(opaque)) {
BlockNumber old_blkno;
Buffer old_buf;
old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);
/*
* release the lock on new bucket and re-acquire it after acquiring
* the lock on old bucket.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);
/*
* remember the split bucket buffer so as to use it later for
* scanning.
*/
so->hashso_split_bucket_buf = old_buf;
LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
LockBuffer(buf, BUFFER_LOCK_SHARE);
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == bucket);
if (H_BUCKET_BEING_POPULATED(opaque)) {
so->hashso_buc_populated = true;
} else {
_hash_dropbuf(rel, so->hashso_split_bucket_buf);
so->hashso_split_bucket_buf = InvalidBuffer;
}
}
/* If a backwards scan is requested, move to the end of the chain */
if (ScanDirectionIsBackward(dir)) {
while (BlockNumberIsValid(opaque->hasho_nextblkno))
_hash_readnext(rel, &buf, &page, &opaque);
/*
* Backward scans that start during split needs to start from end of
* bucket being split.
*/
while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
(so->hashso_buc_populated && !so->hashso_buc_split))
_hash_readnext(scan, &buf, &page, &opaque);
}
/* Now find the first tuple satisfying the qualification */
@ -239,6 +350,12 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
* false. Else, return true and set the hashso_curpos for the
* scan to the right thing.
*
* Here we need to ensure that if the scan has started during split, then
* skip the tuples that are moved by split while scanning bucket being
* populated and then scan the bucket being split to cover all such
* tuples. This is done to ensure that we don't miss tuples in the scans
* that are started during split.
*
* 'bufP' points to the current buffer, which is pinned and read-locked.
* On success exit, we have pin and read-lock on whichever page
* contains the right item; on failure, we have released all buffers.
@ -283,9 +400,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
do {
switch (dir) {
case ForwardScanDirection:
if (offnum != InvalidOffsetNumber)
if (offnum != InvalidOffsetNumber) {
offnum = OffsetNumberNext(offnum); /* move forward */
else {
} else {
/* new page, locate starting position by binary search */
offnum = _hash_binsearch(page, so->hashso_sk_hash);
}
@ -298,14 +415,27 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
if (offnum <= maxoff) {
Assert(offnum >= FirstOffsetNumber);
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum));
/*
* skip the tuples that are moved by split operation
* for the scan that has started when split was in
* progress
*/
if (so->hashso_buc_populated && !so->hashso_buc_split &&
(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) {
offnum = OffsetNumberNext(offnum); /* move forward */
continue;
}
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
break; /* yes, so exit for-loop */
}
/* Before leaving current page, deal with any killed items */
if (so->numKilled > 0)
_hash_kill_items(scan);
/*
* ran off the end of this page, try the next
*/
_hash_readnext(rel, &buf, &page, &opaque);
_hash_readnext(scan, &buf, &page, &opaque);
if (BufferIsValid(buf)) {
maxoff = PageGetMaxOffsetNumber(page);
offnum = _hash_binsearch(page, so->hashso_sk_hash);
@ -318,9 +448,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
break;
case BackwardScanDirection:
if (offnum != InvalidOffsetNumber)
if (offnum != InvalidOffsetNumber) {
offnum = OffsetNumberPrev(offnum); /* move back */
else {
} else {
/* new page, locate starting position by binary search */
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
}
@ -333,14 +463,26 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
if (offnum >= FirstOffsetNumber) {
Assert(offnum <= maxoff);
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum));
/*
* skip the tuples that are moved by split operation
* for the scan that has started when split was in
* progress
*/
if (so->hashso_buc_populated && !so->hashso_buc_split &&
(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) {
offnum = OffsetNumberPrev(offnum); /* move back */
continue;
}
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
break; /* yes, so exit for-loop */
}
/* Before leaving current page, deal with any killed items */
if (so->numKilled > 0)
_hash_kill_items(scan);
/*
* ran off the end of this page, try the next
*/
_hash_readprev(rel, &buf, &page, &opaque);
_hash_readprev(scan, &buf, &page, &opaque);
if (BufferIsValid(buf)) {
maxoff = PageGetMaxOffsetNumber(page);
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
@ -360,9 +502,16 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
}
if (itup == NULL) {
/* we ran off the end of the bucket without finding a match */
/*
* We ran off the end of the bucket without finding a match.
* Release the pin on bucket buffers. Normally, such pins are
* released at end of scan, however scrolling cursors can
* reacquire the bucket lock and pin in the same scan multiple
* times.
*/
*bufP = so->hashso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(current);
_hash_dropscanbuf(rel, so);
return false;
}

View File

@ -14,8 +14,8 @@
* plenty of locality of access.
*
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@ -37,15 +37,23 @@
struct HSpool {
Tuplesortstate *sortstate; /* state data for tuplesort.c */
Relation index;
/*
* We sort the hash keys based on the buckets they belong to. Below masks
* are used in _hash_hashkey2bucket to determine the bucket of a given hash
* key.
*/
uint32 high_mask;
uint32 low_mask;
uint32 max_buckets;
};
/*
* create and initialize a spool structure
*/
HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo)
HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void *meminfo)
{
HSpool *hspool = (HSpool *)palloc0(sizeof(HSpool));
uint32 hash_mask;
UtilityDesc *desc = (UtilityDesc *)meminfo;
int work_mem = (desc->query_mem[0] > 0) ? desc->query_mem[0] : u_sess->attr.attr_memory.maintenance_work_mem;
int max_mem = (desc->query_mem[1] > 0) ? desc->query_mem[1] : 0;
@ -57,18 +65,26 @@ HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo)
* num_buckets buckets in the index, the appropriate mask can be computed
* as follows.
*
* Note: at present, the passed-in num_buckets is always a power of 2, so
* we could just compute num_buckets - 1. We prefer not to assume that
* here, though.
* NOTE : This hash mask calculation should be in sync with similar
* calculation in _hash_init_metabuffer.
*/
hash_mask = (((uint32)1) << _hash_log2(num_buckets)) - 1;
hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1;
hspool->low_mask = (hspool->high_mask >> 1);
hspool->max_buckets = num_buckets - 1;
/*
* We size the sort area as maintenance_work_mem rather than work_mem to
* speed index creation. This should be OK since a single backend can't
* run multiple index creations in parallel.
*/
hspool->sortstate = tuplesort_begin_index_hash(index, hash_mask, work_mem, false, max_mem);
hspool->sortstate = tuplesort_begin_index_hash(heap,
index,
hspool->high_mask,
hspool->low_mask,
hspool->max_buckets,
work_mem,
false,
max_mem);
return hspool;
}
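As a rough check of the mask setup above, the following standalone sketch (not part of the patch) mirrors the conventional _hash_hashkey2bucket rule it feeds: mask the hash key with high_mask and fall back to low_mask when the result exceeds max_buckets.
/*
 * Minimal sketch, assuming the usual bucket-selection rule. For
 * num_buckets = 1000, _hash_log2(1001) = 10, so high_mask = 1023,
 * low_mask = 511 and max_buckets = 999.
 */
#include <stdint.h>
#include <stdio.h>
static uint32_t sketch_hashkey2bucket(uint32_t hashkey, uint32_t max_buckets,
                                      uint32_t high_mask, uint32_t low_mask)
{
    uint32_t bucket = hashkey & high_mask;
    if (bucket > max_buckets)
        bucket &= low_mask;     /* fall back to the lower table half */
    return bucket;
}
int main(void)
{
    /* 0x3FF & 1023 = 1023 > 999, so the key lands in bucket 1023 & 511 = 511 */
    printf("%u\n", (unsigned) sketch_hashkey2bucket(0x3FFu, 999, 1023, 511));
    return 0;
}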
@ -94,7 +110,7 @@ void _h_spool(HSpool *hspool, ItemPointer self, Datum *values, const bool *isnul
* given a spool loaded by successive calls to _h_spool,
* create an entire index.
*/
void _h_indexbuild(HSpool *hspool)
void _h_indexbuild(HSpool *hspool, Relation heapRel)
{
IndexTuple itup;
bool should_free = false;
@ -102,7 +118,7 @@ void _h_indexbuild(HSpool *hspool)
tuplesort_performsort(hspool->sortstate);
while ((itup = tuplesort_getindextuple(hspool->sortstate, true, &should_free)) != NULL) {
_hash_doinsert(hspool->index, itup);
_hash_doinsert(hspool->index, itup, heapRel);
if (should_free)
pfree(itup);
}

View File

@ -3,8 +3,8 @@
* hashutil.cpp
* Utility code for openGauss hash implementation.
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@ -22,7 +22,9 @@
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "storage/buf/buf_internals.h"
#define CALC_NEW_BUCKET(old_bucket, lowmask) ((old_bucket) | ((lowmask) + 1))
/*
* _hash_checkqual -- does the index tuple satisfy the scan conditions?
*/
@ -133,6 +135,70 @@ uint32 _hash_log2(uint32 num)
return i;
}
/*
* _hash_spareindex -- returns spare index / global splitpoint phase of the bucket
*/
uint32 _hash_spareindex(uint32 num_bucket)
{
uint32 splitpoint_group;
uint32 splitpoint_phases;
splitpoint_group = _hash_log2(num_bucket);
if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
return splitpoint_group;
/* account for single-phase groups */
splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
/* account for multi-phase groups before splitpoint_group */
splitpoint_phases +=
((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
HASH_SPLITPOINT_PHASE_BITS);
/* account for phases within current group */
splitpoint_phases +=
(((num_bucket - 1) >>
(splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */
return splitpoint_phases;
}
/*
* _hash_get_totalbuckets -- returns total number of buckets allocated till
* the given splitpoint phase.
*/
uint32 _hash_get_totalbuckets(uint32 splitpoint_phase)
{
uint32 splitpoint_group;
uint32 total_buckets;
uint32 phases_within_splitpoint_group;
if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
return (1 << splitpoint_phase);
/* get splitpoint's group */
splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
splitpoint_group +=
((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
HASH_SPLITPOINT_PHASE_BITS);
/* account for buckets before splitpoint_group */
total_buckets = (1 << (splitpoint_group - 1));
/* account for buckets within splitpoint_group */
phases_within_splitpoint_group =
(((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
total_buckets +=
(((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
phases_within_splitpoint_group);
return total_buckets;
}
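The phase arithmetic above can be sanity-checked with a standalone sketch (not part of the patch). It assumes the stock PostgreSQL splitpoint constants (HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10, HASH_SPLITPOINT_PHASE_BITS = 2, HASH_SPLITPOINT_PHASE_MASK = 3): below that threshold each phase doubles the bucket count, and from group 10 on each group is filled in four quarter-sized phases.
/* Sketch of _hash_get_totalbuckets with the assumed constants. */
#include <stdio.h>
static unsigned total_buckets(unsigned phase)
{
    const unsigned one_phase_groups = 10;   /* assumed HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE */
    const unsigned phase_bits = 2;          /* assumed HASH_SPLITPOINT_PHASE_BITS */
    const unsigned phase_mask = 3;          /* assumed HASH_SPLITPOINT_PHASE_MASK */
    if (phase < one_phase_groups)
        return 1u << phase;
    unsigned group = one_phase_groups + ((phase - one_phase_groups) >> phase_bits);
    unsigned buckets = 1u << (group - 1);                         /* buckets before this group */
    unsigned phases_done = ((phase - one_phase_groups) & phase_mask) + 1;
    buckets += ((1u << (group - 1)) >> phase_bits) * phases_done; /* quarter-sized steps */
    return buckets;
}
int main(void)
{
    /* phases 9..13 yield 512, 640, 768, 896, 1024 buckets */
    for (unsigned p = 9; p <= 13; p++)
        printf("phase %u -> %u buckets\n", p, total_buckets(p));
    return 0;
}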
/*
* _hash_checkpage -- sanity checks on the format of all hash pages
*
@ -216,25 +282,36 @@ uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
}
/*
* _hash_form_tuple - form an index tuple containing hash code only
* _hash_convert_tuple - convert raw index data to hash key
*
* Inputs: values and isnull arrays for the user data column(s)
* Outputs: values and isnull arrays for the index tuple, suitable for
* passing to index_form_tuple().
*
* Returns true if successful, false if not (because there are null values).
* On a false result, the given data need not be indexed.
*
* Note: callers know that the index-column arrays are always of length 1.
* In principle, there could be more than one input column, though we do not
* currently support that.
*/
IndexTuple _hash_form_tuple(Relation index, Datum *values, const bool *isnull)
bool _hash_convert_tuple(Relation index,
Datum *user_values, const bool *user_isnull,
Datum *index_values, bool *index_isnull)
{
IndexTuple itup;
uint32 hashkey;
Datum hashkeydatum;
TupleDesc hashdesc;
if (isnull[0]) {
hashkeydatum = (Datum)0;
} else {
hashkey = _hash_datum2hashkey(index, values[0]);
hashkeydatum = UInt32GetDatum(hashkey);
}
hashdesc = RelationGetDescr(index);
Assert(hashdesc->natts == 1);
itup = index_form_tuple(hashdesc, &hashkeydatum, isnull);
return itup;
/*
* We do not insert null values into hash indexes. This is okay because
* the only supported search operator is '=', and we assume it is strict.
*/
if (user_isnull[0])
return false;
hashkey = _hash_datum2hashkey(index, user_values[0]);
index_values[0] = UInt32GetDatum(hashkey);
index_isnull[0] = false;
return true;
}
/*
@ -312,3 +389,154 @@ OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value)
return lower;
}
/*
* _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
* from which current (new) bucket is being split.
*/
BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
Bucket old_bucket;
uint32 mask;
Buffer metabuf;
HashMetaPage metap;
BlockNumber blkno;
/*
* To get the old bucket from the current bucket, we need a mask to modulo
* into lower half of table. This mask is stored in meta page as
* hashm_lowmask, but here we can't rely on the same, because we need a
* value of lowmask that was prevalent at the time when bucket split was
* started. Masking the most significant bit of new bucket would give us
* old bucket.
*/
mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
old_bucket = new_bucket & mask;
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
blkno = BUCKET_TO_BLKNO(metap, old_bucket);
_hash_relbuf(rel, metabuf);
return blkno;
}
/*
* _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
* that will be generated after split from old bucket.
*
* This is used to find the new bucket from old bucket based on current table
* half. It is mainly required to finish the incomplete splits where we are
* sure that not more than one bucket could have split in progress from old
* bucket.
*/
BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
Bucket new_bucket;
Buffer metabuf;
HashMetaPage metap;
BlockNumber blkno;
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
metap->hashm_lowmask,
metap->hashm_maxbucket);
blkno = BUCKET_TO_BLKNO(metap, new_bucket);
_hash_relbuf(rel, metabuf);
return blkno;
}
/*
* _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
* generated after split from current (old) bucket.
*
* This is used to find the new bucket from old bucket. New bucket can be
* obtained by OR'ing old bucket with most significant bit of current table
* half (lowmask passed in this function can be used to identify msb of
* current table half). There could be multiple buckets that could have
* been split from current bucket. We need the first such bucket that exists.
* Caller must ensure that no more than one split has happened from old
* bucket.
*/
Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
uint32 lowmask, uint32 maxbucket)
{
Bucket new_bucket;
new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
if (new_bucket > maxbucket) {
lowmask = lowmask >> 1;
new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
}
return new_bucket;
}
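A small numeric example of the mapping above (standalone sketch, not part of the patch): with lowmask = 7 the candidate is old_bucket | 8; if that bucket does not exist yet, the previous table half (lowmask = 3) is used instead.
/* Sketch of _hash_get_newbucket_from_oldbucket's arithmetic. */
#include <stdio.h>
#define CALC_NEW_BUCKET(old_bucket, lowmask) ((old_bucket) | ((lowmask) + 1))
int main(void)
{
    unsigned old_bucket = 3;
    unsigned lowmask = 7;     /* current table half covers buckets 8..15 */
    unsigned maxbucket = 9;   /* but only buckets 0..9 exist so far */
    unsigned new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);   /* 3 | 8 = 11 */
    if (new_bucket > maxbucket) {
        lowmask >>= 1;                                            /* previous half */
        new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);        /* 3 | 4 = 7  */
    }
    printf("new bucket split from old bucket %u is %u\n", old_bucket, new_bucket);
    return 0;
}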
/*
* _hash_kill_items - set LP_DEAD state for items an indexscan caller has
* told us were killed.
*
* scan->opaque, referenced locally through so, contains information about the
* current page and killed tuples thereon (generally, this should only be
* called if so->numKilled > 0).
*
* We match items by heap TID before assuming they are the right ones to
* delete.
*/
void _hash_kill_items(IndexScanDesc scan)
{
HashScanOpaque so = (HashScanOpaque) scan->opaque;
Page page;
HashPageOpaque opaque;
OffsetNumber offnum;
OffsetNumber maxoff;
int numKilled = so->numKilled;
int i;
bool killedsomething = false;
Assert(so->numKilled > 0);
Assert(so->killedItems != NULL);
/*
* Always reset the scan state, so we don't look for same items on other
* pages.
*/
so->numKilled = 0;
page = BufferGetPage(so->hashso_curbuf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
for (i = 0; i < numKilled; i++) {
offnum = so->killedItems[i].indexOffset;
while (offnum <= maxoff) {
ItemId iid = PageGetItemId(page, offnum);
IndexTuple ituple = (IndexTuple)PageGetItem(page, iid);
if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid)) {
/* found the item */
ItemIdMarkDead(iid);
killedsomething = true;
break; /* out of inner search loop */
}
offnum = OffsetNumberNext(offnum);
}
}
/*
* Since this can be redone later if needed, mark as dirty hint. Whenever
* we mark anything LP_DEAD, we also set the page's
* LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
*/
if (killedsomething) {
opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
MarkBufferDirtyHint(so->hashso_curbuf, true);
}
}

File diff suppressed because it is too large Load Diff

View File

@ -196,6 +196,9 @@ bool XLogBlockRefreshRedoBufferInfo(XLogBlockHead *blockhead, RedoBufferInfo *bu
if (bufferinfo->blockinfo.rnode.relNode != XLogBlockHeadGetRelNode(blockhead)) {
return false;
}
if (bufferinfo->blockinfo.rnode.opt != XLogBlockHeadGetCompressOpt(blockhead)) {
return false;
}
if (bufferinfo->blockinfo.forknum != XLogBlockHeadGetForkNum(blockhead)) {
return false;
}
@ -219,6 +222,7 @@ void XLogBlockInitRedoBlockInfo(XLogBlockHead *blockhead, RedoBufferTag *blockin
blockinfo->rnode.dbNode = XLogBlockHeadGetDbNode(blockhead);
blockinfo->rnode.relNode = XLogBlockHeadGetRelNode(blockhead);
blockinfo->rnode.bucketNode = XLogBlockHeadGetBucketId(blockhead);
blockinfo->rnode.opt = XLogBlockHeadGetCompressOpt(blockhead);
blockinfo->forknum = XLogBlockHeadGetForkNum(blockhead);
blockinfo->blkno = XLogBlockHeadGetBlockNum(blockhead);
blockinfo->pblk = XLogBlockHeadGetPhysicalBlock(blockhead);
@ -305,7 +309,7 @@ void XLogRecSetBlockCommonState(XLogReaderState *record, XLogBlockParseEnum bloc
blockparse->blockhead.spcNode = filenode.rnode.node.spcNode;
blockparse->blockhead.dbNode = filenode.rnode.node.dbNode;
blockparse->blockhead.bucketNode = filenode.rnode.node.bucketNode;
blockparse->blockhead.opt = filenode.rnode.node.opt;
blockparse->blockhead.blkno = filenode.segno;
blockparse->blockhead.forknum = filenode.forknumber;
@ -1288,6 +1292,8 @@ void XLogBlockDataCommonRedo(XLogBlockHead *blockhead, void *blockrecbody, RedoB
break;
case RM_UBTREE2_ID:
UBTree2RedoDataBlock(blockhead, blockdatarec, bufferinfo);
break;
case RM_HASH_ID:
HashRedoDataBlock(blockhead, blockdatarec, bufferinfo);
break;
case RM_XLOG_ID:
xlog_redo_data_block(blockhead, blockdatarec, bufferinfo);
@ -1417,7 +1423,7 @@ void XLogBlockDdlCommonRedo(XLogBlockHead *blockhead, void *blockrecbody, RedoBu
rnode.dbNode = blockhead->dbNode;
rnode.relNode = blockhead->relNode;
rnode.bucketNode = blockhead->bucketNode;
rnode.opt = blockhead->opt;
switch (blockddlrec->blockddltype) {
case BLOCK_DDL_CREATE_RELNODE:
smgr_redo_create(rnode, blockhead->forknum, blockddlrec->mainData);
@ -1486,7 +1492,7 @@ void XLogBlockSegDdlDoRealAction(XLogBlockHead* blockhead, void* blockrecbody, R
rnode.dbNode = blockhead->dbNode;
rnode.relNode = blockhead->relNode;
rnode.bucketNode = blockhead->bucketNode;
rnode.opt = blockhead->opt;
switch (segddlrec->blockddlrec.blockddltype) {
case BLOCK_DDL_TRUNCATE_RELNODE:
xlog_block_segpage_redo_truncate(rnode, blockhead, segddlrec);
@ -1511,7 +1517,7 @@ void XLogBlockDdlDoSmgrAction(XLogBlockHead *blockhead, void *blockrecbody, Redo
rnode.dbNode = blockhead->dbNode;
rnode.relNode = blockhead->relNode;
rnode.bucketNode = blockhead->bucketNode;
rnode.opt = blockhead->opt;
switch (blockddlrec->blockddltype) {
case BLOCK_DDL_CREATE_RELNODE:
smgr_redo_create(rnode, blockhead->forknum, blockddlrec->mainData);

View File

@ -16,7 +16,8 @@
#include "postgres.h"
#include "knl/knl_variable.h"
#include "access/hash.h"
#include "access/rmgr.h"
#include "access/hash_xlog.h"
const char* hash_type_name(uint8 subtype)
{
@ -25,5 +26,150 @@ const char* hash_type_name(uint8 subtype)
void hash_desc(StringInfo buf, XLogReaderState *record)
{
/* nothing to do */
char *rec = XLogRecGetData(record);
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
switch (info) {
case XLOG_HASH_INIT_META_PAGE:
{
xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec;
appendStringInfo(buf, "num_tuples %g, fillfactor %d",
xlrec->num_tuples, xlrec->ffactor);
break;
}
case XLOG_HASH_INIT_BITMAP_PAGE:
{
xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec;
appendStringInfo(buf, "bmsize %d", xlrec->bmsize);
break;
}
case XLOG_HASH_INSERT:
{
xl_hash_insert *xlrec = (xl_hash_insert *) rec;
appendStringInfo(buf, "off %u", xlrec->offnum);
break;
}
case XLOG_HASH_ADD_OVFL_PAGE:
{
xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) rec;
appendStringInfo(buf, "bmsize %d, bmpage_found %c",
xlrec->bmsize, (xlrec->bmpage_found) ? 'T' : 'F');
break;
}
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
{
xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec;
appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c",
xlrec->new_bucket,
(xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F',
(xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F');
break;
}
case XLOG_HASH_SPLIT_COMPLETE:
{
xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec;
appendStringInfo(buf, "old_bucket_flag %u, new_bucket_flag %u",
xlrec->old_bucket_flag, xlrec->new_bucket_flag);
break;
}
case XLOG_HASH_MOVE_PAGE_CONTENTS:
{
xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec;
appendStringInfo(buf, "ntups %d, is_primary %c",
xlrec->ntups,
xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
break;
}
case XLOG_HASH_SQUEEZE_PAGE:
{
xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec;
appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c",
xlrec->prevblkno,
xlrec->nextblkno,
xlrec->ntups,
xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
break;
}
case XLOG_HASH_DELETE:
{
xl_hash_delete *xlrec = (xl_hash_delete *) rec;
appendStringInfo(buf, "clear_dead_marking %c, is_primary %c",
xlrec->clear_dead_marking ? 'T' : 'F',
xlrec->is_primary_bucket_page ? 'T' : 'F');
break;
}
case XLOG_HASH_UPDATE_META_PAGE:
{
xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec;
appendStringInfo(buf, "ntuples %g",
xlrec->ntuples);
break;
}
case XLOG_HASH_VACUUM_ONE_PAGE:
{
xl_hash_vacuum_one_page *xlrec = (xl_hash_vacuum_one_page *) rec;
appendStringInfo(buf, "ntuples %d",
xlrec->ntuples);
break;
}
}
}
const char *hash_identify(uint8 info)
{
const char *id = NULL;
switch (info & ~XLR_INFO_MASK) {
case XLOG_HASH_INIT_META_PAGE:
id = "INIT_META_PAGE";
break;
case XLOG_HASH_INIT_BITMAP_PAGE:
id = "INIT_BITMAP_PAGE";
break;
case XLOG_HASH_INSERT:
id = "INSERT";
break;
case XLOG_HASH_ADD_OVFL_PAGE:
id = "ADD_OVFL_PAGE";
break;
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
id = "SPLIT_ALLOCATE_PAGE";
break;
case XLOG_HASH_SPLIT_PAGE:
id = "SPLIT_PAGE";
break;
case XLOG_HASH_SPLIT_COMPLETE:
id = "SPLIT_COMPLETE";
break;
case XLOG_HASH_MOVE_PAGE_CONTENTS:
id = "MOVE_PAGE_CONTENTS";
break;
case XLOG_HASH_SQUEEZE_PAGE:
id = "SQUEEZE_PAGE";
break;
case XLOG_HASH_DELETE:
id = "DELETE";
break;
case XLOG_HASH_SPLIT_CLEANUP:
id = "SPLIT_CLEANUP";
break;
case XLOG_HASH_UPDATE_META_PAGE:
id = "UPDATE_META_PAGE";
break;
case XLOG_HASH_VACUUM_ONE_PAGE:
id = "VACUUM_ONE_PAGE";
}
return id;
}

View File

@ -325,7 +325,11 @@ static void dw_prepare_page(dw_batch_t *batch, uint16 page_num, uint16 page_id,
if (t_thrd.proc->workingVersionNum < DW_SUPPORT_SINGLE_FLUSH_VERSION) {
page_num = page_num | IS_HASH_BKT_SEGPAGE_MASK;
}
batch->buftag_ver = HASHBUCKET_TAG;
if (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION) {
batch->buftag_ver = HASHBUCKET_TAG;
} else {
batch->buftag_ver = PAGE_COMPRESS_TAG;
}
} else {
batch->buftag_ver = ORIGIN_TAG;
}
@ -349,7 +353,7 @@ void dw_prepare_file_head(char *file_head, uint16 start, uint16 dwn, int32 dw_ve
curr_head->head.page_id = 0;
curr_head->head.dwn = dwn;
curr_head->start = start;
curr_head->buftag_version = HASHBUCKET_TAG;
curr_head->buftag_version = PAGE_COMPRESS_TAG;
curr_head->tail.dwn = dwn;
curr_head->dw_version = dw_version;
dw_calc_file_head_checksum(curr_head);
@ -477,15 +481,21 @@ static void dw_recover_pages(T1 *batch, T2 *buf_tag, PageHeader data_page, BufTa
for (i = 0; i < GET_REL_PGAENUM(batch->page_num); i++) {
buf_tag = &batch->buf_tag[i];
relnode.dbNode = buf_tag->rnode.dbNode;
relnode.spcNode = buf_tag->rnode.spcNode;
relnode.relNode = buf_tag->rnode.relNode;
if (tag_ver == HASHBUCKET_TAG) {
relnode.dbNode = buf_tag->rnode.dbNode;
relnode.spcNode = buf_tag->rnode.spcNode;
relnode.relNode = buf_tag->rnode.relNode;
relnode.opt = 0;
// 2 bytes are used for bucketNode.
relnode.bucketNode = (int2)((BufferTagSecondVer *)buf_tag)->rnode.bucketNode;
} else if (tag_ver == PAGE_COMPRESS_TAG) {
relnode.opt = ((BufferTag *)buf_tag)->rnode.opt;
relnode.bucketNode = ((BufferTag *)buf_tag)->rnode.bucketNode;
} else {
relnode.dbNode = buf_tag->rnode.dbNode;
relnode.spcNode = buf_tag->rnode.spcNode;
relnode.relNode = buf_tag->rnode.relNode;
relnode.opt = 0;
relnode.bucketNode = InvalidBktId;
}
dw_page = (PageHeader)((char *)batch + (i + 1) * BLCKSZ);
@ -891,7 +901,10 @@ static void dw_recover_partial_write_batch(dw_batch_file_context *cxt)
if (t_thrd.proc->workingVersionNum < DW_SUPPORT_SINGLE_FLUSH_VERSION) {
bool is_hashbucket = ((curr_head->page_num & IS_HASH_BKT_SEGPAGE_MASK) != 0);
curr_head->buftag_ver = is_hashbucket ? HASHBUCKET_TAG : ORIGIN_TAG;
curr_head->buftag_ver = is_hashbucket ?
(t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION ? HASHBUCKET_TAG
: PAGE_COMPRESS_TAG)
: ORIGIN_TAG;
}
remain_pages = read_asst.buf_end - read_asst.buf_start;
@ -2216,9 +2229,9 @@ int buftag_compare(const void *pa, const void *pb)
void dw_log_recovery_page(int elevel, const char *state, BufferTag buf_tag)
{
ereport(elevel, (errmodule(MOD_DW),
errmsg("[single flush] recovery, %s: buf_tag[rel %u/%u/%u blk %u fork %d]",
errmsg("[single flush] recovery, %s: buf_tag[rel %u/%u/%u blk %u fork %d], compress: %u",
state, buf_tag.rnode.spcNode, buf_tag.rnode.dbNode, buf_tag.rnode.relNode, buf_tag.blockNum,
buf_tag.forkNum)));
buf_tag.forkNum, buf_tag.rnode.opt)));
}
bool dw_read_data_page(BufferTag buf_tag, SMgrRelation reln, char* data_block)

View File

@ -53,6 +53,7 @@ static inline void PRXLogRecGetBlockTag(XLogRecParseState *recordBlockState, Rel
rnode->relNode = blockparse->blockhead.relNode;
rnode->spcNode = blockparse->blockhead.spcNode;
rnode->bucketNode = blockparse->blockhead.bucketNode;
rnode->opt = blockparse->blockhead.opt;
}
if (blknum != NULL) {
*blknum = blockparse->blockhead.blkno;
@ -245,6 +246,7 @@ void PRTrackRelStorageDrop(XLogRecParseState *recordBlockState, HTAB *redoItemHa
rNode.dbNode = blockparse->blockhead.dbNode;
rNode.relNode = blockparse->blockhead.relNode;
rNode.bucketNode = blockparse->blockhead.bucketNode;
rNode.opt = blockparse->blockhead.opt;
#ifdef USE_ASSERT_CHECKING
ereport(LOG, (errmsg("PRTrackRelTruncate:(%X/%X)clear relation %u/%u/%u forknum %u record",
(uint32)(blockparse->blockhead.end_ptr >> 32), (uint32)(blockparse->blockhead.end_ptr), rNode.spcNode,

View File

@ -33,6 +33,7 @@
#include "access/xlog_internal.h"
#include "access/nbtree.h"
#include "access/ubtree.h"
#include "access/hash_xlog.h"
#include "access/xlogreader.h"
#include "access/gist_private.h"
#include "access/multixact.h"
@ -190,7 +191,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = {
{ DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE },
{ DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE },
{ DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE },
{ DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 },
{ DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE },
{ DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE },
/* XLOG_GIST_PAGE_DELETE is not used and info isn't continus */
{ DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 },
@ -1152,8 +1153,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time
/* Run from the dispatcher thread. */
static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
{
DispatchTxnRecord(record, expectedTLIs);
return true;
bool isNeedFullSync = false;
/* hash indexes do not support MVCC, so we need to sync with the trxn thread when the record is a vacuum record */
if (IsHashVacuumPages(record) && g_supportHotStandby) {
GetSlotIds(record);
/* sync with the trxn thread: only the page-worker threads need to process
 * this record and wait for trxn sync; the trxn thread does not execute it */
DispatchToSpecPageWorker(record, expectedTLIs);
} else {
DispatchRecordWithPages(record, expectedTLIs);
}
return isNeedFullSync;
}
static bool DispatchBtreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)

View File

@ -1369,6 +1369,7 @@ void RedoPageWorkerRedoBcmBlock(XLogRecParseState *procState)
node.dbNode = procState->blockparse.blockhead.dbNode;
node.relNode = procState->blockparse.blockhead.relNode;
node.bucketNode = procState->blockparse.blockhead.bucketNode;
node.opt = procState->blockparse.blockhead.opt;
XLogBlockNewCuParse *newCuParse = &(procState->blockparse.extra_rec.blocknewcu);
uint8 info = XLogBlockHeadGetInfo(&procState->blockparse.blockhead) & ~XLR_INFO_MASK;
switch (info & XLOG_HEAP_OPMASK) {

View File

@ -32,6 +32,7 @@
#include "access/xlog_internal.h"
#include "access/nbtree.h"
#include "access/ubtree.h"
#include "access/hash_xlog.h"
#include "access/xlogreader.h"
#include "access/gist_private.h"
#include "access/multixact.h"
@ -181,7 +182,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = {
{ DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE },
{ DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE },
{ DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE},
{ DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 },
{ DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE },
{ DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE },
/* XLOG_GIST_PAGE_DELETE is not used and info isn't continus */
{ DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 },
@ -1073,8 +1074,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time
/* Run from the dispatcher thread. */
static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
{
DispatchTxnRecord(record, expectedTLIs, recordXTime, false);
return true;
bool isNeedFullSync = false;
/* hash indexes do not support MVCC, so we need to sync with the trxn thread when the record is a vacuum record */
if (IsHashVacuumPages(record) && g_supportHotStandby) {
GetWorkerIds(record, ANY_WORKER, true);
/* sync with the trxn thread: only the page-worker threads need to process
 * this record and wait for trxn sync; the trxn thread does not execute it */
DispatchToSpecPageWorker(record, expectedTLIs, true);
} else {
DispatchRecordWithPages(record, expectedTLIs, true);
}
return isNeedFullSync;
}
static bool DispatchBtreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)

View File

@ -29,6 +29,7 @@
#include "access/gin.h"
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "access/ustore/knl_uredo.h"
#include "access/multixact.h"

View File

@ -511,7 +511,8 @@ XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, int bucket_id, bool istoast)
* The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are
* reserved for use by me.
*/
if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE | XLR_BTREE_UPGRADE_FLAG | XLR_IS_TOAST)) != 0) {
if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE |
XLR_BTREE_UPGRADE_FLAG | XLR_REL_COMPRESS | XLR_IS_TOAST)) != 0) {
ereport(PANIC, (errmsg("invalid xlog info mask %hhx", info)));
}
@ -717,6 +718,12 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogFPWInfo fpw_
bool samerel = false;
bool tde = false;
// must be uncompressed table during upgrade
bool isCompressedTable = regbuf->rnode.opt != 0;
if (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION) {
Assert(!isCompressedTable);
}
if (!regbuf->in_use)
continue;
@ -864,7 +871,7 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogFPWInfo fpw_
samerel = false;
prev_regbuf = regbuf;
if (!samerel && IsSegmentFileNode(regbuf->rnode)) {
if (!samerel && (IsSegmentFileNode(regbuf->rnode) || isCompressedTable)) {
Assert(bkpb.id <= XLR_MAX_BLOCK_ID);
bkpb.id += BKID_HAS_BUCKET_OR_SEGPAGE;
}
@ -880,9 +887,21 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogFPWInfo fpw_
}
if (!samerel) {
if (IsSegmentFileNode(regbuf->rnode)) {
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNode), &regbuf->rnode, remained_size);
hashbucket_flag = true;
if (IsSegmentFileNode(regbuf->rnode) || isCompressedTable) {
if (IsSegmentFileNode(regbuf->rnode)) {
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNode), &regbuf->rnode, remained_size);
hashbucket_flag = true;
} else if (isCompressedTable) {
if (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION) {
Assert(!isCompressedTable);
RelFileNodeV2 relFileNodeV2;
RelFileNodeV2Copy(relFileNodeV2, regbuf->rnode);
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNodeV2), &regbuf->rnode, remained_size);
} else {
info |= XLR_REL_COMPRESS;
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNode), &regbuf->rnode, remained_size);
}
}
} else {
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNodeOld), &regbuf->rnode, remained_size);
no_hashbucket_flag = true;

View File

@ -949,6 +949,18 @@ void ResetDecoder(XLogReaderState *state)
remaining -= sizeof(type); \
} while (0)
/**
* During an upgrade, a decoded record may still carry the old RelFileNodeV2 layout;
* normalize it into the new RelFileNode here (assumes a little-endian system).
* @param relfileNode the decoded RelFileNode to normalize
*/
static void CompressTableRecord(RelFileNode* relfileNode)
{
if (relfileNode->bucketNode <= -1 && relfileNode->opt == 0xFFFF) {
relfileNode->opt = 0;
}
}
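The check above works because of how an old-format node whose wider bucket field held InvalidBktId (-1) lands on the new layout when copied byte-for-byte on a little-endian machine: the low half still reads as -1 in bucketNode while the high half shows up as 0xFFFF in the new opt field, which therefore has to be cleared. A standalone sketch of that overlay follows; the legacy field widths used here are an assumption for illustration, not taken from this patch.
/* Sketch: detecting an old-layout record after a byte-for-byte copy. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
typedef struct {                  /* assumed legacy layout: 4-byte bucket field */
    uint32_t spcNode, dbNode, relNode;
    int32_t  bucketNode;          /* InvalidBktId == -1 */
} LegacyNode;
typedef struct {                  /* new layout: bucket shrunk, opt appended */
    uint32_t spcNode, dbNode, relNode;
    int16_t  bucketNode;
    uint16_t opt;
} NewNode;
int main(void)
{
    LegacyNode legacy = { 1663, 16384, 24576, -1 };
    NewNode node;
    memcpy(&node, &legacy, sizeof(node));       /* little-endian overlay */
    printf("bucketNode=%d opt=0x%X\n", node.bucketNode, node.opt);
    if (node.bucketNode <= -1 && node.opt == 0xFFFF)
        node.opt = 0;                           /* same normalization as above */
    return 0;
}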
/*
* Decode the previously read record.
*
@ -1067,8 +1079,11 @@ bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errorms
if (remaining < filenodelen)
goto shortdata_err;
blk->rnode.bucketNode = InvalidBktId;
blk->rnode.opt = 0;
errno_t rc = memcpy_s(&blk->rnode, filenodelen, ptr, filenodelen);
securec_check(rc, "\0", "\0");
/* support decode old version of relfileNode */
CompressTableRecord(&blk->rnode);
ptr += filenodelen;
remaining -= filenodelen;

View File

@ -1331,6 +1331,7 @@ void XLogForgetDDLRedo(XLogRecParseState *redoblockstate)
relNode.dbNode = redoblockstate->blockparse.blockhead.dbNode;
relNode.relNode = redoblockstate->blockparse.blockhead.relNode;
relNode.bucketNode = redoblockstate->blockparse.blockhead.bucketNode;
relNode.opt = redoblockstate->blockparse.blockhead.opt;
XLogTruncateRelation(relNode, redoblockstate->blockparse.blockhead.forknum,
redoblockstate->blockparse.blockhead.blkno);
}
@ -1342,7 +1343,8 @@ void XLogDropSpaceShrink(XLogRecParseState *redoblockstate)
.spcNode = redoblockstate->blockparse.blockhead.spcNode,
.dbNode = redoblockstate->blockparse.blockhead.dbNode,
.relNode = redoblockstate->blockparse.blockhead.relNode,
.bucketNode = redoblockstate->blockparse.blockhead.bucketNode
.bucketNode = redoblockstate->blockparse.blockhead.bucketNode,
.opt = redoblockstate->blockparse.blockhead.opt
};
ForkNumber forknum = redoblockstate->blockparse.blockhead.forknum;
BlockNumber target_size = redoblockstate->blockparse.blockhead.blkno;

View File

@ -1400,6 +1400,7 @@ static void UHeapXlogUpdateBlock(XLogBlockHead *blockhead, XLogBlockDataParse *b
rnode.dbNode = blockhead->dbNode;
rnode.relNode = blockhead->relNode;
rnode.bucketNode = blockhead->bucketNode;
rnode.opt = blockhead->opt;
XLogRecordPageWithFreeSpace(rnode, bufferinfo->blockinfo.blkno, freespace);
}
} else {

View File

@ -5757,6 +5757,51 @@ bool ConditionalLockBufferForCleanup(Buffer buffer)
return false;
}
/*
* IsBufferCleanupOK - as above, but we already have the lock
*
* Check whether it's OK to perform cleanup on a buffer we've already
* locked. If we observe that the pin count is 1, our exclusive lock
* happens to be a cleanup lock, and we can proceed with anything that
* would have been allowable had we sought a cleanup lock originally.
*/
bool IsBufferCleanupOK(Buffer buffer)
{
BufferDesc *bufHdr;
uint32 buf_state;
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer)) {
/* There should be exactly one pin */
if (u_sess->storage_cxt.LocalRefCount[-buffer - 1] != 1)
return false;
/* Nobody else to wait for */
return true;
}
/* There should be exactly one local pin */
if (GetPrivateRefCount(buffer) != 1)
return false;
bufHdr = GetBufferDescriptor(buffer - 1);
/* caller must hold exclusive lock on buffer */
Assert(LWLockHeldByMeInMode(bufHdr->content_lock, LW_EXCLUSIVE));
buf_state = LockBufHdr(bufHdr);
Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) {
/* pincount is OK. */
UnlockBufHdr(bufHdr, buf_state);
return true;
}
UnlockBufHdr(bufHdr, buf_state);
return false;
}
/*
* Functions for buffer I/O handling
*

View File

@ -191,6 +191,16 @@ static pthread_mutex_t VFDLockArray[NUM_VFD_PARTITIONS];
#define VFDMappingPartitionLock(hashcode) \
(&VFDLockArray[VFDTableHashPartition(hashcode)])
/*
* SAFE_MUNMAP -- unmap the page-compression address map attached to a vfd, if any
*/
#define SAFE_MUNMAP(vfdP) \
do { \
if ((vfdP)->with_pcmap && (vfdP)->pcmap != NULL) { \
UnReferenceAddrFile((vfdP)); \
(vfdP)->pcmap = NULL; \
} \
} while (0)
/* --------------------
*
* Private Routines
@ -344,11 +354,13 @@ RelFileNodeForkNum RelFileNodeForkNumFill(RelFileNode* rnode,
filenode.rnode.node.spcNode = rnode->spcNode;
filenode.rnode.node.dbNode = rnode->dbNode;
filenode.rnode.node.bucketNode = rnode->bucketNode;
filenode.rnode.node.opt = rnode->opt;
} else {
filenode.rnode.node.relNode = InvalidOid;
filenode.rnode.node.spcNode = InvalidOid;
filenode.rnode.node.dbNode = InvalidOid;
filenode.rnode.node.bucketNode = InvalidBktId;
filenode.rnode.node.opt = 0;
}
filenode.rnode.backend = backend;
@ -915,6 +927,7 @@ static void LruDelete(File file)
vfdP = &vfdcache[file];
SAFE_MUNMAP(vfdP);
/* delete the vfd record from the LRU ring */
Delete(file);
@ -1704,6 +1717,8 @@ void FileCloseWithThief(File file)
{
Vfd* vfdP = &GetVfdCache()[file];
if (!FileIsNotOpen(file)) {
SAFE_MUNMAP(vfdP);
/* remove the file from the lru ring */
Delete(file);
/* the thief has close the real fd */
@ -1843,6 +1858,8 @@ void FileClose(File file)
vfdP = &vfdcache[file];
if (!FileIsNotOpen(file)) {
SAFE_MUNMAP(vfdP);
/* remove the file from the lru ring */
Delete(file);
@ -3994,3 +4011,48 @@ static void UnlinkIfExistsFname(const char *fname, bool isdir, int elevel)
}
}
/*
* initialize page compress memory map.
*
*/
void SetupPageCompressMemoryMap(File file, RelFileNode node, const RelFileNodeForkNum& relFileNodeForkNum)
{
Vfd *vfdP = &GetVfdCache()[file];
auto chunk_size = CHUNK_SIZE_LIST[GET_COMPRESS_CHUNK_SIZE(node.opt)];
int returnCode = FileAccess(file);
if (returnCode < 0) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to open file %s: %m", vfdP->fileName)));
}
RelFileNodeForkNum newOne(relFileNodeForkNum);
newOne.forknumber = PCA_FORKNUM;
PageCompressHeader *map = GetPageCompressHeader(vfdP, chunk_size, newOne);
vfdP->with_pcmap = true;
vfdP->pcmap = map;
}
/*
* Return the page compress memory map.
*
*/
PageCompressHeader *GetPageCompressMemoryMap(File file, uint32 chunk_size)
{
int returnCode;
Vfd *vfdP = &GetVfdCache()[file];
PageCompressHeader *map = NULL;
Assert(FileIsValid(file));
returnCode = FileAccess(file);
if (returnCode < 0) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to open file %s: %m", vfdP->fileName)));
}
Assert(vfdP->with_pcmap);
if (vfdP->pcmap == NULL) {
map = GetPageCompressHeader(vfdP, chunk_size, vfdP->fileNode);
vfdP->with_pcmap = true;
vfdP->pcmap = map;
}
return vfdP->pcmap;
}

View File

@ -436,3 +436,28 @@ void PageSetChecksumInplace(Page page, BlockNumber blkno)
((PageHeader)page)->pd_checksum = pg_checksum_page((char*)page, blkno);
}
/*
* PageGetFreeSpaceForMultipleTuples
* Returns the size of the free (allocatable) space on a page,
* reduced by the space needed for multiple new line pointers.
*
* Note: this should usually only be used on index pages. Use
* PageGetHeapFreeSpace on heap pages.
*/
Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups)
{
int space;
/*
* Use signed arithmetic here so that we behave sensibly if pd_lower >
* pd_upper.
*/
space = (int)((PageHeader)page)->pd_upper - (int)((PageHeader)page)->pd_lower;
if (space < (int)(ntups * sizeof(ItemIdData)))
return 0;
space -= ntups * sizeof(ItemIdData);
return (Size) space;
}
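The signed-arithmetic guard above is easy to verify in isolation; here is a standalone sketch with made-up page geometry (4-byte line pointers assumed, matching sizeof(ItemIdData)).
/* Sketch of the free-space computation with illustrative numbers. */
#include <stdio.h>
static long free_space_for_multiple_tuples(long pd_upper, long pd_lower,
                                           int ntups, long itemid_size)
{
    long space = pd_upper - pd_lower;         /* signed, like the real code */
    if (space < ntups * itemid_size)
        return 0;                             /* also covers pd_lower > pd_upper */
    return space - ntups * itemid_size;       /* room left for the tuple data */
}
int main(void)
{
    /* pd_lower = 128, pd_upper = 4096, 10 new tuples, 4-byte line pointers:
     * 3968 bytes of hole minus 40 bytes of new line pointers = 3928 */
    printf("%ld\n", free_space_for_multiple_tuples(4096, 128, 10, 4));
    return 0;
}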

View File

@ -105,6 +105,10 @@ static void formatBitmap(const unsigned char *start, int len, char bit1, char bi
void PrepForRead(char *path, int64 blocknum, char *relation_type, char *outputFilename, RelFileNode *relnode,
bool parse_page)
{
if (CalculateCompressMainForkSize(path, true) != 0) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("compressed table file is not allowed here."))));
}
char *pathFirstpart = (char *)palloc(MAXFNAMELEN * sizeof(char));
errno_t rc = memset_s(pathFirstpart, MAXFNAMELEN, 0, MAXFNAMELEN);
securec_check(rc, "\0", "\0");
@ -133,7 +137,7 @@ void PrepForRead(char *path, int64 blocknum, char *relation_type, char *outputFi
(errmsg("The tablespace oid is 0. Please check the first parameter path. "
"If you are not sure about the table path, please check pg_relation_filepath."))));
RelFileNodeRelCopy(*relnode, relfilenode.rnode.node);
relnode->opt = 0;
char *pagesuffix = "page";
char *xlogsuffix = "xlog";
rc = snprintf_s(outputFilename + (int)strlen(outputFilename), MAXFILENAME, MAXFILENAME - 1, "%s/%u_%u_%u_%d.%s",
@ -496,6 +500,7 @@ static void CheckSegment(RelFileNode *relnode, ForkNumber forkNum)
relnodeHead->dbNode = relnode->dbNode;
relnodeHead->relNode = 1;
relnodeHead->bucketNode = relnode->bucketNode;
relnodeHead->opt = relnode->opt;
Buffer buffer_temp = ReadBufferFast(spc, *relnodeHead, forkNum, relnode->relNode, RBM_NORMAL);
if (!BufferIsValid(buffer_temp))
ereport(ERROR, (errcode_for_file_access(), errmsg("Segment Head is invalid %u/%u/%u %d %u",

View File

@ -123,6 +123,7 @@ Datum gs_read_block_from_remote(PG_FUNCTION_ARGS)
key.relfilenode.dbNode = dbNode;
key.relfilenode.relNode = relNode;
key.relfilenode.bucketNode = bucketNode;
key.relfilenode.opt = 0;
key.forknum = forkNum;
key.blocknum = blockNum;
@ -141,6 +142,48 @@ Datum gs_read_block_from_remote(PG_FUNCTION_ARGS)
}
}
/*
* Read block from buffer from primary, returning it as bytea
*/
Datum gs_read_block_from_remote_compress(PG_FUNCTION_ARGS)
{
RepairBlockKey key;
uint32 blockSize;
uint64 lsn;
int timeout = 0;
bool isForCU = false;
bytea* result = NULL;
if (GetUserId() != BOOTSTRAP_SUPERUSERID) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be initial account to read files"))));
}
/* handle optional arguments */
key.relfilenode.spcNode = PG_GETARG_UINT32(0);
key.relfilenode.dbNode = PG_GETARG_UINT32(1);
key.relfilenode.relNode = PG_GETARG_UINT32(2);
key.relfilenode.bucketNode = PG_GETARG_INT16(3);
key.relfilenode.opt = PG_GETARG_UINT16(4);
key.forknum = PG_GETARG_INT32(5);
key.blocknum = (uint64)PG_GETARG_TRANSACTIONID(6);
blockSize = PG_GETARG_UINT32(7);
lsn = (uint64)PG_GETARG_TRANSACTIONID(8);
isForCU = PG_GETARG_BOOL(9);
timeout = PG_GETARG_INT32(10);
/* get block from local buffer */
if (isForCU) {
/* if request to read CU block, we use forkNum column to replace colid. */
(void)StandbyReadCUforPrimary(key, key.blocknum, blockSize, lsn, timeout, &result);
} else {
(void)StandbyReadPageforPrimary(key, blockSize, lsn, &result, timeout, NULL);
}
if (NULL != result) {
PG_RETURN_BYTEA_P(result);
} else {
PG_RETURN_NULL();
}
}
/*
* @Description: read cu for primary
* @IN spcnode: tablespace id

View File

@ -34,6 +34,7 @@
#include "access/xlog.h"
#include "storage/smgr/fd.h"
#include "storage/ipc.h"
#include "storage/page_compression.h"
#include "storage/pmsignal.h"
#include "storage/checksum.h"
#ifdef ENABLE_MOT
@ -116,6 +117,9 @@ static void send_xlog_header(const char *linkpath);
static void save_xlogloc(const char *xloglocation);
static XLogRecPtr GetMinArchiveSlotLSN(void);
/* compressed Function */
static void SendCompressedFile(char* readFileName, int basePathLen, struct stat& statbuf, bool missingOk, int64* size);
/*
* save xlog location
*/
@ -1259,6 +1263,35 @@ static bool IsDCFPath(const char *pathname)
return false;
}
#define SEND_DIR_ADD_SIZE(size, statbuf) ((size) = (size) + (((statbuf).st_size + 511) & ~511) + BUILD_PATH_LEN)
/**
* send a plain file or a compressed file
* @param sizeOnly if true, only compute the size without sending any data
* @param pathbuf file path
* @param pathBufLen length of pathbuf
* @param basepathlen length of the base-path prefix stripped from the tar member name
* @param statbuf stat of the file
* @return the size accounted for this file, rounded up to 512-byte blocks
*/
static int64 SendRealFile(bool sizeOnly, char* pathbuf, size_t pathBufLen, int basepathlen, struct stat* statbuf)
{
int64 size = 0;
// we must ensure page integrity when incremental checkpoint is enabled
if (!sizeOnly && g_instance.attr.attr_storage.enableIncrementalCheckpoint &&
IsCompressedFile(pathbuf, strlen(pathbuf)) != COMPRESSED_TYPE_UNKNOWN) {
SendCompressedFile(pathbuf, basepathlen, (*statbuf), true, &size);
} else {
bool sent = false;
if (!sizeOnly) {
sent = sendFile(pathbuf, pathbuf + basepathlen + 1, statbuf, true);
}
if (sent || sizeOnly) {
/* Add size, rounded up to 512byte block */
SEND_DIR_ADD_SIZE(size, (*statbuf));
}
}
return size;
}
/*
* Include all files from the given directory in the output tar stream. If
* 'sizeonly' is true, we just calculate a total length and return it, without
@ -1557,15 +1590,7 @@ static int64 sendDir(const char *path, int basepathlen, bool sizeonly, List *tab
if (!skip_this_dir)
size += sendDir(pathbuf, basepathlen, sizeonly, tablespaces, sendtblspclinks);
} else if (S_ISREG(statbuf.st_mode)) {
bool sent = false;
if (!sizeonly)
sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf, true);
if (sent || sizeonly) {
/* Add size, rounded up to 512byte block */
size = size + ((statbuf.st_size + 511) & ~511) + BUILD_PATH_LEN;
}
size += SendRealFile(sizeonly, pathbuf, strlen(pathbuf), basepathlen, &statbuf);
} else
ereport(WARNING, (errmsg("skipping special file \"%s\"", pathbuf)));
}
@ -1692,6 +1717,15 @@ bool is_row_data_file(const char *path, int *segNo, UndoFileType *undoFileType)
int nmatch;
char *fname = NULL;
/* Skip compressed page files */
size_t pathLen = strlen(path);
if (pathLen >= 4) {
const char* suffix = path + pathLen - 4;
if (strncmp(suffix, "_pca", 4) == 0 || strncmp(suffix, "_pcd", 4) == 0) {
return false;
}
}
if ((fname = strstr((char *)path, "pg_tblspc/")) != NULL) {
nmatch = sscanf_s(fname, "pg_tblspc/%u/%*[^/]/%u/%s", &spcNode, &dbNode, buf, sizeof(buf));
if (nmatch == 3) {
@ -1809,6 +1843,245 @@ static void SendTableSpaceForBackup(basebackup_options* opt, List* tablespaces,
}
}
/**
* initialize buf_block if it has not been allocated yet; enlarge PqSendBuffer if necessary
*/
static void SendFilePreInit(void)
{
if (t_thrd.basebackup_cxt.buf_block == NULL) {
MemoryContext oldcxt = MemoryContextSwitchTo(THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE));
t_thrd.basebackup_cxt.buf_block = (char *)palloc0(TAR_SEND_SIZE);
MemoryContextSwitchTo(oldcxt);
}
/*
* repalloc to `MaxBuildAllocSize' in one time, to avoid many small step repalloc in `pq_putmessage_noblock'
* and low performance.
*/
if (INT2SIZET(t_thrd.libpq_cxt.PqSendBufferSize) < MaxBuildAllocSize) {
t_thrd.libpq_cxt.PqSendBuffer = (char *)repalloc(t_thrd.libpq_cxt.PqSendBuffer, MaxBuildAllocSize);
t_thrd.libpq_cxt.PqSendBufferSize = MaxBuildAllocSize;
}
}
/**
* check the file size and open it
* @param readFileName file to open
* @param statbuf stat of the file
* @param missingOk if true, suppress the error when the file is not found
* @return NULL if the file is larger than MAX_TAR_MEMBER_FILELEN or cannot be opened; otherwise the opened FILE
*/
static FILE *SizeCheckAndAllocate(char *readFileName, const struct stat &statbuf, bool missingOk)
{
/*
* Some compilers will throw a warning knowing this test can never be true
* because pgoff_t can't exceed the compared maximum on their platform.
*/
if (statbuf.st_size > MAX_TAR_MEMBER_FILELEN) {
ereport(WARNING, (errcode(ERRCODE_NAME_TOO_LONG),
errmsg("archive member \"%s\" too large for tar format", readFileName)));
return NULL;
}
FILE *fp = AllocateFile(readFileName, "rb");
if (fp == NULL) {
if (errno == ENOENT && missingOk)
return NULL;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", readFileName)));
}
return fp;
}
static void TransferPcaFile(const char *readFileName, int basePathLen, const struct stat &statbuf,
PageCompressHeader *transfer,
size_t len)
{
const char *tarfilename = readFileName + basePathLen + 1;
_tarWriteHeader(tarfilename, NULL, (struct stat*)(&statbuf));
char *data = (char *) transfer;
size_t lenBuffer = len;
while (lenBuffer > 0) {
size_t transferLen = Min(TAR_SEND_SIZE, lenBuffer);
if (pq_putmessage_noblock('d', data, transferLen)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("base backup could not send data, aborting backup")));
}
data = data + transferLen;
lenBuffer -= transferLen;
}
size_t pad = ((len + 511) & ~511) - len;
if (pad > 0) {
securec_check(memset_s(t_thrd.basebackup_cxt.buf_block, pad, 0, pad), "", "");
(void) pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, pad);
}
}
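Both the data stream and the pca transfer above pad every tar member out to a 512-byte block boundary; a tiny standalone check of that rounding expression (not part of the patch):
/* Sketch: tar members are padded to a multiple of 512 bytes. */
#include <stdio.h>
#include <stddef.h>
static size_t tar_padding(size_t len)
{
    return ((len + 511) & ~(size_t)511) - len;
}
int main(void)
{
    printf("%zu\n", tar_padding(1300));   /* 1300 rounds up to 1536 -> 236 bytes of padding */
    printf("%zu\n", tar_padding(1024));   /* already aligned -> 0 */
    return 0;
}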
static void FileStat(char* path, struct stat* fileStat)
{
if (stat(path, fileStat) != 0) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file or directory \"%s\": %m", path)));
}
}
}
static void SendCompressedFile(char* readFileName, int basePathLen, struct stat& statbuf, bool missingOk, int64* size)
{
char* tarfilename = readFileName + basePathLen + 1;
SendFilePreInit();
FILE* fp = SizeCheckAndAllocate(readFileName, statbuf, missingOk);
if (fp == NULL) {
return;
}
size_t readFileNameLen = strlen(readFileName);
/* only handle _pcd files here; the matching _pca file is rebuilt and sent later via TransferPcaFile */
if (readFileNameLen < 4 || strncmp(readFileName + readFileNameLen - 4, "_pca", 4) == 0 ||
strncmp(readFileName + readFileNameLen - 4, "_pcd", 4) != 0) {
FreeFile(fp);
return;
}
char tablePath[MAXPGPATH] = {0};
securec_check_c(memcpy_s(tablePath, MAXPGPATH, readFileName, readFileNameLen - 4), "", "");
int segmentNo = 0;
UndoFileType undoFileType = UNDO_INVALID;
if (!is_row_data_file(tablePath, &segmentNo, &undoFileType)) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("%s is not a relation file.", tablePath)));
}
char pcaFilePath[MAXPGPATH];
securec_check_c(memcpy_s(pcaFilePath, MAXPGPATH, readFileName, readFileNameLen), "", "");
pcaFilePath[readFileNameLen - 1] = 'a';
FILE* pcaFile = AllocateFile(pcaFilePath, "rb");
if (pcaFile == NULL) {
if (errno == ENOENT && missingOk) {
FreeFile(fp);
return;
}
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", pcaFilePath)));
}
uint16 chunkSize = ReadChunkSize(pcaFile, pcaFilePath, MAXPGPATH);
struct stat pcaStruct;
FileStat((char*)pcaFilePath, &pcaStruct);
size_t pcaFileLen = SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunkSize);
PageCompressHeader* map = pc_mmap_real_size(fileno(pcaFile), pcaFileLen, true);
if (map == MAP_FAILED) {
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("Failed to mmap page compression address file %s: %m", pcaFilePath)));
}
PageCompressHeader* transfer = (PageCompressHeader*)palloc0(pcaFileLen);
/* decompressed page buffer, avoid frequent allocation */
BlockNumber blockNum = 0;
size_t chunkIndex = 1;
off_t totalLen = 0;
off_t sendLen = 0;
/* send the pkg header containing msg like file size */
BlockNumber totalBlockNum = (BlockNumber)pg_atomic_read_u32(&map->nblocks);
/* Some chunks may have been allocated but not used. Reserve extra chunks
 * (currently 0) to avoid errors if a compressed block grows while the backup runs. */
auto reservedChunks = 0;
securec_check(memcpy_s(transfer, pcaFileLen, map, pcaFileLen), "", "");
decltype(statbuf.st_size) realSize = (map->allocated_chunks + reservedChunks) * chunkSize;
statbuf.st_size = statbuf.st_size >= realSize ? statbuf.st_size : realSize;
_tarWriteHeader(tarfilename, NULL, (struct stat*)(&statbuf));
bool* onlyExtend = (bool*)palloc0(totalBlockNum * sizeof(bool));
/* allocated in advance to prevent repeated allocated */
char pageBuffer[BLCKSZ];
ReadBlockChunksStruct rbStruct{map, pageBuffer, BLCKSZ, fp, segmentNo, readFileName};
for (blockNum = 0; blockNum < totalBlockNum; blockNum++) {
PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(transfer, chunkSize, blockNum);
/* skip some blocks which only extends. The size of blocks is 0. */
if (addr->nchunks == 0) {
onlyExtend[blockNum] = true;
continue;
}
/* read block to t_thrd.basebackup_cxt.buf_block */
size_t bufferSize = TAR_SEND_SIZE - sendLen;
size_t len = ReadAllChunkOfBlock(t_thrd.basebackup_cxt.buf_block + sendLen, bufferSize, blockNum, rbStruct);
/* merge Blocks */
sendLen += len;
if (totalLen + (off_t)len > statbuf.st_size) {
ReleaseMap(map, readFileName);
ereport(ERROR,
(errcode_for_file_access(),
errmsg("some blocks in %s had been changed. Retry backup please. PostBlocks:%u, currentReadBlocks "
":%u, transferSize: %lu. totalLen: %lu, len: %lu",
readFileName,
totalBlockNum,
blockNum,
statbuf.st_size,
totalLen,
len)));
}
if (sendLen > TAR_SEND_SIZE - BLCKSZ) {
if (pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, sendLen)) {
ReleaseMap(map, readFileName);
ereport(ERROR, (errcode_for_file_access(), errmsg("base backup could not send data, aborting backup")));
}
sendLen = 0;
}
uint8 nchunks = len / chunkSize;
addr->nchunks = addr->allocated_chunks = nchunks;
for (size_t i = 0; i < nchunks; i++) {
addr->chunknos[i] = chunkIndex++;
}
addr->checksum = AddrChecksum32(blockNum, addr, chunkSize);
totalLen += len;
}
ReleaseMap(map, readFileName);
if (sendLen != 0) {
if (pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, sendLen)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("base backup could not send data, aborting backup")));
}
}
/* If the file was truncated while we were sending it, pad it with zeros */
if (totalLen < statbuf.st_size) {
securec_check(memset_s(t_thrd.basebackup_cxt.buf_block, TAR_SEND_SIZE, 0, TAR_SEND_SIZE), "", "");
while (totalLen < statbuf.st_size) {
size_t cnt = Min(TAR_SEND_SIZE, statbuf.st_size - totalLen);
(void)pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, cnt);
totalLen += cnt;
}
}
size_t pad = ((totalLen + 511) & ~511) - totalLen;
if (pad > 0) {
securec_check(memset_s(t_thrd.basebackup_cxt.buf_block, pad, 0, pad), "", "");
(void)pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, pad);
}
SEND_DIR_ADD_SIZE(*size, statbuf);
// allocate chunks of some pages which only extend
for (size_t blockNum = 0; blockNum < totalBlockNum; ++blockNum) {
if (onlyExtend[blockNum]) {
PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(transfer, chunkSize, blockNum);
for (size_t i = 0; i < addr->allocated_chunks; i++) {
addr->chunknos[i] = chunkIndex++;
}
}
}
transfer->nblocks = transfer->last_synced_nblocks = blockNum;
transfer->last_synced_allocated_chunks = transfer->allocated_chunks = chunkIndex;
TransferPcaFile(pcaFilePath, basePathLen, pcaStruct, transfer, pcaFileLen);
SEND_DIR_ADD_SIZE(*size, pcaStruct);
FreeFile(pcaFile);
FreeFile(fp);
pfree(transfer);
pfree(onlyExtend);
}
/*
* Given the member, write the TAR header & send the file.
*
@ -1832,39 +2105,11 @@ static bool sendFile(char *readfilename, char *tarfilename, struct stat *statbuf
const int MAX_RETRY_LIMIT = 60;
int retryCnt = 0;
UndoFileType undoFileType = UNDO_INVALID;
if (t_thrd.basebackup_cxt.buf_block == NULL) {
MemoryContext oldcxt = NULL;
oldcxt = MemoryContextSwitchTo(THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE));
t_thrd.basebackup_cxt.buf_block = (char *)palloc0(TAR_SEND_SIZE);
MemoryContextSwitchTo(oldcxt);
}
/*
* repalloc to `MaxBuildAllocSize' in one time, to avoid many small step repalloc in `pq_putmessage_noblock'
* and low performance.
*/
if (INT2SIZET(t_thrd.libpq_cxt.PqSendBufferSize) < MaxBuildAllocSize) {
t_thrd.libpq_cxt.PqSendBuffer = (char *)repalloc(t_thrd.libpq_cxt.PqSendBuffer, MaxBuildAllocSize);
t_thrd.libpq_cxt.PqSendBufferSize = MaxBuildAllocSize;
}
/*
* Some compilers will throw a warning knowing this test can never be true
* because pgoff_t can't exceed the compared maximum on their platform.
*/
if (statbuf->st_size > MAX_FILE_SIZE_LIMIT) {
ereport(WARNING, (errcode(ERRCODE_NAME_TOO_LONG),
errmsg("archive member \"%s\" too large for tar format", tarfilename)));
return false;
}
fp = AllocateFile(readfilename, "rb");
SendFilePreInit();
fp = SizeCheckAndAllocate(readfilename, *statbuf, missing_ok);
if (fp == NULL) {
if (errno == ENOENT && missing_ok)
return false;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", readfilename)));
return false;
}
isNeedCheck = is_row_data_file(readfilename, &segNo, &undoFileType);

View File

@ -13,6 +13,7 @@ set(TGT_smgr_INC
${EVENT_INCLUDE_PATH}
${PROTOBUF_INCLUDE_PATH}
${ZLIB_INCLUDE_PATH}
${ZSTD_INCLUDE_PATH}
)
set(smgr_DEF_OPTIONS ${MACRO_OPTIONS})

View File

@ -9,7 +9,7 @@ ifneq "$(MAKECMDGOALS)" "clean"
endif
endif
endif
OBJS = md.o smgr.o smgrtype.o knl_uundofile.o segstore.o
OBJS = md.o smgr.o smgrtype.o knl_uundofile.o segstore.o page_compression.o mmap_shared.o
SUBDIRS = segment

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,149 @@
/*
* Copyright (c) 2021 Huawei Technologies Co.,Ltd.
*
* openGauss is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* ---------------------------------------------------------------------------------------
*
*
*
* IDENTIFICATION
* src/gausskernel/storage/smgr/mmap_shared.cpp
*
* ---------------------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "catalog/pg_type.h"
#include "utils/datum.h"
#include "utils/relcache.h"
#include "utils/memutils.h"
#include "utils/memprot.h"
#include "storage/page_compression.h"
#include "executor/executor.h"
#include "storage/vfd.h"
struct MmapEntry {
RelFileNodeForkNum relFileNodeForkNum;
/*
* the following are set at runtime
*/
size_t reference = 0;
PageCompressHeader *pcmap = NULL;
};
constexpr size_t LOCK_ARRAY_SIZE = 1024;
static pthread_mutex_t mmapLockArray[LOCK_ARRAY_SIZE];
static inline uint32 MmapTableHashCode(const RelFileNodeForkNum &relFileNodeForkNum)
{
return tag_hash((void *)&relFileNodeForkNum, sizeof(RelFileNodeForkNum));
}
static inline pthread_mutex_t *MmapPartitionLock(size_t hashCode)
{
return &mmapLockArray[hashCode % LOCK_ARRAY_SIZE];
}
static inline PageCompressHeader *MmapSharedMapFile(Vfd *vfdP, uint16 chunkSize, uint2 opt, bool readonly)
{
auto map = pc_mmap_real_size(vfdP->fd, SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunkSize), false);
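/* Added note: a freshly created pca file still has an all-zero header, so the first
* mapper stamps chunk_size and the compression algorithm and msyncs them before the
* address table is relied upon. */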
if (map->chunk_size == 0 || map->algorithm == 0) {
map->chunk_size = chunkSize;
map->algorithm = GET_COMPRESS_ALGORITHM(opt);
if (pc_msync(map) != 0) {
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", vfdP->fileName)));
}
}
if (RecoveryInProgress() && !map->sync) {
CheckAndRepairCompressAddress(map, chunkSize, map->algorithm, vfdP->fileName);
}
return map;
}
void RealInitialMMapLockArray()
{
for (size_t i = 0; i < LOCK_ARRAY_SIZE; ++i) {
pthread_mutex_init(&mmapLockArray[i], NULL);
}
HASHCTL ctl;
/* hash accessed by database file id */
errno_t rc = memset_s(&ctl, sizeof(ctl), 0, sizeof(ctl));
securec_check(rc, "", "");
ctl.keysize = sizeof(RelFileNodeForkNum);
ctl.entrysize = sizeof(MmapEntry);
ctl.hash = tag_hash;
ctl.num_partitions = LOCK_ARRAY_SIZE;
const size_t initLen = 256;
g_instance.mmapCache = HeapMemInitHash(
"mmap hash", initLen,
(Max(g_instance.attr.attr_common.max_files_per_process, t_thrd.storage_cxt.max_userdatafiles)) / 2, &ctl,
HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
}
PageCompressHeader *GetPageCompressHeader(void *vfd, uint16 chunkSize, const RelFileNodeForkNum &relFileNodeForkNum)
{
Vfd *currentVfd = (Vfd *)vfd;
uint32 hashCode = MmapTableHashCode(relFileNodeForkNum);
AutoMutexLock mmapLock(MmapPartitionLock(hashCode));
mmapLock.lock();
bool find = false;
MmapEntry *mmapEntry = (MmapEntry *)hash_search_with_hash_value(g_instance.mmapCache, (void *)&relFileNodeForkNum,
hashCode, HASH_ENTER, &find);
if (!find) {
mmapEntry->pcmap = NULL;
mmapEntry->reference = 0;
}
if (mmapEntry->pcmap == NULL) {
mmapEntry->pcmap = MmapSharedMapFile(currentVfd, chunkSize, relFileNodeForkNum.rnode.node.opt, false);
}
++mmapEntry->reference;
mmapLock.unLock();
return mmapEntry->pcmap;
}
void UnReferenceAddrFile(void *vfd)
{
Vfd *currentVfd = (Vfd *)vfd;
RelFileNodeForkNum relFileNodeForkNum = currentVfd->fileNode;
uint32 hashCode = MmapTableHashCode(relFileNodeForkNum);
AutoMutexLock mmapLock(MmapPartitionLock(hashCode));
mmapLock.lock();
MmapEntry *mmapEntry = (MmapEntry *)hash_search_with_hash_value(g_instance.mmapCache, (void *)&relFileNodeForkNum,
hashCode, HASH_FIND, NULL);
if (mmapEntry == NULL) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("UnReferenceAddrFile failed! mmap not found, filePath: %s", currentVfd->fileName)));
}
--mmapEntry->reference;
if (mmapEntry->reference == 0) {
if (pc_munmap(mmapEntry->pcmap) != 0) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not munmap file \"%s\": %m", currentVfd->fileName)));
}
if (hash_search_with_hash_value(g_instance.mmapCache, (void *)&relFileNodeForkNum, hashCode, HASH_REMOVE,
NULL) == NULL) {
ereport(ERROR,
(errcode_for_file_access(),
errmsg("UnReferenceAddrFile failed! remove hash key failed, filePath: %s", currentVfd->fileName)));
}
} else if (mmapEntry->reference < 0) {
ereport(FATAL, (errcode_for_file_access(), errmsg("negative mmap reference count for file \"%s\"", currentVfd->fileName)));
}
mmapLock.unLock();
}
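/*
* Illustrative pairing sketch (added, not part of this patch): how md-layer callers are
* expected to use the two functions above. ExampleOpenCompressedFork and its arguments are
* hypothetical; GetPageCompressHeader() maps (or reuses) the shared pca header for an open
* fd, and UnReferenceAddrFile() drops the reference and munmaps on the last close.
*/
static void ExampleOpenCompressedFork(Vfd *vfdP, uint16 chunkSize)
{
PageCompressHeader *pcMap = GetPageCompressHeader(vfdP, chunkSize, vfdP->fileNode);
/* ... read or update chunk addresses through pcMap ... */
UnReferenceAddrFile(vfdP);
}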

View File

@ -0,0 +1,472 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
* Copyright (c) 2020, PostgreSQL Global Development Group
*
* openGauss is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* -------------------------------------------------------------------------
*
* page_compression.cpp
* Routines for page compression
*
* There are two implementations at the moment: zstd, and the Postgres
* pg_lzcompress(). zstd support requires that the server was compiled
* with --with-zstd.
* IDENTIFICATION
* ./src/gausskernel/storage/smgr/page_compression.cpp
*
* -------------------------------------------------------------------------
*/
#include "postgres.h"
#include "miscadmin.h"
#include "catalog/pg_type.h"
#include "utils/datum.h"
#include "utils/relcache.h"
#include "utils/timestamp.h"
#include "storage/checksum.h"
#include "storage/page_compression.h"
#include "storage/page_compression_impl.h"
static void CheckHeaderOfCompressAddr(PageCompressHeader* pcMap, uint16 chunk_size, uint8 algorithm, const char* path)
{
if (pcMap->chunk_size != chunk_size || pcMap->algorithm != algorithm) {
if (u_sess->attr.attr_security.zero_damaged_pages) {
ereport(WARNING,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid chunk_size %u or algorithm %u in head of compress relation address file \"%s\", "
"and reinitialized it.",
pcMap->chunk_size,
pcMap->algorithm,
path)));
pcMap->algorithm = algorithm;
pg_atomic_write_u32(&pcMap->nblocks, RELSEG_SIZE);
pg_atomic_write_u32(&pcMap->allocated_chunks, 0);
pg_atomic_write_u32(&pcMap->last_synced_allocated_chunks, 0);
pcMap->chunk_size = chunk_size;
} else {
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid chunk_size %u or algorithm %u in head of compress relation address file \"%s\"",
pcMap->chunk_size,
pcMap->algorithm,
path)));
}
}
}
void CheckAndRepairCompressAddress(PageCompressHeader *pcMap, uint16 chunk_size, uint8 algorithm, const char *path)
{
TimestampTz lastRecoveryTime = pcMap->last_recovery_start_time;
TimestampTz pgStartTime = t_thrd.time_cxt.pg_start_time;
errno_t rc;
/* if the relation had been checked in this startup, skip */
if (lastRecoveryTime == pgStartTime) {
return;
}
/* check head of compress address file */
CheckHeaderOfCompressAddr(pcMap, chunk_size, algorithm, path);
uint32 nblocks = pg_atomic_read_u32(&pcMap->nblocks);
uint32 allocated_chunks = pg_atomic_read_u32(&pcMap->allocated_chunks);
BlockNumber *global_chunknos = (BlockNumber *)palloc0(MAX_CHUNK_NUMBER(chunk_size) * sizeof(BlockNumber));
BlockNumber max_blocknum = (BlockNumber)-1;
BlockNumber max_nonzero_blocknum = (BlockNumber)-1;
BlockNumber max_allocated_chunkno = (pc_chunk_number_t)0;
/* check compress address of every page */
for (BlockNumber blocknum = 0; blocknum < (BlockNumber)RELSEG_SIZE; ++blocknum) {
PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum);
if (pcAddr->checksum != AddrChecksum32(blocknum, pcAddr, chunk_size)) {
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid checksum %u of block %u in file \"%s\"",
pcAddr->checksum, blocknum, path)));
pcAddr->allocated_chunks = pcAddr->nchunks = 0;
for (int i = 0; i < BLCKSZ / chunk_size; ++i) {
pcAddr->chunknos[i] = 0;
}
pcAddr->checksum = 0;
}
/*
* skip when found first zero filled block after nblocks
* if(blocknum >= (BlockNumber)nblocks && pcAddr->allocated_chunks == 0)
* break;
*/
/* check allocated_chunks for one page */
if (pcAddr->allocated_chunks > BLCKSZ / chunk_size) {
if (u_sess->attr.attr_security.zero_damaged_pages) {
rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0,
SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
securec_check_c(rc, "\0", "\0");
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid allocated_chunks %u of block %u in file \"%s\", and zero this block",
pcAddr->allocated_chunks, blocknum, path)));
continue;
} else {
pfree(global_chunknos);
ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid allocated_chunks %u of block %u in file \"%s\"",
pcAddr->allocated_chunks, blocknum, path)));
}
}
/* check chunknos for one page */
for (int i = 0; i < pcAddr->allocated_chunks; ++i) {
/* check for invalid chunkno */
if (pcAddr->chunknos[i] == 0 || pcAddr->chunknos[i] > MAX_CHUNK_NUMBER(chunk_size)) {
if (u_sess->attr.attr_security.zero_damaged_pages) {
rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0,
SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
securec_check_c(rc, "\0", "\0");
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid chunk number %u of block %u in file \"%s\", and zero this block",
pcAddr->chunknos[i], blocknum, path)));
continue;
} else {
pfree(global_chunknos);
ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid chunk number %u of block %u in file \"%s\"", pcAddr->chunknos[i],
blocknum, path)));
}
}
/* check for duplicate chunkno */
if (global_chunknos[pcAddr->chunknos[i] - 1] != 0) {
if (u_sess->attr.attr_security.zero_damaged_pages) {
rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0,
SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
securec_check_c(rc, "\0", "\0");
ereport(
WARNING,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg(
"chunk number %u of block %u duplicate with block %u in file \"%s\", and zero this block",
pcAddr->chunknos[i], blocknum, global_chunknos[pcAddr->chunknos[i] - 1], path)));
continue;
} else {
pfree(global_chunknos);
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("chunk number %u of block %u duplicate with block %u in file \"%s\"",
pcAddr->chunknos[i], blocknum, global_chunknos[pcAddr->chunknos[i] - 1], path)));
}
}
}
/* clean chunknos beyond allocated_chunks for one page */
for (int i = pcAddr->allocated_chunks; i < BLCKSZ / chunk_size; ++i) {
if (pcAddr->chunknos[i] != 0) {
pcAddr->chunknos[i] = 0;
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("clear chunk number %u beyond allocated_chunks %u of block %u in file \"%s\"",
pcAddr->chunknos[i], pcAddr->allocated_chunks, blocknum, path)));
}
}
/* check nchunks for one page */
if (pcAddr->nchunks > pcAddr->allocated_chunks) {
if (u_sess->attr.attr_security.zero_damaged_pages) {
rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0,
SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
securec_check_c(rc, "\0", "\0");
ereport(
WARNING,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("nchunks %u exceeds allocated_chunks %u of block %u in file \"%s\", and zero this block",
pcAddr->nchunks, pcAddr->allocated_chunks, blocknum, path)));
continue;
} else {
pfree(global_chunknos);
ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("nchunks %u exceeds allocated_chunks %u of block %u in file \"%s\"",
pcAddr->nchunks, pcAddr->allocated_chunks, blocknum, path)));
}
}
max_blocknum = blocknum;
if (pcAddr->nchunks > 0) {
max_nonzero_blocknum = blocknum;
}
for (int i = 0; i < pcAddr->allocated_chunks; ++i) {
global_chunknos[pcAddr->chunknos[i] - 1] = blocknum + 1;
if (pcAddr->chunknos[i] > max_allocated_chunkno) {
max_allocated_chunkno = pcAddr->chunknos[i];
}
}
}
int unused_chunks = 0;
/* check for holes in allocated chunks */
for (BlockNumber i = 0; i < max_allocated_chunkno; i++) {
if (global_chunknos[i] == 0) {
unused_chunks++;
}
}
if (unused_chunks > 0) {
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("there are %u chunks of total allocated chunks %u can not be use in file \"%s\"",
unused_chunks, max_allocated_chunkno, path),
errhint("You may need to run VACUMM FULL to optimize space allocation.")));
}
/* update nblocks in head of compressed file */
if (nblocks < max_nonzero_blocknum + 1) {
pg_atomic_write_u32(&pcMap->nblocks, max_nonzero_blocknum + 1);
pg_atomic_write_u32(&pcMap->last_synced_nblocks, max_nonzero_blocknum + 1);
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("update nblocks head of compressed file \"%s\". old: %u, new: %u", path, nblocks,
max_nonzero_blocknum + 1)));
}
/* update allocated_chunks in head of compress file */
if (allocated_chunks != max_allocated_chunkno) {
pg_atomic_write_u32(&pcMap->allocated_chunks, max_allocated_chunkno);
pg_atomic_write_u32(&pcMap->last_synced_allocated_chunks, max_allocated_chunkno);
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("update allocated_chunks in head of compressed file \"%s\". old: %u, new: %u", path,
allocated_chunks, max_allocated_chunkno)));
}
/* clean compress address after max_blocknum + 1 */
for (BlockNumber blocknum = max_blocknum + 1; blocknum < (BlockNumber)RELSEG_SIZE; blocknum++) {
char buf[128];
char *p = NULL;
PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum);
/* skip zero block */
if (pcAddr->allocated_chunks == 0 && pcAddr->nchunks == 0) {
continue;
}
/* clean compress address and output content of the address */
rc = memset_s(buf, sizeof(buf), 0, sizeof(buf));
securec_check_c(rc, "\0", "\0");
p = buf;
for (int i = 0; i < pcAddr->allocated_chunks; i++) {
if (pcAddr->chunknos[i]) {
const char *formatStr = i == 0 ? "%u" : ",%u";
errno_t rc =
snprintf_s(p, sizeof(buf) - (p - buf), sizeof(buf) - (p - buf) - 1, formatStr, pcAddr->chunknos[i]);
securec_check_ss(rc, "\0", "\0");
p += strlen(p);
}
}
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
errmsg("clean unused compress address of block %u in file \"%s\", old "
"allocated_chunks/nchunks/chunknos: %u/%u/{%s}",
blocknum, path, pcAddr->allocated_chunks, pcAddr->nchunks, buf)));
/* zero the address only after it has been reported */
rc =
memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
securec_check_c(rc, "\0", "\0");
}
pfree(global_chunknos);
if (pc_msync(pcMap) != 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", path)));
}
pcMap->last_recovery_start_time = pgStartTime;
}
int64 CalculateMainForkSize(char* pathName, RelFileNode* rnode, ForkNumber forkNumber)
{
Assert(IS_COMPRESSED_RNODE((*rnode), forkNumber));
Assert(rnode->bucketNode == -1);
return CalculateCompressMainForkSize(pathName);
}
void CopyCompressedPath(char dst[MAXPGPATH], const char* pathName, CompressedFileType compressFileType)
{
int rc;
if (compressFileType == COMPRESSED_TABLE_PCA_FILE) {
rc = snprintf_s(dst, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, pathName);
} else {
rc = snprintf_s(dst, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, pathName);
}
securec_check_ss(rc, "\0", "\0");
}
int64 CalculateCompressMainForkSize(char* pathName, bool suppressedENOENT)
{
int64 totalsize = 0;
char pcFilePath[MAXPGPATH];
CopyCompressedPath(pcFilePath, pathName, COMPRESSED_TABLE_PCA_FILE);
totalsize += CalculateFileSize(pcFilePath, MAXPGPATH, suppressedENOENT);
CopyCompressedPath(pcFilePath, pathName, COMPRESSED_TABLE_PCD_FILE);
totalsize += CalculateFileSize(pcFilePath, MAXPGPATH, suppressedENOENT);
return totalsize;
}
uint16 ReadChunkSize(FILE* pcaFile, char* pcaFilePath, size_t len)
{
uint16 chunkSize;
if (fseeko(pcaFile, (off_t)offsetof(PageCompressHeader, chunk_size), SEEK_SET) != 0) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not seek in file \"%s\": \"%lu\": %m", pcaFilePath, len)));
}
if (fread(&chunkSize, sizeof(chunkSize), 1, pcaFile) <= 0) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open file \"%s\": \"%lu\": %m", pcaFilePath, len)));
}
return chunkSize;
}
int64 CalculateFileSize(char* pathName, size_t size, bool suppressedENOENT)
{
struct stat structstat;
if (stat(pathName, &structstat)) {
if (errno == ENOENT) {
if (suppressedENOENT) {
return 0;
}
ereport(ERROR, (errcode_for_file_access(), errmsg("could not FIND file \"%s\": %m", pathName)));
} else {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", pathName)));
}
}
return structstat.st_size;
}
uint1 ConvertChunkSize(uint32 compressedChunkSize, bool *success)
{
uint1 chunkSize = INDEX_OF_HALF_BLCKSZ;
switch (compressedChunkSize) {
case BLCKSZ / 2:
chunkSize = INDEX_OF_HALF_BLCKSZ;
break;
case BLCKSZ / 4:
chunkSize = INDEX_OF_QUARTER_BLCKSZ;
break;
case BLCKSZ / 8:
chunkSize = INDEX_OF_EIGHTH_BRICK_BLCKSZ;
break;
case BLCKSZ / 16:
chunkSize = INDEX_OF_SIXTEENTHS_BLCKSZ;
break;
default:
*success = false;
return chunkSize;
}
*success = true;
return chunkSize;
}
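/*
* Hypothetical caller sketch (added): validating a user supplied compress_chunk_size with
* ConvertChunkSize() above. Only BLCKSZ/2, BLCKSZ/4, BLCKSZ/8 and BLCKSZ/16 are accepted;
* the helper name below is ours.
*/
static bool ValidateChunkSizeOption(uint32 userChunkSize, uint1 *encoded)
{
bool ok = false;
uint1 idx = ConvertChunkSize(userChunkSize, &ok);
if (!ok) {
return false;
}
*encoded = idx;
return true;
}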
constexpr int MAX_RETRY_LIMIT = 60;
constexpr long RETRY_SLEEP_TIME = 1000000L;
size_t ReadAllChunkOfBlock(char *dst, size_t destLen, BlockNumber blockNumber, ReadBlockChunksStruct& rbStruct)
{
PageCompressHeader* header = rbStruct.header;
if (blockNumber >= header->nblocks) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("blocknum \"%u\" exceeds max block number", blockNumber)));
}
char* pageBuffer = rbStruct.pageBuffer;
const char* fileName = rbStruct.fileName;
decltype(PageCompressHeader::chunk_size) chunkSize = header->chunk_size;
decltype(ReadBlockChunksStruct::segmentNo) segmentNo = rbStruct.segmentNo;
PageCompressAddr* currentAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber);
size_t tryCount = 0;
/* chunk counters, also used below to zero-fill allocated but unwritten chunks */
uint8 allocatedChunks;
uint8 nchunks;
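/*
* Added note: the pcd file may still be being flushed while a base backup reads it, so a
* checksum or decompression failure below is retried once per second (RETRY_SLEEP_TIME)
* up to MAX_RETRY_LIMIT times before the backup is aborted.
*/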
do {
allocatedChunks = currentAddr->allocated_chunks;
nchunks = currentAddr->nchunks;
for (uint8 i = 0; i < nchunks; ++i) {
off_t seekPos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, currentAddr->chunknos[i]);
uint8 start = i;
while (i < nchunks - 1 && currentAddr->chunknos[i + 1] == currentAddr->chunknos[i] + 1) {
i++;
}
if (fseeko(rbStruct.fp, seekPos, SEEK_SET) != 0) {
ReleaseMap(header, fileName);
ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in file \"%s\": %m", fileName)));
}
size_t readAmount = chunkSize * (i - start + 1);
if (fread(dst + start * chunkSize, 1, readAmount, rbStruct.fp) != readAmount && ferror(rbStruct.fp)) {
ReleaseMap(header, fileName);
ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fileName)));
}
}
if (nchunks == 0) {
break;
}
if (DecompressPage(dst, pageBuffer, header->algorithm) == BLCKSZ) {
PageHeader phdr = PageHeader(pageBuffer);
BlockNumber blkNo = blockNumber + segmentNo * ((BlockNumber)RELSEG_SIZE);
if (PageIsNew(phdr) || pg_checksum_page(pageBuffer, blkNo) == phdr->pd_checksum) {
break;
}
}
if (tryCount < MAX_RETRY_LIMIT) {
++tryCount;
pg_usleep(RETRY_SLEEP_TIME);
} else {
ReleaseMap(header, fileName);
ereport(ERROR,
(errcode_for_file_access(),
errmsg("base backup cheksum or Decompressed blockno %u failed in file \"%s\", aborting backup. "
"nchunks: %u, allocatedChunks: %u, segno: %d.",
blockNumber,
fileName,
nchunks,
allocatedChunks,
segmentNo)));
}
} while (true);
if (allocatedChunks > nchunks) {
auto currentWriteSize = nchunks * chunkSize;
securec_check(
memset_s(dst + currentWriteSize, destLen - currentWriteSize, 0, (allocatedChunks - nchunks) * chunkSize),
"",
"");
}
return allocatedChunks * chunkSize;
}
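/*
* Added usage note: the return value is allocatedChunks * chunkSize, i.e. the number of raw
* chunk bytes now in dst; chunks that are allocated but not yet written are zero-filled
* above, so callers can stream dst as-is.
*/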
CompressedFileType IsCompressedFile(char *fileName, size_t fileNameLen)
{
size_t suffixLen = 4;
if (fileNameLen >= suffixLen) {
const char *suffix = fileName + fileNameLen - suffixLen;
if (strncmp(suffix, "_pca", suffixLen) == 0) {
return COMPRESSED_TABLE_PCA_FILE;
} else if (strncmp(suffix, "_pcd", suffixLen) == 0) {
return COMPRESSED_TABLE_PCD_FILE;
}
}
return COMPRESSED_TYPE_UNKNOWN;
}
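/*
* Hypothetical helper sketch (added): classifying a path with IsCompressedFile() above,
* e.g. while walking a tablespace directory. The helper name is ours.
*/
static const char *DescribeCompressedFile(char *name)
{
switch (IsCompressedFile(name, strlen(name))) {
case COMPRESSED_TABLE_PCA_FILE:
return "compress address (_pca) file";
case COMPRESSED_TABLE_PCD_FILE:
return "compress data (_pcd) file";
default:
return "not a compressed-table file";
}
}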
void ReleaseMap(PageCompressHeader* map, const char* fileName)
{
if (map != NULL && pc_munmap(map) != 0) {
ereport(WARNING, (errcode_for_file_access(), errmsg("could not munmap file \"%s\": %m", fileName)));
}
}

View File

@ -31,7 +31,8 @@
typedef enum BufTagVer {
ORIGIN_TAG = 0,
HASHBUCKET_TAG
HASHBUCKET_TAG,
PAGE_COMPRESS_TAG
} BufTagVer;
typedef struct st_dw_batch {

View File

@ -4,7 +4,7 @@
* header file for openGauss hash access method implementation
*
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/hash.h
@ -33,36 +33,59 @@
*/
typedef uint32 Bucket;
#define INVALID_BUCKET_NUM (0xFFFFFFFF)
#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B) + 1) - 1] : 0)) + 1)
#define InvalidBucket ((Bucket) 0xFFFFFFFF)
#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B) + 1) - 1] : 0)) + 1)
/*
* Special space for hash index pages.
*
* hasho_flag tells us which type of page we're looking at. For
* example, knowing overflow pages from bucket pages is necessary
* information when you're deleting tuples from a page. If all the
* tuples are deleted from an overflow page, the overflow is made
* available to other buckets by calling _hash_freeovflpage(). If all
* the tuples are deleted from a bucket page, no additional action is
* necessary.
* hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
* Additional bits in the flag word are used for more transient purposes.
*
* To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
* However, we ensure that each used page type has a distinct bit so that
* we can OR together page types for uses such as the allowable-page-types
* argument of _hash_checkpage().
*/
#define LH_UNUSED_PAGE (0)
#define LH_OVERFLOW_PAGE (1 << 0)
#define LH_BUCKET_PAGE (1 << 1)
#define LH_BITMAP_PAGE (1 << 2)
#define LH_META_PAGE (1 << 3)
#define LH_BUCKET_BEING_POPULATED (1 << 4)
#define LH_BUCKET_BEING_SPLIT (1 << 5)
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6)
#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
#define LH_PAGE_TYPE \
(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
/*
* In an overflow page, hasho_prevblkno stores the block number of the previous
* page in the bucket chain; in a bucket page, hasho_prevblkno stores the
* hashm_maxbucket value as of the last time the bucket was last split, or
* else as of the time the bucket was created. The latter convention is used
* to determine whether a cached copy of the metapage is too stale to be used
* without needing to lock or pin the metapage.
*
* hasho_nextblkno is always the block number of the next page in the
* bucket chain, or InvalidBlockNumber if there are no more such pages.
*/
typedef struct HashPageOpaqueData {
BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
BlockNumber hasho_nextblkno; /* next ovfl blkno */
Bucket hasho_bucket; /* bucket number this pg belongs to */
uint16 hasho_flag; /* page type code, see above */
uint16 hasho_page_id; /* for identification of hash indexes */
BlockNumber hasho_prevblkno; /* see above */
BlockNumber hasho_nextblkno; /* see above */
Bucket hasho_bucket; /* bucket number this pg belongs to */
uint16 hasho_flag; /* page type code + flag bits, see above */
uint16 hasho_page_id; /* for identification of hash indexes */
} HashPageOpaqueData;
typedef HashPageOpaqueData* HashPageOpaque;
#define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
#define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
#define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
#define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
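/* Added illustration of the LH_PAGE_TYPE test idiom documented above (macro name is ours): */
#define H_PAGE_IS_OVERFLOW(opaque) (((opaque)->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE)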
/*
* The page ID is for the convenience of pg_filedump and similar utilities,
* which otherwise would have a hard time telling pages of different index
@ -71,26 +94,19 @@ typedef HashPageOpaqueData* HashPageOpaque;
*/
#define HASHO_PAGE_ID 0xFF80
typedef struct HashScanPosItem {
ItemPointerData heapTid; /* TID of referenced heap item */
OffsetNumber indexOffset; /* index item's location within page */
} HashScanPosItem;
/*
* HashScanOpaqueData is private state for a hash index scan.
*/
typedef struct HashScanOpaqueData {
/* Hash value of the scan key, ie, the hash key we seek */
uint32 hashso_sk_hash;
/*
* By definition, a hash scan should be examining only one bucket. We
* record the bucket number here as soon as it is known.
*/
Bucket hashso_bucket;
bool hashso_bucket_valid;
/*
* If we have a share lock on the bucket, we record it here. When
* hashso_bucket_blkno is zero, we have no such lock.
*/
BlockNumber hashso_bucket_blkno;
/*
* We also want to remember which buffer we're currently examining in the
* scan. We keep the buffer pinned (but not locked) across hashgettuple
@ -99,11 +115,33 @@ typedef struct HashScanOpaqueData {
*/
Buffer hashso_curbuf;
/* remember the buffer associated with primary bucket */
Buffer hashso_bucket_buf;
/*
* remember the buffer associated with primary bucket page of bucket being
* split. it is required during the scan of the bucket which is being
* populated during split operation.
*/
Buffer hashso_split_bucket_buf;
/* Current position of the scan, as an index TID */
ItemPointerData hashso_curpos;
/* Current position of the scan, as a heap TID */
ItemPointerData hashso_heappos;
/* Whether scan starts on bucket being populated due to split */
bool hashso_buc_populated;
/*
* Whether scanning bucket being split? The value of this parameter is
* referred only when hashso_buc_populated is true.
*/
bool hashso_buc_split;
/* info about killed items if any (killedItems is NULL if never used) */
HashScanPosItem *killedItems; /* tids and offset numbers of killed items */
int numKilled; /* number of currently stored items */
} HashScanOpaqueData;
typedef HashScanOpaqueData* HashScanOpaque;
@ -115,7 +153,7 @@ typedef HashScanOpaqueData* HashScanOpaque;
#define HASH_METAPAGE 0 /* metapage is always block 0 */
#define HASH_MAGIC 0x6440640
#define HASH_VERSION 2 /* 2 signifies only hash key value is stored */
#define HASH_VERSION 4
/*
* Spares[] holds the number of overflow pages currently allocated at or
@ -128,17 +166,32 @@ typedef HashScanOpaqueData* HashScanOpaque;
*
* ovflpages that have been recycled for reuse can be found by looking at
* bitmaps that are stored within ovflpages dedicated for the purpose.
* The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the
* The blknos of these bitmap pages are kept in mapp[]; nmaps is the
* number of currently existing bitmaps.
*
* The limitation on the size of spares[] comes from the fact that there's
* no point in having more than 2^32 buckets with only uint32 hashcodes.
* (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
* adjusted in such a way to accommodate multi phased allocation of buckets
* after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
*
* There is no particular upper limit on the size of mapp[], other than
* needing to fit into the metapage. (With 8K block size, 128 bitmaps
* limit us to 64 Gb of overflow space...)
* needing to fit into the metapage. (With 8K block size, 1024 bitmaps
* limit us to 256 GB of overflow space...)
*/
#define HASH_MAX_SPLITPOINTS 32
#define HASH_MAX_BITMAPS 128
#define HASH_MAX_BITMAPS 1024
#define HASH_SPLITPOINT_PHASE_BITS 2
#define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS)
#define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1)
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10
/* defines max number of splitpoit phases a hash index can have */
#define HASH_MAX_SPLITPOINT_GROUP 32
#define HASH_MAX_SPLITPOINTS \
(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
HASH_SPLITPOINT_PHASES_PER_GRP) + \
HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
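/*
* Worked example (added) from the constants above: phases per group = 1 << 2 = 4, so
* HASH_MAX_SPLITPOINTS = (32 - 10) * 4 + 10 = 98 spares[] slots, up from the old flat
* limit of 32 splitpoints.
*/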
typedef struct HashMetaPageData {
uint32 hashm_magic; /* magic no. for hash tables */
@ -280,37 +333,40 @@ extern Datum hash_new_uint32(uint32 k);
/* private routines */
/* hashinsert.c */
extern void _hash_doinsert(Relation rel, IndexTuple itup);
extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup);
extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
OffsetNumber *itup_offsets, uint16 nitups);
/* hashovfl.c */
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy);
extern void _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, ForkNumber forkNum);
extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy);
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy);
/* hashpage.c */
extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags);
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
BlockNumber blkno, int flags);
extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh);
extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
int access, HashMetaPage *cachedmetap);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, bool initpage);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum);
extern Buffer _hash_getbuf_with_strategy(
Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy);
extern void _hash_relbuf(Relation rel, Buffer buf);
extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_wrtbuf(Relation rel, Buffer buf);
extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access);
extern uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum);
extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
extern uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum);
extern void _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, uint16 ffactor, bool initpage);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
/* hashscan.c */
extern void _hash_regscan(IndexScanDesc scan);
extern void _hash_dropscan(IndexScanDesc scan);
extern bool _hash_has_active_scan(Relation rel, Bucket bucket);
extern void ReleaseResources_hash(void);
extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
uint32 maxbucket, uint32 highmask, uint32 lowmask);
/* hashsearch.c */
extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
@ -320,10 +376,10 @@ extern bool _hash_step(IndexScanDesc scan, Buffer* bufP, ScanDirection dir);
/* hashsort.c */
typedef struct HSpool HSpool; /* opaque struct in hashsort.c */
extern HSpool* _h_spoolinit(Relation index, uint32 num_buckets, void* meminfo);
extern HSpool* _h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void* meminfo);
extern void _h_spooldestroy(HSpool* hspool);
extern void _h_spool(HSpool* hspool, ItemPointer self, Datum* values, const bool* isnull);
extern void _h_indexbuild(HSpool* hspool);
extern void _h_indexbuild(HSpool* hspool, Relation heapRel);
/* hashutil.c */
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
@ -331,16 +387,31 @@ extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask);
extern uint32 _hash_log2(uint32 num);
extern uint32 _hash_spareindex(uint32 num_bucket);
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern IndexTuple _hash_form_tuple(Relation index, Datum* values, const bool* isnull);
extern bool _hash_convert_tuple(Relation index, Datum *user_values, const bool *user_isnull,
Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
uint32 lowmask, uint32 maxbucket);
extern void _hash_kill_items(IndexScanDesc scan);
/* hash.c */
extern void hash_redo(XLogReaderState* record);
extern void hash_desc(StringInfo buf, XLogReaderState* record);
extern const char* hash_type_name(uint8 subtype);
extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
Buffer bucket_buf, BlockNumber bucket_blkno,
BufferAccessStrategy bstrategy,
uint32 maxbucket, uint32 highmask, uint32 lowmask,
double *tuples_removed, double *num_index_tuples,
bool bucket_has_garbage,
IndexBulkDeleteCallback callback, void *callback_state);
#ifdef PGXC
extern Datum compute_hash(Oid type, Datum value, char locator);

View File

@ -0,0 +1,352 @@
/*-------------------------------------------------------------------------
*
* hash_xlog.h
* header file for Postgres hash AM implementation
*
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/hash_xlog.h
*
*-------------------------------------------------------------------------
*/
#ifndef HASH_XLOG_H
#define HASH_XLOG_H
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"
/* Number of buffers required for XLOG_HASH_SQUEEZE_PAGE operation */
#define HASH_XLOG_FREE_OVFL_BUFS 6
/*
* XLOG records for hash operations
*/
#define XLOG_HASH_INIT_META_PAGE 0x00 /* initialize the meta page */
#define XLOG_HASH_INIT_BITMAP_PAGE 0x10 /* initialize the bitmap page */
#define XLOG_HASH_INSERT 0x20 /* add index tuple without split */
#define XLOG_HASH_ADD_OVFL_PAGE 0x30 /* add overflow page */
#define XLOG_HASH_SPLIT_ALLOCATE_PAGE 0x40 /* allocate new page for split */
#define XLOG_HASH_SPLIT_PAGE 0x50 /* split page */
#define XLOG_HASH_SPLIT_COMPLETE 0x60 /* completion of split operation */
#define XLOG_HASH_MOVE_PAGE_CONTENTS 0x70 /* remove tuples from one page
* and add to another page */
#define XLOG_HASH_SQUEEZE_PAGE 0x80 /* add tuples to one of the previous
* pages in chain and free the ovfl
* page */
#define XLOG_HASH_DELETE 0x90 /* delete index tuples from a page */
#define XLOG_HASH_SPLIT_CLEANUP 0xA0 /* clear split-cleanup flag in primary
* bucket page after deleting tuples
* that are moved due to split */
#define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after vacuum */
#define XLOG_HASH_VACUUM_ONE_PAGE 0xC0 /* remove dead tuples from index page */
typedef enum {
XLOG_HASH_INIT_META_PAGE_NUM = 0,
}XLogHashInitMetaPageEnum;
typedef enum {
XLOG_HASH_INIT_BITMAP_PAGE_BITMAP_NUM = 0,
XLOG_HASH_INIT_BITMAP_PAGE_META_NUM,
}XLogHashInitBitmapPageEnum;
typedef enum {
XLOG_HASH_INSERT_PAGE_NUM = 0,
XLOG_HASH_INSERT_META_NUM,
}XLogHashInsertEnum;
typedef enum {
XLOG_HASH_ADD_OVFL_PAGE_OVFL_NUM = 0,
XLOG_HASH_ADD_OVFL_PAGE_LEFT_NUM,
XLOG_HASH_ADD_OVFL_PAGE_MAP_NUM,
XLOG_HASH_ADD_OVFL_PAGE_NEWMAP_NUM,
XLOG_HASH_ADD_OVFL_PAGE_META_NUM,
}XLogHashAddOvflPageEnum;
typedef enum {
XLOG_HASH_SPLIT_ALLOCATE_PAGE_OBUK_NUM = 0,
XLOG_HASH_SPLIT_ALLOCATE_PAGE_NBUK_NUM,
XLOG_HASH_SPLIT_ALLOCATE_PAGE_META_NUM,
}XLogHashSplitAllocatePageEnum;
typedef enum {
XLOG_HASH_SPLIT_PAGE_NUM = 0,
}XLogHashSplitPageEnum;
typedef enum {
XLOG_HASH_SPLIT_COMPLETE_OBUK_NUM = 0,
XLOG_HASH_SPLIT_COMPLETE_NBUK_NUM,
}XLogHashSplitCompleteEnum;
typedef enum {
HASH_MOVE_BUK_BLOCK_NUM = 0,
HASH_MOVE_ADD_BLOCK_NUM,
HASH_MOVE_DELETE_OVFL_BLOCK_NUM,
}XLogHashMovePageEnum;
typedef enum {
HASH_SQUEEZE_BUK_BLOCK_NUM = 0,
HASH_SQUEEZE_ADD_BLOCK_NUM,
HASH_SQUEEZE_INIT_OVFLBUF_BLOCK_NUM,
HASH_SQUEEZE_UPDATE_PREV_BLOCK_NUM,
HASH_SQUEEZE_UPDATE_NEXT_BLOCK_NUM,
HASH_SQUEEZE_UPDATE_BITMAP_BLOCK_NUM,
HASH_SQUEEZE_UPDATE_META_BLOCK_NUM,
}XLogHashSqueezePageEnum;
typedef enum {
HASH_DELETE_BUK_BLOCK_NUM = 0,
HASH_DELETE_OVFL_BLOCK_NUM,
}XLogHashDeleteEnum;
typedef enum {
HASH_SPLIT_CLEANUP_BLOCK_NUM,
}XLogHashSplitCleanupEnum;
typedef enum {
HASH_UPDATE_META_BLOCK_NUM,
} XLogHashUpdateMateEnum;
typedef enum {
HASH_VACUUM_PAGE_BLOCK_NUM = 0,
HASH_VACUUM_META_BLOCK_NUM,
} XLogHashVacuumPageEnum;
/*
* xl_hash_split_allocate_page flag values, 8 bits are available.
*/
#define XLH_SPLIT_META_UPDATE_MASKS (1<<0)
#define XLH_SPLIT_META_UPDATE_SPLITPOINT (1<<1)
/*
* This is what we need to know about a HASH index create.
*
* Backup block 0: metapage
*/
typedef struct xl_hash_createidx
{
double num_tuples;
RegProcedure procid;
uint16 ffactor;
} xl_hash_createidx;
#define SizeOfHashCreateIdx (offsetof(xl_hash_createidx, ffactor) + sizeof(uint16))
/*
* This is what we need to know about simple (without split) insert.
*
* This data record is used for XLOG_HASH_INSERT
*
* Backup Blk 0: original page (data contains the inserted tuple)
* Backup Blk 1: metapage (HashMetaPageData)
*/
typedef struct xl_hash_insert
{
OffsetNumber offnum;
} xl_hash_insert;
#define SizeOfHashInsert (offsetof(xl_hash_insert, offnum) + sizeof(OffsetNumber))
/*
* This is what we need to know about addition of overflow page.
*
* This data record is used for XLOG_HASH_ADD_OVFL_PAGE
*
* Backup Blk 0: newly allocated overflow page
* Backup Blk 1: page before new overflow page in the bucket chain
* Backup Blk 2: bitmap page
* Backup Blk 3: new bitmap page
* Backup Blk 4: metapage
*/
typedef struct xl_hash_add_ovfl_page
{
uint16 bmsize;
bool bmpage_found;
} xl_hash_add_ovfl_page;
#define SizeOfHashAddOvflPage \
(offsetof(xl_hash_add_ovfl_page, bmpage_found) + sizeof(bool))
/*
* This is what we need to know about allocating a page for split.
*
* This data record is used for XLOG_HASH_SPLIT_ALLOCATE_PAGE
*
* Backup Blk 0: page for old bucket
* Backup Blk 1: page for new bucket
* Backup Blk 2: metapage
*/
typedef struct xl_hash_split_allocate_page
{
uint32 new_bucket;
uint16 old_bucket_flag;
uint16 new_bucket_flag;
uint8 flags;
} xl_hash_split_allocate_page;
#define SizeOfHashSplitAllocPage \
(offsetof(xl_hash_split_allocate_page, flags) + sizeof(uint8))
/*
* This is what we need to know about completing the split operation.
*
* This data record is used for XLOG_HASH_SPLIT_COMPLETE
*
* Backup Blk 0: page for old bucket
* Backup Blk 1: page for new bucket
*/
typedef struct xl_hash_split_complete
{
uint16 old_bucket_flag;
uint16 new_bucket_flag;
} xl_hash_split_complete;
#define SizeOfHashSplitComplete \
(offsetof(xl_hash_split_complete, new_bucket_flag) + sizeof(uint16))
/*
* This is what we need to know about move page contents required during
* squeeze operation.
*
* This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS
*
* Backup Blk 0: bucket page
* Backup Blk 1: page containing moved tuples
* Backup Blk 2: page from which tuples will be removed
*/
typedef struct xl_hash_move_page_contents
{
uint16 ntups;
bool is_prim_bucket_same_wrt; /* true if the page to which
* tuples are moved is same as
* primary bucket page */
} xl_hash_move_page_contents;
#define SizeOfHashMovePageContents \
(offsetof(xl_hash_move_page_contents, is_prim_bucket_same_wrt) + sizeof(bool))
/*
* This is what we need to know about the squeeze page operation.
*
* This data record is used for XLOG_HASH_SQUEEZE_PAGE
*
* Backup Blk 0: page containing tuples moved from freed overflow page
* Backup Blk 1: freed overflow page
* Backup Blk 2: page previous to the freed overflow page
* Backup Blk 3: page next to the freed overflow page
* Backup Blk 4: bitmap page containing info of freed overflow page
* Backup Blk 5: meta page
*/
typedef struct xl_hash_squeeze_page
{
BlockNumber prevblkno;
BlockNumber nextblkno;
uint16 ntups;
bool is_prim_bucket_same_wrt; /* true if the page to which
* tuples are moved is same as
* primary bucket page */
bool is_prev_bucket_same_wrt; /* true if the page to which
* tuples are moved is the page
* previous to the freed overflow
* page */
} xl_hash_squeeze_page;
#define SizeOfHashSqueezePage \
(offsetof(xl_hash_squeeze_page, is_prev_bucket_same_wrt) + sizeof(bool))
/*
* This is what we need to know about the deletion of index tuples from a page.
*
* This data record is used for XLOG_HASH_DELETE
*
* Backup Blk 0: primary bucket page
* Backup Blk 1: page from which tuples are deleted
*/
typedef struct xl_hash_delete
{
bool clear_dead_marking; /* true if this operation clears
* LH_PAGE_HAS_DEAD_TUPLES flag */
bool is_primary_bucket_page; /* true if the operation is for
* primary bucket page */
} xl_hash_delete;
#define SizeOfHashDelete \
(offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool))
/*
* This is what we need for metapage update operation.
*
* This data record is used for XLOG_HASH_UPDATE_META_PAGE
*
* Backup Blk 0: meta page
*/
typedef struct xl_hash_update_meta_page
{
double ntuples;
} xl_hash_update_meta_page;
#define SizeOfHashUpdateMetaPage \
(offsetof(xl_hash_update_meta_page, ntuples) + sizeof(double))
/*
* This is what we need to initialize metapage.
*
* This data record is used for XLOG_HASH_INIT_META_PAGE
*
* Backup Blk 0: meta page
*/
typedef struct xl_hash_init_meta_page
{
double num_tuples;
RegProcedure procid;
uint16 ffactor;
} xl_hash_init_meta_page;
#define SizeOfHashInitMetaPage \
(offsetof(xl_hash_init_meta_page, ffactor) + sizeof(uint16))
/*
* This is what we need to initialize bitmap page.
*
* This data record is used for XLOG_HASH_INIT_BITMAP_PAGE
*
* Backup Blk 0: bitmap page
* Backup Blk 1: meta page
*/
typedef struct xl_hash_init_bitmap_page
{
uint16 bmsize;
} xl_hash_init_bitmap_page;
#define SizeOfHashInitBitmapPage \
(offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16))
/*
* This is what we need for index tuple deletion and to
* update the meta page.
*
* This data record is used for XLOG_HASH_VACUUM_ONE_PAGE
*
* Backup Blk 0: bucket page
* Backup Blk 1: meta page
*/
typedef struct xl_hash_vacuum_one_page
{
RelFileNode hnode;
int ntuples;
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_hash_vacuum_one_page;
#define SizeOfHashVacuumOnePage \
(offsetof(xl_hash_vacuum_one_page, ntuples) + sizeof(int))
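/*
* Added helper sketch (name is ours): the fixed-size struct above sits at the start of the
* record's main data, e.g. xldata = (xl_hash_vacuum_one_page *)XLogRecGetData(record), and
* the ntuples target offset numbers follow it immediately.
*/
static inline OffsetNumber *HashVacuumOnePageOffsets(xl_hash_vacuum_one_page *xldata)
{
return (OffsetNumber *)((char *)xldata + SizeOfHashVacuumOnePage);
}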
extern void hash_redo(XLogReaderState *record);
extern void hash_desc(StringInfo buf, XLogReaderState *record);
extern const char *hash_identify(uint8 info);
extern bool IsHashVacuumPages(XLogReaderState *record);
#endif /* HASH_XLOG_H */

View File

@ -131,6 +131,22 @@ typedef struct {
int offset; /* offset of field in result struct */
} relopt_parse_elt;
struct TableCreateSupport {
bool compressType;
bool compressLevel;
bool compressChunkSize;
bool compressPreAllocChunks;
bool compressByteConvert;
bool compressDiffConvert;
};
inline bool HasCompressOption(TableCreateSupport *tableCreateSupport)
{
return tableCreateSupport->compressLevel || tableCreateSupport->compressChunkSize ||
tableCreateSupport->compressPreAllocChunks || tableCreateSupport->compressByteConvert ||
tableCreateSupport->compressDiffConvert;
}
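/*
* Hypothetical caller sketch (added, shown as a comment because the helpers are declared
* further below in this header): collect the compression reloptions from a DefElem list,
* then let CheckCompressOption() reject inconsistent combinations, e.g. compression knobs
* set without a compresstype.
*
* TableCreateSupport support = {0};
* ListCell *cell = NULL;
* foreach (cell, options) {
*     SetOneOfCompressOption(((DefElem *)lfirst(cell))->defname, &support);
* }
* CheckCompressOption(&support);
*/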
/*
* The following are the table append modes currently supported.
* on: mark the table on-line scaleout mode, when it is set, later data write by append mode.
@ -285,5 +301,8 @@ extern void forbid_to_set_options_for_timeseries_tbl(List* options);
extern List* RemoveRelOption(List* options, const char* optName, bool* removed);
void RowTblCheckCompressionOption(List *options, int8 rowCompress = REL_CMPRS_PAGE_PLAIN);
void RowTblCheckHashBucketOption(List* options, StdRdOptions* std_opt);
void SetOneOfCompressOption(const char *defname, TableCreateSupport *tableCreateSupport);
void CheckCompressOption(TableCreateSupport *tableCreateSupport);
void ForbidUserToSetCompressedOptions(List *options);
#endif /* RELOPTIONS_H */

View File

@ -20,6 +20,7 @@
#include "storage/buf/block.h"
#include "storage/buf/buf.h"
#include "storage/buf/bufpage.h"
#include "storage/page_compression.h"
#include "storage/smgr/relfilenode.h"
struct XLogPhyBlock;

View File

@ -59,6 +59,7 @@ typedef void (*relasexlogreadstate)(void* record);
#define XLogBlockHeadGetForkNum(blockhead) ((blockhead)->forknum)
#define XLogBlockHeadGetBlockNum(blockhead) ((blockhead)->blkno)
#define XLogBlockHeadGetBucketId(blockhead) ((blockhead)->bucketNode)
#define XLogBlockHeadGetCompressOpt(blockhead) ((blockhead)->opt)
#define XLogBlockHeadGetValidInfo(blockhead) ((blockhead)->block_valid)
#define XLogBlockHeadGetPhysicalBlock(blockhead) ((blockhead)->pblk)
/* for common blockhead end */
@ -495,7 +496,8 @@ typedef struct {
TransactionId xl_xid; /* xact id */
Oid spcNode; /* tablespace */
Oid dbNode; /* database */
int4 bucketNode; /* bucket */
int2 bucketNode; /* bucket */
uint2 opt;
XLogPhyBlock pblk;
} XLogBlockHead;
@ -1002,6 +1004,47 @@ extern void UBTreeXlogUnlinkPageOperatorChildpage(RedoBufferInfo* cbuf, void* re
extern void UBTreeXlogClearIncompleteSplit(RedoBufferInfo* buffer);
void HashRedoInitMetaPageOperatorPage(RedoBufferInfo *metabuf, void *recorddata);
void HashRedoInitBitmapPageOperatorBitmapPage(RedoBufferInfo *bitmapbuf, void *recorddata);
void HashRedoInitBitmapPageOperatorMetaPage(RedoBufferInfo *metabuf);
void HashRedoInsertOperatorPage(RedoBufferInfo *buffer, void *recorddata, void *data, Size datalen);
void HashRedoInsertOperatorMetaPage(RedoBufferInfo *metabuf);
void HashRedoAddOvflPageOperatorOvflPage(RedoBufferInfo *ovflbuf, BlockNumber leftblk, void *data, Size datalen);
void HashRedoAddOvflPageOperatorLeftPage(RedoBufferInfo *ovflbuf, BlockNumber rightblk);
void HashRedoAddOvflPageOperatorMapPage(RedoBufferInfo *mapbuf, void *data);
void HashRedoAddOvflPageOperatorNewmapPage(RedoBufferInfo *newmapbuf, void *recorddata);
void HashRedoAddOvflPageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *data, Size datalen);
void HashRedoSplitAllocatePageOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata);
void HashRedoSplitAllocatePageOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata);
void HashRedoSplitAllocatePageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *blkdata);
void HashRedoSplitCompleteOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata);
void HashRedoSplitCompleteOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata);
void HashXlogMoveAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
void HashXlogMoveDeleteOvflPageOperatorPage(RedoBufferInfo *redobuffer, void *blkdata, Size len);
void HashXlogSqueezeAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
void HashXlogSqueezeInitOvflbufOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogSqueezeUpdatePrevPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogSqueezeUpdateNextPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogSqueezeUpdateBitmapOperatorPage(RedoBufferInfo *redobuffer, void *blkdata);
void HashXlogSqueezeUpdateMateOperatorPage(RedoBufferInfo *redobuffer, void *blkdata);
void HashXlogDeleteBlockOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
void HashXlogSplitCleanupOperatorPage(RedoBufferInfo *redobuffer);
void HashXlogUpdateMetaOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogVacuumOnePageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, Size len);
void HashXlogVacuumMateOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void XLogRecSetBlockCommonState(XLogReaderState* record, XLogBlockParseEnum blockvalid,
RelFileNodeForkNum filenode, XLogRecParseState* recordblockstate, XLogPhyBlock *pblk = NULL);
@ -1047,6 +1090,7 @@ extern void UBTreeRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* bl
extern void UBTree2RedoDataBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec,
RedoBufferInfo *bufferinfo);
extern void HashRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
XLogRecParseState* XactXlogCsnlogParseToBlock(XLogReaderState* record, uint32* blocknum, TransactionId xid,
int nsubxids, TransactionId* subxids, CommitSeqNo csn, XLogRecParseState* recordstatehead);
extern void XLogRecSetVmBlockState(XLogReaderState* record, uint32 blockid, XLogRecParseState* recordblockstate);
@ -1189,6 +1233,7 @@ extern void XLogBlockSegDdlDoRealAction(XLogBlockHead* blockhead, void* blockrec
extern void GinRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
extern void GistRedoDataBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, RedoBufferInfo *bufferinfo);
extern bool IsCheckPoint(const XLogRecParseState *parseState);
void redo_atomic_xlog_dispatch(uint8 opCode, RedoBufferInfo *redo_buf, const char *data);
void seg_redo_new_page_copy_and_flush(BufferTag *tag, char *data, XLogRecPtr lsn);

View File

@ -37,7 +37,8 @@
*/
#define XLR_SPECIAL_REL_UPDATE 0x01
#define XLR_BTREE_UPGRADE_FLAG 0x02
/* If xlog record is the compress table creation */
#define XLR_REL_COMPRESS 0X04
/* If xlog record is from toast page */
#define XLR_IS_TOAST 0X08
@ -84,7 +85,7 @@ typedef struct XLogRecordBlockHeader {
#define BKID_HAS_TDE_PAGE (0x40)
#define BKID_GET_BKID(id) (id & 0x3F)
/*
* In segment-page storage, RelFileNode and block number are logic for XLog. Thus, we need record
* physical location in xlog. This macro is used to check whether in such situation.
*/

View File

@ -80,12 +80,14 @@ extern Relation heap_create(const char *relname,
bool mapped_relation,
bool allow_system_table_mods,
int8 row_compress,
Datum reloptions,
Oid ownerid,
bool skip_create_storage,
TableAmType tam_type,
int8 relindexsplit = 0,
StorageType storage_type = HEAP_DISK,
bool newcbi = false);
bool newcbi = false,
Oid accessMethodObjectId = 0);
extern bool heap_is_matview_init_state(Relation rel);
@ -98,7 +100,9 @@ heapCreatePartition(const char* part_name,
Oid bucketOid,
Oid ownerid,
StorageType storage_type,
bool newcbi = false);
bool newcbi = false,
Datum reloptions = Datum(0));
extern Oid heap_create_with_catalog(const char *relname,
Oid relnamespace,
@ -120,7 +124,7 @@ extern Oid heap_create_with_catalog(const char *relname,
bool use_user_acl,
bool allow_system_table_mods,
PartitionState *partTableState,
int8 row_compress,
HashBucketInfo *bucketinfo,
bool record_dependce = true,
List* ceLst = NULL,
@ -200,7 +204,7 @@ extern void CheckAttributeType(const char *attname, Oid atttypid, Oid attcollati
#ifdef PGXC
/* Functions related to distribution data of relations */
extern void AddRelationDistribution(const char *relname, Oid relid, DistributeBy *distributeby,
PGXCSubCluster *subcluster, List *parentOids, TupleDesc descriptor, bool isinstallationgroup,
bool isbucket = false, int bucketmaplen = 0);
extern void GetRelationDistributionItems(Oid relid, DistributeBy *distributeby, TupleDesc descriptor, char *locatortype,
int *hashalgorithm, int *hashbuckets, AttrNumber *attnum);

View File

@ -20,6 +20,7 @@
#include "utils/tuplesort.h"
#define DEFAULT_INDEX_TYPE "btree"
#define DEFAULT_HASH_INDEX_TYPE "hash"
#define DEFAULT_CSTORE_INDEX_TYPE "psort"
#define DEFAULT_GIST_INDEX_TYPE "gist"
#define CSTORE_BTREE_INDEX_TYPE "cbtree"

View File

@ -38,11 +38,23 @@ typedef struct xl_smgr_create {
ForkNumber forkNum;
} xl_smgr_create;
typedef struct xl_smgr_create_compress {
xl_smgr_create xlrec;
uint2 pageCompressOpts;
} xl_smgr_create_compress;
typedef struct xl_smgr_truncate {
BlockNumber blkno;
RelFileNodeOld rnode;
} xl_smgr_truncate;
typedef struct xl_smgr_truncate_compress {
xl_smgr_truncate xlrec;
uint2 pageCompressOpts;
} xl_smgr_truncate_compress;
extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum);
extern void smgr_redo(XLogReaderState *record);

View File

@ -0,0 +1,2 @@
DROP FUNCTION IF EXISTS pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8) CASCADE;
DROP FUNCTION IF EXISTS pg_catalog.gs_read_block_from_remote(int4, int4, int4, int2, int2, int4, xid, int4, xid, boolean, int4) CASCADE;

View File

@ -0,0 +1,2 @@
DROP FUNCTION IF EXISTS pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8) CASCADE;
DROP FUNCTION IF EXISTS pg_catalog.gs_read_block_from_remote(int4, int4, int4, int2, int2, int4, xid, int4, xid, boolean, int4) CASCADE;

View File

@ -0,0 +1,23 @@
SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 4768;
CREATE OR REPLACE FUNCTION pg_catalog.gs_read_block_from_remote
( int4,
int4,
int4,
int2,
int2,
int4,
xid,
int4,
xid,
boolean,
int4)
RETURNS SETOF record LANGUAGE INTERNAL ROWS 1 STRICT as 'gs_read_block_from_remote_compress';
-- pg_read_binary_file_blocks()
--
SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 8413;
CREATE FUNCTION pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8,
OUT path text,
OUT blocknum int4,
OUT len int4,
OUT data bytea)
AS 'pg_read_binary_file_blocks' LANGUAGE INTERNAL IMMUTABLE STRICT;

View File

@ -0,0 +1,23 @@
SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 4768;
CREATE OR REPLACE FUNCTION pg_catalog.gs_read_block_from_remote
( int4,
int4,
int4,
int2,
int2,
int4,
xid,
int4,
xid,
boolean,
int4)
RETURNS SETOF record LANGUAGE INTERNAL ROWS 1 STRICT as 'gs_read_block_from_remote_compress';
-- pg_read_binary_file_blocks()
--
SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 8413;
CREATE FUNCTION pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8,
OUT path text,
OUT blocknum int4,
OUT len int4,
OUT data bytea)
AS 'pg_read_binary_file_blocks' LANGUAGE INTERNAL IMMUTABLE STRICT;

View File

@ -1199,6 +1199,7 @@ typedef struct knl_instance_context {
knl_g_archive_context archive_obs_cxt;
knl_g_archive_thread_info archive_thread_info;
struct HTAB* ngroup_hash_table;
struct HTAB* mmapCache;
knl_g_hypo_context hypo_cxt;
knl_g_segment_context segment_cxt;

View File

@ -88,6 +88,7 @@ extern const uint32 SUPPORT_DATA_REPAIR;
extern const uint32 SCAN_BATCH_MODE_VERSION_NUM;
extern const uint32 PUBLICATION_VERSION_NUM;
extern const uint32 ANALYZER_HOOK_VERSION_NUM;
extern const uint32 SUPPORT_HASH_XLOG_VERSION_NUM;
extern void register_backend_version(uint32 backend_version);
extern bool contain_backend_version(uint32 version_number);

View File

@ -1336,6 +1336,8 @@ typedef enum WaitEventIO {
WAIT_EVENT_OBS_READ,
WAIT_EVENT_OBS_WRITE,
WAIT_EVENT_LOGCTRL_SLEEP,
WAIT_EVENT_COMPRESS_ADDRESS_FILE_FLUSH,
WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC,
IO_EVENT_NUM = WAIT_EVENT_LOGCTRL_SLEEP - WAIT_EVENT_BUFFILE_READ + 1 // MUST be last, DO NOT use this value.
} WaitEventIO;

View File

@ -96,6 +96,13 @@ typedef struct buftag {
BlockNumber blockNum; /* blknum relative to begin of reln */
} BufferTag;
typedef struct buftagnocompress {
RelFileNodeV2 rnode;
ForkNumber forkNum;
BlockNumber blockNum; /* blknum relative to begin of reln */
} BufferTagSecondVer;
typedef struct buftagnohbkt {
RelFileNodeOld rnode; /* physical relation identifier */
ForkNumber forkNum;

View File

@ -325,6 +325,7 @@ extern bool ConditionalLockBuffer(Buffer buffer);
extern void LockBufferForCleanup(Buffer buffer);
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
extern bool ConditionalLockUHeapBufferForCleanup(Buffer buffer);
extern bool IsBufferCleanupOK(Buffer buffer);
extern bool HoldingBufferPinThatDelaysRecovery(void);
extern void AsyncUnpinBuffer(volatile void* bufHdr, bool forgetBuffer);
extern void AsyncCompltrPinBuffer(volatile void* bufHdr);

Some files were not shown because too many files have changed in this diff