forked from huawei/openGauss-server
row compression & hash index
parent f9fc8c0d68
commit 15e3a99805
@@ -2,7 +2,7 @@
# pagehack
AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR} TGT_pagehack_SRC)
set(TGT_pagehack_INC
    ${TGT_pq_INC} ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SRC_DIR}/lib/gstrace
    ${TGT_pq_INC} ${ZSTD_INCLUDE_PATH} ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SRC_DIR}/lib/gstrace
)

set(pagehack_DEF_OPTIONS ${MACRO_OPTIONS})

@@ -11,12 +13,13 @@ if(${ENABLE_DEBUG} STREQUAL "ON")
endif()
set(pagehack_COMPILE_OPTIONS ${OS_OPTIONS} ${PROTECT_OPTIONS} ${WARNING_OPTIONS} ${CHECK_OPTIONS} ${BIN_SECURE_OPTIONS} ${OPTIMIZE_OPTIONS})
set(pagehack_LINK_OPTIONS ${BIN_LINK_OPTIONS})
set(pagehack_LINK_LIBS -lpgport -lcrypt -ldl -lm -ledit -lssl -lcrypto -l${SECURE_C_CHECK} -lrt -lz -lminiunz)
set(pagehack_LINK_LIBS -lpgport -lcrypt -ldl -lm -ledit -lssl -lcrypto -lsecurec -lrt -lz -lminiunz -lzstd)
add_bintarget(pagehack TGT_pagehack_SRC TGT_pagehack_INC "${pagehack_DEF_OPTIONS}" "${pagehack_COMPILE_OPTIONS}" "${pagehack_LINK_OPTIONS}" "${pagehack_LINK_LIBS}")
add_dependencies(pagehack pgport_static)
target_link_directories(pagehack PUBLIC
    ${LIBOPENSSL_LIB_PATH} ${PROTOBUF_LIB_PATH} ${LIBPARQUET_LIB_PATH} ${LIBCURL_LIB_PATH} ${SECURE_LIB_PATH}
    ${ZLIB_LIB_PATH} ${LIBOBS_LIB_PATH} ${LIBEDIT_LIB_PATH} ${LIBCGROUP_LIB_PATH} ${CMAKE_BINARY_DIR}/lib
    ${ZSTD_LIB_PATH}
)
install(TARGETS pagehack RUNTIME DESTINATION bin)
@@ -1,6 +1,6 @@
# contrib/pagehack/Makefile
MODULE_big = pagehack
OBJS = pagehack.o
OBJS = openGaussCompression.o pagehack.o

# executable program, even there is no database server/client
PROGRAM = pagehack

@@ -13,7 +13,7 @@ else
subdir = contrib/pagehack
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
enable_shared = false
override CFLAGS += -lzstd

ifeq ($(enable_debug), yes)
PG_CPPFLAGS += -DDEBUG
File diff suppressed because it is too large.
@@ -0,0 +1,177 @@
/*
 * Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved.
 */

#include "openGaussCompression.h"
#include "storage/checksum_impl.h"
#include "storage/page_compression_impl.h"

void OpenGaussCompression::SetFilePath(const char *filePath, int segNo)
{
    int rc = snprintf_s(pcaFilePath, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, filePath);
    securec_check_ss_c(rc, "\0", "\0");
    rc = snprintf_s(pcdFilePath, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, filePath);
    securec_check_ss_c(rc, "\0", "\0");

    this->segmentNo = segNo;
}

OpenGaussCompression::~OpenGaussCompression()
{
    if (pcaFd != nullptr) {
        fclose(pcaFd);
    }
    if (pcdFd != nullptr) {
        fclose(pcdFd);
    }
    if (header != nullptr) {
        pc_munmap(header);
    }
}

bool OpenGaussCompression::TryOpen()
{
    if ((pcaFd = fopen(this->pcaFilePath, "rb+")) == nullptr) {
        return false;
    }
    if ((pcdFd = fopen(this->pcdFilePath, "rb+")) == nullptr) {
        return false;
    }
    if (fseeko(pcaFd, (off_t)offsetof(PageCompressHeader, chunk_size), SEEK_SET) != 0) {
        return false;
    }
    if (fread(&chunkSize, sizeof(chunkSize), 1, this->pcaFd) <= 0) {
        return false;
    }
    header = pc_mmap(fileno(pcaFd), chunkSize, false);
    return true;
}
bool OpenGaussCompression::ReadChunkOfBlock(char *dst, size_t *dstLen, BlockNumber blockNumber)
{
    auto currentAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber);
    do {
        auto chunkNum = currentAddr->nchunks;
        for (uint8 i = 0; i < chunkNum; i++) {
            off_t seekPos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, currentAddr->chunknos[i]);
            uint8 start = i;
            while (i < chunkNum - 1 && currentAddr->chunknos[i + 1] == currentAddr->chunknos[i] + 1) {
                i++;
            }
            if (fseeko(this->pcdFd, seekPos, SEEK_SET) != 0) {
                return false;
            }
            size_t readAmount = chunkSize * (i - start + 1);
            if (fread(dst + start * chunkSize, 1, readAmount, this->pcdFd) != readAmount && ferror(this->pcdFd)) {
                return false;
            }
            *dstLen += readAmount;
        }
        if (chunkNum == 0 || DecompressPage(dst, decompressedBuffer, header->algorithm) == BLCKSZ) {
            break;
        }
    } while (true);
    if (PageIs8BXidHeapVersion(dst)) {
        byteConvert = ((HeapPageCompressData *)dst)->byte_convert;
        diffConvert = ((HeapPageCompressData *)dst)->diff_convert;
    } else {
        byteConvert = ((PageCompressData *)dst)->byte_convert;
        diffConvert = ((PageCompressData *)dst)->diff_convert;
    }
    this->blockNumber = blockNumber;
    return true;
}

bool OpenGaussCompression::WriteBackCompressedData(char *source, size_t sourceLen, BlockNumber blockNumber)
{
    auto currentAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber);
    for (size_t i = 0; i < currentAddr->nchunks; ++i) {
        off_t seekPos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, currentAddr->chunknos[i]);
        if (fseeko(this->pcdFd, seekPos, SEEK_SET) != 0) {
            return false;
        }
        Assert(sourceLen >= i * chunkSize);
        auto writeCount = fwrite(source + i * chunkSize, 1, chunkSize, this->pcdFd);
        bool success = chunkSize == writeCount;
        if (!success) {
            return false;
        }
    }
    fflush(this->pcdFd);
    return true;
}

void OpenGaussCompression::MarkUncompressedDirty()
{
    constexpr int writeLen = BLCKSZ / 2;
    unsigned char fill_byte[writeLen] = {0xFF};
    for (int i = 0; i < writeLen; i++)
        fill_byte[i] = 0xFF;
    auto rc = memcpy_s(decompressedBuffer + writeLen, BLCKSZ - writeLen, fill_byte, writeLen);
    securec_check(rc, "", "");
}

BlockNumber OpenGaussCompression::GetMaxBlockNumber()
{
    return (BlockNumber)pg_atomic_read_u32(&header->nblocks);
}

char *OpenGaussCompression::GetPcdFilePath()
{
    return this->pcdFilePath;
}

char *OpenGaussCompression::GetDecompressedPage()
{
    return this->decompressedBuffer;
}

bool OpenGaussCompression::WriteBackUncompressedData()
{
    auto algorithm = header->algorithm;
    auto workBufferSize = CompressPageBufferBound(decompressedBuffer, algorithm);
    if (workBufferSize < 0) {
        return false;
    }
    char *work_buffer = (char *)malloc(workBufferSize);
    RelFileCompressOption relFileCompressOption;
    relFileCompressOption.compressPreallocChunks = 0;
    relFileCompressOption.compressLevelSymbol = true;
    relFileCompressOption.compressLevel = 1;
    relFileCompressOption.compressAlgorithm = algorithm;
    relFileCompressOption.byteConvert = byteConvert;
    relFileCompressOption.diffConvert = diffConvert;

    auto compress_buffer_size = CompressPage(decompressedBuffer, work_buffer, workBufferSize, relFileCompressOption);
    if (compress_buffer_size < 0) {
        return false;
    }
    uint8 nchunks = (compress_buffer_size - 1) / chunkSize + 1;
    auto bufferSize = chunkSize * nchunks;
    if (bufferSize >= BLCKSZ) {
        /* store original page if can not save space? */
        free(work_buffer);
        work_buffer = (char *)decompressedBuffer;
        nchunks = BLCKSZ / chunkSize;
    } else {
        /* fill zero in the last chunk */
        if (compress_buffer_size < bufferSize) {
            auto leftSize = bufferSize - compress_buffer_size;
            errno_t rc = memset_s(work_buffer + compress_buffer_size, leftSize, 0, leftSize);
            securec_check(rc, "", "");
        }
    }
    uint8 need_chunks = nchunks;
    PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber);
    if (pcAddr->allocated_chunks < need_chunks) {
        auto chunkno = pg_atomic_fetch_add_u32(&header->allocated_chunks, need_chunks - pcAddr->allocated_chunks);
        for (uint8 i = pcAddr->allocated_chunks; i < need_chunks; ++i) {
            pcAddr->chunknos[i] = ++chunkno;
        }
        pcAddr->allocated_chunks = need_chunks;
        pcAddr->nchunks = need_chunks;
    }
    return this->WriteBackCompressedData(work_buffer, compress_buffer_size, blockNumber);
}

#include "compression_algorithm.ini"
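Note (not part of the commit): a minimal usage sketch of the helper class added above, mirroring the call sequence that parse_page_file() uses later in this diff; the relation path and segment number are made-up placeholders.

    // Hypothetical driver for OpenGaussCompression, assuming a compressed
    // relation file "base/16384/24576" with its _pca/_pcd companions.
    OpenGaussCompression compressor;
    compressor.SetFilePath("base/16384/24576", /* segNo */ 0);
    if (compressor.TryOpen()) {
        char chunkBuffer[BLCKSZ];
        size_t chunkLen = 0;
        BlockNumber nblocks = compressor.GetMaxBlockNumber();
        for (BlockNumber blk = 0; blk < nblocks; blk++) {
            if (compressor.ReadChunkOfBlock(chunkBuffer, &chunkLen, blk)) {
                // the decompressed 8K page is now available for parsing
                char *page = compressor.GetDecompressedPage();
                (void)page;
            }
        }
    }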
@@ -0,0 +1,40 @@
#ifndef OPENGAUSS_SERVER_OPENGAUSSCOMPRESSION_H
#define OPENGAUSS_SERVER_OPENGAUSSCOMPRESSION_H
#define FRONTEND 1

#include <stdio.h>
#include "c.h"
#include "storage/buf/block.h"
#include "storage/page_compression.h"

class OpenGaussCompression {
private:
    FILE* pcaFd = nullptr;
    FILE* pcdFd = nullptr;
    char pcaFilePath[MAXPGPATH];
    char pcdFilePath[MAXPGPATH];
    PageCompressHeader* header = nullptr;

private:
    int segmentNo;
    BlockNumber blockNumber;
    decltype(PageCompressHeader::chunk_size) chunkSize;
    char decompressedBuffer[BLCKSZ];
    bool byteConvert;
    bool diffConvert;

public:
    void SetFilePath(const char* filePath, int segNo);
    virtual ~OpenGaussCompression();
    bool TryOpen();
    bool ReadChunkOfBlock(char* dst, size_t* dstLen, BlockNumber blockNumber);
    bool WriteBackCompressedData(char* source, size_t sourceLen, BlockNumber blockNumber);
    bool WriteBackUncompressedData();
    void MarkUncompressedDirty();
    BlockNumber GetMaxBlockNumber();
    char* GetPcdFilePath();
    char* GetDecompressedPage();
};

#endif  // OPENGAUSS_SERVER_OPENGAUSSCOMPRESSION_H
@@ -91,6 +91,9 @@
#include "tsdb/utils/constant_def.h"
#endif

#include "openGaussCompression.h"

/* Max number of pg_class oid, currently about 4000 */
#define MAX_PG_CLASS_ID 10000
/* Number of pg_class types */

@@ -139,6 +142,7 @@ static const char* PgHeapRelName[] = {"pg_class",
    "pg_am",
    "pg_statistic",
    "pg_toast"};
typedef enum SegmentType { SEG_HEAP, SEG_FSM, SEG_UHEAP, SEG_INDEX_BTREE, SEG_UNDO, SEG_UNKNOWN } SegmentType;

static void ParsePgClassTupleData(binary tupdata, int len, binary nullBitmap, int natrrs);
static void ParsePgIndexTupleData(binary tupdata, int len, binary nullBitmap, int nattrs);

@@ -156,6 +160,8 @@ static void ParseToastTupleData(binary tupdata, int len, binary nullBitmap, int
static void ParseTDSlot(const char *page);

static void ParseToastIndexTupleData(binary tupdata, int len, binary nullBitmap, int nattrs);
static int parse_uncompressed_page_file(const char *filename, SegmentType type, const uint32 start_point,
    const uint32 number_read);

static ParseHeapTupleData PgHeapRelTupleParser[] = {
    ParsePgClassTupleData,  // pg_class

@@ -899,8 +905,6 @@ static const char* HACKINGTYPE[] = {"heap",
    "segment"
};

typedef enum SegmentType { SEG_HEAP, SEG_FSM, SEG_UHEAP, SEG_INDEX_BTREE, SEG_UNDO, SEG_UNKNOWN } SegmentType;

const char* PageTypeNames[] = {"DATA", "FSM", "VM"};

#define GETHEAPSTRUCT(TUP) ((unsigned char*)(TUP) + (TUP)->t_hoff)
@@ -3145,7 +3149,78 @@ static int parse_a_page(const char* buffer, int blkno, int blknum, SegmentType t
    return true;
}

static BlockNumber CalculateMaxBlockNumber(BlockNumber blknum, BlockNumber start, BlockNumber number)
{
    /* parse */
    if (start >= blknum) {
        fprintf(stderr, "start point exceeds the total block number of relation.\n");
        return InvalidBlockNumber;
    } else if ((start + number) > blknum) {
        fprintf(stderr, "don't have %d blocks from block %d in the relation, only %d blocks\n", number, start,
            (blknum - start));
        number = blknum;
    } else if (number == 0) {
        number = blknum;
    } else {
        number += start;
    }
    return number;
}

static int parse_page_file(const char* filename, SegmentType type, const uint32 start_point, const uint32 number_read)
{
    if (type != SEG_HEAP && type != SEG_INDEX_BTREE) {
        return parse_uncompressed_page_file(filename, type, start_point, number_read);
    }

    auto openGaussCompression = new OpenGaussCompression();
    openGaussCompression->SetFilePath(filename, SegNo);
    bool success = openGaussCompression->TryOpen();
    if (!success) {
        delete openGaussCompression;
        return parse_uncompressed_page_file(filename, type, start_point, number_read);
    }

    BlockNumber start = start_point;
    BlockNumber blknum = openGaussCompression->GetMaxBlockNumber();
    BlockNumber number = CalculateMaxBlockNumber(blknum, start, number_read);
    if (number == InvalidBlockNumber) {
        delete openGaussCompression;
        return false;
    }
    char compressed[BLCKSZ];
    size_t compressedLen;
    while (start < number) {
        if (!openGaussCompression->ReadChunkOfBlock(compressed, &compressedLen, start)) {
            fprintf(stderr, "read block %d failed, filename: %s: %s\n", start, openGaussCompression->GetPcdFilePath(),
                strerror(errno));
            delete openGaussCompression;
            return false;
        }
        if (!parse_a_page(openGaussCompression->GetDecompressedPage(), start, blknum, type)) {
            fprintf(stderr, "Error during parsing block %d/%d\n", start, blknum);
            delete openGaussCompression;
            return false;
        }
        if ((write_back && num_item) || dirty_page) {
            if (dirty_page) {
                openGaussCompression->MarkUncompressedDirty();
            }
            if (!openGaussCompression->WriteBackUncompressedData()) {
                fprintf(stderr, "write back failed, filename: %s: %s\n", openGaussCompression->GetPcdFilePath(),
                    strerror(errno));
                delete openGaussCompression;
                return false;
            }
        }
        start++;
    }
    delete openGaussCompression;
    return true;
}

static int parse_uncompressed_page_file(const char *filename, SegmentType type, const uint32 start_point,
    const uint32 number_read)
{
    char buffer[BLCKSZ];
    FILE* fd = NULL;

@@ -3173,9 +3248,8 @@ static int parse_page_file(const char* filename, SegmentType type, const uint32
    blknum = size / BLCKSZ;

    /* parse */
    if (start >= blknum) {
        fprintf(stderr, "start point exceeds the total block number of relation.\n");
        fclose(fd);
    number = CalculateMaxBlockNumber(blknum, start, number);
    if (number == InvalidBlockNumber) {
        return false;
    } else if ((start + number) > blknum) {
        fprintf(stderr,
@@ -13,6 +13,7 @@
#include "access/gin.h"
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/nbtree.h"
@@ -366,7 +366,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b
    Page page;
    OffsetNumber maxoff;

    _hash_getlock(rel, blkno, HASH_SHARE);
    buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy);
    page = BufferGetPage(buf);

@@ -393,7 +392,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b
    }

    _hash_relbuf(rel, buf);
    _hash_droplock(rel, blkno, HASH_SHARE);
}

/*
@@ -761,7 +761,7 @@ else # not PGXS
endif
endif

override CPPFLAGS := $(CPPFLAGS) -I$(LIBODBC_INCLUDE_PATH) -I$(LIBOBS_INCLUDE_PATH) -I$(LIBCGROUP_INCLUDE_PATH) -I$(LIBOPENSSL_INCLUDE_PATH) -I${LIBORC_INCLUDE_PATH} -I${LIBPARQUET_INCLUDE_PATH} -I${PROTOBUF_INCLUDE_PATH} -I${BOOST_INCLUDE_PATH} -I$(LIBLLVM_INCLUDE_PATH) -I$(KERBEROS_INCLUDE_PATH) -I$(CJSON_INCLUDE_PATH) -I$(NUMA_INCLUDE_PATH) -I$(ZLIB_INCLUDE_PATH) -I$(LZ4_INCLUDE_PATH) -I$(LIBCURL_INCLUDE_PATH) -I$(DCF_INCLUDE_PATH)
override CPPFLAGS := $(CPPFLAGS) -I$(LIBODBC_INCLUDE_PATH) -I$(LIBOBS_INCLUDE_PATH) -I$(LIBCGROUP_INCLUDE_PATH) -I$(LIBOPENSSL_INCLUDE_PATH) -I${LIBORC_INCLUDE_PATH} -I${LIBPARQUET_INCLUDE_PATH} -I${PROTOBUF_INCLUDE_PATH} -I${BOOST_INCLUDE_PATH} -I$(LIBLLVM_INCLUDE_PATH) -I$(KERBEROS_INCLUDE_PATH) -I$(CJSON_INCLUDE_PATH) -I$(NUMA_INCLUDE_PATH) -I$(ZLIB_INCLUDE_PATH) -I$(LZ4_INCLUDE_PATH) -I$(LIBCURL_INCLUDE_PATH) -I$(DCF_INCLUDE_PATH) -I$(ZSTD_INCLUDE_PATH)

# GDS links to libevent
ifeq ($(enable_multiple_nodes), yes)

@@ -895,6 +895,9 @@ endif
# append zlib for compression: zlib
LDFLAGS += -L$(ZLIB_LIB_PATH) -I$(ZLIB_INCLUDE_PATH)

# append zstd for compression: zstd
LDFLAGS += -L$(ZSTD_LIB_PATH) -I$(ZSTD_INCLUDE_PATH)

LDFLAGS += -L$(SECURE_LIB_PATH)
LDFLAGS += -L$(LIBOPENSSL_LIB_PATH)
LDFLAGS += -L$(LIBSTD_LIB_PATH)
@@ -5,6 +5,7 @@ set(TGT_rewind_SRC ${CMAKE_CURRENT_SOURCE_DIR}/datapagemap.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/filemap.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/file_ops.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/logging.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/compressed_rewind.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/parsexlog.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/pg_rewind.cpp
)

@@ -15,7 +16,8 @@ set(TGT_rewind_INC
    ${PROJECT_SRC_DIR}/common/port
    ${PROJECT_SRC_DIR}/common/interfaces/libpq
    ${PROJECT_SRC_DIR}/include/libpq
    ${LIBOPENSSL_INCLUDE_PATH}
    ${LIBOPENSSL_INCLUDE_PATH}
    ${ZSTD_INCLUDE_PATH}
)

set(rewind_DEF_OPTIONS ${MACRO_OPTIONS})
@@ -26,7 +26,7 @@ ifneq "$(MAKECMDGOALS)" "clean"
endif
endif
endif
OBJS = file_ops.o datapagemap.o fetch.o filemap.o logging.o parsexlog.o pg_rewind.o
OBJS = file_ops.o datapagemap.o fetch.o filemap.o logging.o parsexlog.o pg_rewind.o compressed_rewind.o

#all:gs_rewind.a
@@ -0,0 +1,46 @@
/* -------------------------------------------------------------------------
 *
 * compressed_common.h
 *
 * Copyright (c) 2021 Huawei Technologies Co.,Ltd.
 *
 * -------------------------------------------------------------------------
 */
#ifndef OPENGAUSS_SERVER_COMPRESS_COMPRESSED_COMMON_H
#define OPENGAUSS_SERVER_COMPRESS_COMPRESSED_COMMON_H

#include "utils/atomic.h"

struct RewindCompressInfo {
    bool compressed = false;    /* compressed table or not */
    uint32 oldBlockNumber = 0;
    uint32 newBlockNumber = 0;
    uint8 algorithm = 0;        /* compressed algorithm */
    uint16 chunkSize = 0;       /* compressed chunk size */
};

struct CompressedPcaInfo {
    char *pcaMap = NULL;
    int pcaFd = -1;
    char path[MAXPGPATH];
    int32 chunkSize = 0;
    int32 algorithm = 0;
};

#define COPY_REWIND_COMPRESS_INFO(entry, infoPointer, oldBlock, newBlock)   \
    (entry)->rewindCompressInfo.oldBlockNumber = 0;                         \
    (entry)->rewindCompressInfo.newBlockNumber = 0;                         \
    (entry)->rewindCompressInfo.compressed = false;                         \
    (entry)->rewindCompressInfo.algorithm = 0;                              \
    (entry)->rewindCompressInfo.chunkSize = 0;                              \
    if ((infoPointer) != NULL && (infoPointer)->compressed) {               \
        (entry)->rewindCompressInfo.oldBlockNumber = (oldBlock);            \
        (entry)->rewindCompressInfo.newBlockNumber = (newBlock);            \
        (entry)->rewindCompressInfo.compressed = true;                      \
        (entry)->rewindCompressInfo.algorithm = (infoPointer)->algorithm;   \
        (entry)->rewindCompressInfo.chunkSize = (infoPointer)->chunkSize;   \
    }

#endif  // OPENGAUSS_SERVER_COMPRESS_COMPRESSED_COMMON_H
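Note (not part of the commit): a hedged sketch of how COPY_REWIND_COMPRESS_INFO is meant to be used when a file-map entry is built; the same pattern appears in filemap.cpp later in this diff. nextEntry() and maybeReadPca() are hypothetical helpers used only for illustration.

    // Copy compression metadata into a file-map entry. If `info` is NULL or the
    // relation is not compressed, the entry keeps "uncompressed" defaults;
    // otherwise block counts, algorithm and chunk size are taken from `info`.
    file_entry_t *entry = nextEntry();                     // hypothetical helper
    RewindCompressInfo *info = maybeReadPca(entry->path);  // hypothetical helper
    COPY_REWIND_COMPRESS_INFO(entry, info,
                              info == NULL ? 0 : info->oldBlockNumber,
                              info == NULL ? 0 : info->newBlockNumber)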
@@ -0,0 +1,129 @@
/*
 * Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved.
 *
 * openGauss is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *          http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 * -------------------------------------------------------------------------
 *
 * compressed_rewind.cpp
 *    Functions for fetching compressed table.
 *
 *
 * IDENTIFICATION
 *    ./src/bin/pg_rewind/compressed_rewind.cpp
 *
 * -------------------------------------------------------------------------
 */
#include "compressed_rewind.h"
#include "libpq/libpq-fe.h"
#include "lib/string.h"
#include "logging.h"
#include "filemap.h"
#include "utils/elog.h"
#include "file_ops.h"

void FormatPathToPca(const char* path, char* dst, size_t len, bool withPrefix)
{
    errno_t rc;
    if (withPrefix) {
        rc = snprintf_s(dst, len, len - 1, "%s/" PCA_SUFFIX, pg_data, path);
    } else {
        rc = snprintf_s(dst, len, len - 1, PCA_SUFFIX, path);
    }
    securec_check_ss_c(rc, "\0", "\0");
}

void FormatPathToPcd(const char* path, char* dst, size_t len, bool withPrefix)
{
    errno_t rc;
    if (withPrefix) {
        rc = snprintf_s(dst, len, len - 1, "%s/" PCD_SUFFIX, pg_data, path);
    } else {
        rc = snprintf_s(dst, len, len - 1, PCD_SUFFIX, path);
    }
    securec_check_ss_c(rc, "\0", "\0");
}

template <typename T>
bool ReadCompressedInfo(T& t, off_t offset, FILE* file, char* pcaFilePath, size_t len)
{
    if (fseeko(file, offset, SEEK_SET) != 0) {
        pg_fatal("could not seek in file \"%s\": \"%lu\": %s\n", pcaFilePath, len, strerror(errno));
        return false;
    }
    if (fread(&t, sizeof(t), 1, file) <= 0) {
        pg_fatal("could not open file \"%s\": \"%lu\": %s\n", pcaFilePath, len, strerror(errno));
        return false;
    }
    return true;
}

/**
 * write RewindCompressInfo
 * @param file file fp
 * @param pcaFilePath file path,for ereport
 * @param rewindCompressInfo pointer of return
 * @return sucesss or not
 */
static bool ReadRewindCompressedInfo(FILE* file, char* pcaFilePath, size_t len, RewindCompressInfo* rewindCompressInfo)
{
    off_t offset = (off_t)offsetof(PageCompressHeader, chunk_size);
    if (!ReadCompressedInfo(rewindCompressInfo->chunkSize, offset, file, pcaFilePath, len)) {
        return false;
    }
    offset = (off_t)offsetof(PageCompressHeader, algorithm);
    if (!ReadCompressedInfo(rewindCompressInfo->algorithm, offset, file, pcaFilePath, len)) {
        return false;
    }
    offset = (off_t)offsetof(PageCompressHeader, nblocks);
    if (!ReadCompressedInfo(rewindCompressInfo->oldBlockNumber, offset, file, pcaFilePath, len)) {
        return false;
    }
    rewindCompressInfo->compressed = true;
    return true;
}

bool FetchSourcePca(const char* strValue, RewindCompressInfo* rewindCompressInfo)
{
    size_t length = 0;
    PageCompressHeader* ptr = (PageCompressHeader*)PQunescapeBytea((const unsigned char*)strValue, &length);
    rewindCompressInfo->compressed = false;
    if (length == sizeof(PageCompressHeader)) {
        rewindCompressInfo->compressed = true;
        rewindCompressInfo->algorithm = ptr->algorithm;
        rewindCompressInfo->newBlockNumber = ptr->nblocks;
        rewindCompressInfo->oldBlockNumber = 0;
        rewindCompressInfo->chunkSize = ptr->chunk_size;
    }
    PQfreemem(ptr);
    return rewindCompressInfo->compressed;
}

bool ProcessLocalPca(const char* tablePath, RewindCompressInfo* rewindCompressInfo)
{
    rewindCompressInfo->compressed = false;
    if (!isRelDataFile(tablePath)) {
        return false;
    }
    char pcaFilePath[MAXPGPATH];
    FormatPathToPca(tablePath, pcaFilePath, MAXPGPATH, true);
    FILE* file = fopen(pcaFilePath, "rb");
    if (file == NULL) {
        if (errno == ENOENT) {
            return false;
        }
        pg_fatal("could not open file \"%s\": %s\n", pcaFilePath, strerror(errno));
        return false;
    }
    bool success = ReadRewindCompressedInfo(file, pcaFilePath, MAXPGPATH, rewindCompressInfo);
    fclose(file);
    return success;
}
@@ -0,0 +1,21 @@
/* -------------------------------------------------------------------------
 *
 * compressed_rewind.h
 *
 * Copyright (c) 2021 Huawei Technologies Co.,Ltd.
 *
 * -------------------------------------------------------------------------
 */
#ifndef OPENGAUSS_SERVER_COMPRESS_COMPRESSED_REWIND_H
#define OPENGAUSS_SERVER_COMPRESS_COMPRESSED_REWIND_H

#include "compressed_common.h"
#include "storage/page_compression.h"
#include "storage/smgr/relfilenode.h"

extern bool FetchSourcePca(const char* strValue, RewindCompressInfo* rewindCompressInfo);
extern bool ProcessLocalPca(const char* tablePath, RewindCompressInfo* rewindCompressInfo);
extern void FormatPathToPca(const char* path, char* dst, size_t len, bool withPrefix = false);
extern void FormatPathToPcd(const char* path, char* dst, size_t len, bool withPrefix = false);

#endif  // OPENGAUSS_SERVER_COMPRESS_COMPRESSED_REWIND_H
|
|||
#include "libpq/libpq-fe.h"
|
||||
#include "libpq/libpq-int.h"
|
||||
#include "common/fe_memutils.h"
|
||||
#include "compressed_rewind.h"
|
||||
#include "catalog/catalog.h"
|
||||
#include "catalog/pg_type.h"
|
||||
|
||||
|
@ -47,11 +48,11 @@ const uint64 MAX_FILE_SIZE = 0xFFFFFFFF;
|
|||
#define MAX_PARAM_LEN 1024
|
||||
|
||||
static BuildErrorCode receiveFileChunks(const char* sql, FILE* file);
|
||||
static BuildErrorCode execute_pagemap(datapagemap_t* pagemap, const char* path, FILE* file);
|
||||
static BuildErrorCode execute_pagemap(file_entry_t* entry, FILE* file);
|
||||
static char* run_simple_query(const char* sql);
|
||||
static BuildErrorCode recurse_dir(const char* datadir, const char* path, process_file_callback_t callback);
|
||||
static void get_slot_name_by_app_name(void);
|
||||
|
||||
static BuildErrorCode CheckResultSet(PGresult* pgResult);
|
||||
BuildErrorCode libpqConnect(const char* connstr)
|
||||
{
|
||||
PGresult* res = NULL;
|
||||
|
@ -254,10 +255,22 @@ BuildErrorCode fetchSourceFileList()
|
|||
* general, so if the admin has put any custom symbolic links in the data
|
||||
* directory, they won't be copied correctly.
|
||||
*/
|
||||
sql = "SELECT path, size, isdir, pg_tablespace_location(pg_tablespace.oid) AS link_target \n"
|
||||
/* skip pca/pcd files and concat pca with table file */
|
||||
sql = "WITH tmp_table AS (\n"
|
||||
"SELECT path, size, isdir, pg_tablespace_location(pg_tablespace.oid) AS link_target \n"
|
||||
"FROM (SELECT * FROM pg_stat_file_recursive('.')) AS files \n"
|
||||
"LEFT OUTER JOIN pg_tablespace ON files.path like 'pg_tblspc/%' AND oid::text = files.filename\n";
|
||||
res = PQexec(conn, sql);
|
||||
"LEFT OUTER JOIN pg_tablespace ON files.path ~ '^pg_tblspc/' AND oid :: text = files.filename\n"
|
||||
"),compressed_address AS (SELECT path pca_path, substr(path, 0, length(path) - 4) AS table_path\n"
|
||||
"FROM pg_stat_file_recursive('.') WHERE path ~ '_pca$' AND length(path) > 4)\n"
|
||||
"SELECT path, size, isdir, link_target,\n"
|
||||
"CASE WHEN pca_path IS NOT NULL THEN pg_read_binary_file(pca_path, 0, %d, true)\n"
|
||||
"ELSE NULL END AS pchdr\n"
|
||||
"FROM tmp_table LEFT JOIN compressed_address\n"
|
||||
"ON tmp_table.path = compressed_address.table_path\nWHERE path !~ '_pca$' AND path !~ '_pcd$'\n";
|
||||
char sqlbuf[1024];
|
||||
int rc = snprintf_s(sqlbuf, sizeof(sqlbuf), sizeof(sqlbuf) - 1, sql, SIZE_OF_PAGE_COMPRESS_HEADER_DATA);
|
||||
securec_check_ss_c(rc, "\0", "\0");
|
||||
res = PQexec(conn, (const char*)sqlbuf);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK) {
|
||||
pg_log(PG_ERROR, "could not fetch file list: %s", PQresultErrorMessage(res));
|
||||
|
@ -265,7 +278,7 @@ BuildErrorCode fetchSourceFileList()
|
|||
}
|
||||
|
||||
/* sanity check the result set */
|
||||
if (PQnfields(res) != 4) {
|
||||
if (PQnfields(res) != 5) {
|
||||
pg_fatal("unexpected result set while fetching file list\n");
|
||||
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
|
||||
}
|
||||
|
@ -308,7 +321,13 @@ BuildErrorCode fetchSourceFileList()
|
|||
}
|
||||
}
|
||||
}
|
||||
process_source_file(path, type, filesize, link_target);
|
||||
RewindCompressInfo rewindCompressInfo;
|
||||
RewindCompressInfo *pointer = NULL;
|
||||
if (!PQgetisnull(res, i, 4) && FetchSourcePca(PQgetvalue(res, i, 4), &rewindCompressInfo)) {
|
||||
filesize = rewindCompressInfo.newBlockNumber * BLCKSZ;
|
||||
pointer = &rewindCompressInfo;
|
||||
}
|
||||
process_source_file(path, type, filesize, link_target, pointer);
|
||||
PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
|
||||
}
|
||||
PQclear(res);
|
||||
|
@@ -364,7 +383,7 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file)
    }

    /* sanity check the result set */
    if (PQnfields(res) != 4 || PQntuples(res) != 1) {
    if (PQnfields(res) != 7 || PQntuples(res) != 1) {
        pg_fatal("unexpected result set size while fetching remote files\n");
        PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
    }

@@ -393,6 +412,8 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file)
        pg_fatal("unexpected result length while fetching remote files\n");
        PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
    }
    /* check compressed result set */
    CheckResultSet(res);

    /* Read result set to local variables */
    errorno = memcpy_s(&chunkoff, sizeof(int32), PQgetvalue(res, 0, 1), sizeof(int32));

@@ -429,17 +450,63 @@ static BuildErrorCode receiveFileChunks(const char* sql, FILE* file)
            continue;
        }

        pg_log(PG_DEBUG, "received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize);
        fprintf(file, "received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize);
        int32 algorithm;
        errorno = memcpy_s(&algorithm, sizeof(int32), PQgetvalue(res, 0, 4), sizeof(int32));
        securec_check_c(errorno, "\0", "\0");
        algorithm = ntohl(algorithm);
        if (algorithm == 0) {
            pg_log(PG_DEBUG, "received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize);
            fprintf(file, "received chunk for file \"%s\", offset %d, size %d\n", filename, chunkoff, chunksize);
            open_target_file(filename, false);
            pg_free(filename);
            PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
            write_target_range(chunk, chunkoff, chunksize, chunkspace);
        } else {
            int32 chunkSize;
            int errorno = memcpy_s(&chunkSize, sizeof(int32), PQgetvalue(res, 0, 5), sizeof(int32));
            securec_check_c(errorno, "\0", "\0");
            chunkSize = ntohl(chunkSize);
            bool rebuild = *PQgetvalue(res, 0, 6) != 0;
            char dst[MAXPGPATH];
            /* open pca */
            FormatPathToPca(filename, dst, MAXPGPATH, false);
            OpenCompressedPcaFile(dst, chunkSize, algorithm, rebuild);

            open_target_file(filename, false);
            pg_free(filename);
            filename = NULL;
            /* open pcd */
            FormatPathToPcd(filename, dst, MAXPGPATH, false);
            open_target_file(dst, false);
            BlockNumber blockNumber = chunkoff;
            size_t blockSize = chunkspace;

            /* fetch result */
            FetchCompressedFile(chunk, blockNumber, blockSize);
        }
    }
    return BUILD_SUCCESS;
}

/**
 * check result set of compressed tables
 * @param pgResult result
 * @return success or not
 */
static BuildErrorCode CheckResultSet(PGresult* res)
{
#define PQ_TYPE(index, type) (PQftype(res, (index)) != (type))
    if (PQ_TYPE(4, INT4OID) || PQ_TYPE(5, INT4OID) || PQ_TYPE(6, BOOLOID)) {
        pg_fatal(
            "FetchCompressedFile:unexpected data types: %u %u %u\n", PQftype(res, 4), PQftype(res, 5), PQftype(res, 6));
        PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
        write_target_range(chunk, chunkoff, chunksize, chunkspace);
    }
#define PQ_FORMAT(index) (PQfformat(res, 0) != 1)
    if (PQ_FORMAT(4) && PQ_FORMAT(5) && PQ_FORMAT(6)) {
        pg_fatal("unexpected result format while fetching remote files\n");
        PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
    }
#define PQ_ISNULL(index) (PQgetisnull(res, 0, (index)))
    if (PQ_ISNULL(4) || PQ_ISNULL(5) || PQ_ISNULL(6)) {
        pg_fatal("unexpected null values in result while fetching remote files\n");
        PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
        PQclear(res);
        res = NULL;
    }
    return BUILD_SUCCESS;
}
@@ -497,6 +564,43 @@ error:
    return result;
}

static void CompressedFileCopy(const file_entry_t* entry, bool rebuild)
{
    Assert(!rebuild || entry->rewindCompressInfo.oldBlockNumber == 0);
    if (dry_run) {
        return;
    }

    char linebuf[MAXPGPATH + 47];
    int ret = snprintf_s(linebuf,
        sizeof(linebuf),
        sizeof(linebuf) - 1,
        "%s\t%u\t%u\t%u\t%u\t%u\n",
        entry->path,
        entry->rewindCompressInfo.oldBlockNumber,
        entry->rewindCompressInfo.newBlockNumber - entry->rewindCompressInfo.oldBlockNumber,
        entry->rewindCompressInfo.algorithm,
        entry->rewindCompressInfo.chunkSize,
        rebuild);
    securec_check_ss_c(ret, "\0", "\0");
    if (PQputCopyData(conn, linebuf, strlen(linebuf)) != 1) {
        pg_fatal("could not send COPY data: %s", PQerrorMessage(conn));
    }
    pg_log(PG_PROGRESS, "CompressedFileCopy:%s", linebuf);
}

static void CompressedFileRemove(const file_entry_t* entry)
{
    remove_target((file_entry_t*) entry);
    char* path = entry->path;
    char dst[MAXPGPATH];
    FormatPathToPca(path, dst, MAXPGPATH);
    remove_target_file(dst, false);
    FormatPathToPcd(path, dst, MAXPGPATH);
    remove_target_file(dst, false);
    pg_log(PG_PROGRESS, "CompressedFileRemove: %s\n", path);
}

/*
 * Write a file range to a temporary table in the server.
 *

@@ -506,7 +610,7 @@ error:
 */
static void fetch_file_range(const char* path, unsigned int begin, unsigned int end)
{
    char linebuf[MAXPGPATH + 23];
    char linebuf[MAXPGPATH + 47];
    int ss_c = 0;

    /* Split the range into CHUNKSIZE chunks */

@@ -518,12 +622,12 @@ static void fetch_file_range(const char* path, unsigned int begin, unsigned int
        } else {
            len = end - begin;
        }
        ss_c = snprintf_s(linebuf, sizeof(linebuf), sizeof(linebuf) - 1, "%s\t%u\t%u\n", path, begin, len);
        ss_c = snprintf_s(
            linebuf, sizeof(linebuf), sizeof(linebuf) - 1, "%s\t%u\t%u\t%u\t%u\t%u\n", path, begin, len, 0, 0, 0);
        securec_check_ss_c(ss_c, "\0", "\0");

        if (PQputCopyData(conn, linebuf, strlen(linebuf)) != 1)
            pg_fatal("could not send COPY data: %s", PQerrorMessage(conn));

        begin += len;
    }
}

@@ -542,7 +646,8 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file)
     * First create a temporary table, and load it with the blocks that we
     * need to fetch.
     */
    sql = "CREATE TEMPORARY TABLE fetchchunks(path text, begin int4, len int4);";
    sql = "CREATE TEMPORARY TABLE fetchchunks(path text, begin int4, len int4, "
          "algorithm int4, chunksize int4, rebuild bool);";
    res = PQexec(conn, sql);
    if (PQresultStatus(res) != PGRES_COMMAND_OK) {
        pg_fatal("could not create temporary table: %s", PQresultErrorMessage(res));
@@ -571,11 +676,16 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file)
        }

        /* report all the path to check whether it's correct */
        if (entry->rewindCompressInfo.compressed) {
            pg_log(PG_PROGRESS, "path: %s, type: %d, action: %d\n", entry->path, entry->type, entry->action);

        }
        pg_log(PG_DEBUG, "path: %s, type: %d, action: %d\n", entry->path, entry->type, entry->action);
        fprintf(file, "path: %s, type: %d, action: %d\n", entry->path, entry->type, entry->action);

        /* If this is a relation file, copy the modified blocks */
        execute_pagemap(&entry->pagemap, entry->path, file);
        bool compressed = entry->rewindCompressInfo.compressed;
        execute_pagemap(entry, file);
        PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);

        switch (entry->action) {

@@ -584,29 +694,47 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file)
                break;

            case FILE_ACTION_COPY:
                /* Truncate the old file out of the way, if any */
                open_target_file(entry->path, true);
                PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
                fetch_file_range(entry->path, 0, entry->newsize);
                PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
                if (compressed) {
                    CompressedFileCopy(entry, true);
                    PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
                } else {
                    /* Truncate the old file out of the way, if any */
                    open_target_file(entry->path, true);
                    PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
                    fetch_file_range(entry->path, 0, entry->newsize);
                    PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
                }
                break;

            case FILE_ACTION_TRUNCATE:
                truncate_target_file(entry->path, entry->newsize);
                if (compressed) {
                    CompressedFileTruncate(entry->path, &entry->rewindCompressInfo);
                } else {
                    truncate_target_file(entry->path, entry->newsize);
                }
                PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
                break;

            case FILE_ACTION_COPY_TAIL:
                fetch_file_range(entry->path, entry->oldsize, entry->newsize);
                if (compressed) {
                    CompressedFileCopy(entry, false);
                } else {
                    fetch_file_range(entry->path, entry->oldsize, entry->newsize);
                }
                PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
                break;

            case FILE_ACTION_REMOVE:
                remove_target(entry);
                if (compressed) {
                    CompressedFileRemove(entry);
                } else {
                    remove_target(entry);
                }
                PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
                break;

            case FILE_ACTION_CREATE:
                Assert(!compressed);
                create_target(entry);
                PG_CHECKBUILD_AND_FREE_PGRESULT_RETURN(res);
                break;

@@ -638,9 +766,14 @@ BuildErrorCode executeFileMap(filemap_t* map, FILE* file)
     * temporary table. Now, actually fetch all of those ranges.
     */
    sql = "SELECT path, begin, \n"
          "  pg_read_binary_file(path, begin, len, true) AS chunk,\n"
          "  len \n"
          "FROM fetchchunks\n";
          "  pg_read_binary_file(path, begin, len, true) AS chunk, len, algorithm, chunksize,rebuild \n"
          "FROM fetchchunks where algorithm =0 \n"
          "union all \n"
          "select (json->>'path')::text as path, (json->>'blocknum')::int4 as begin, (json->>'data')::bytea as chunk,\n"
          "(json->>'len')::int4 as len, algorithm, chunksize,rebuild \n"
          "from (select row_to_json(pg_read_binary_file_blocks(path,begin,len)) json, algorithm, chunksize,rebuild \n"
          "from fetchchunks where algorithm !=0) \n"
          "order by path, begin;";

    fprintf(file, "fetch and write file based on temporary table fetchchunks.\n");
    return receiveFileChunks(sql, file);
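Note (not part of the commit): for orientation, a hedged sketch of the tab-separated rows that end up in the fetchchunks table (path, begin, len, algorithm, chunksize, rebuild); the paths and numbers are made-up placeholders, and the layout mirrors the snprintf_s format strings in fetch_file_range and CompressedFileCopy above.

    // Hypothetical COPY rows for fetchchunks. A plain relation file uses
    // algorithm = 0 and a byte offset/length; a compressed one carries a block
    // range plus the pca metadata (algorithm, chunk size, rebuild flag).
    const char *uncompressedRow = "base/16384/24576\t0\t8388608\t0\t0\t0\n";
    const char *compressedRow   = "base/16384/24577\t0\t128\t1\t512\t1\n";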
@@ -700,7 +833,7 @@ BuildErrorCode backupFileMap(filemap_t* map)
                /* to be supported later */
                break;

            case FILE_ACTION_COPY:
            case FILE_ACTION_COPY: {
                /* create fake file for restore when file not exist, otherwise, backup file */
                file_entry_t statbuf;
                if (targetFilemapSearch(entry->path, &statbuf) < 0) {

@@ -709,6 +842,7 @@ BuildErrorCode backupFileMap(filemap_t* map)
                    backup_target_file(entry->path, divergeXlogFileName);
                }
                break;
            }

            case FILE_ACTION_COPY_TAIL:
            case FILE_ACTION_TRUNCATE:

@@ -732,17 +866,60 @@ BuildErrorCode backupFileMap(filemap_t* map)
    return BUILD_SUCCESS;
}

static BuildErrorCode execute_pagemap(datapagemap_t* pagemap, const char* path, FILE* file)
/**
 * combine continue blocks numbers and copy file
 * @param entry file entry
 * @param file file
 */
static void CompressedFileCopy(file_entry_t* entry, FILE* file)
{
    datapagemap_t* pagemap = &entry->pagemap;
    datapagemap_iterator_t* iter = datapagemap_iterate(pagemap);

    BlockNumber blkno;
    file_entry_t fileEntry;
    fileEntry.path = entry->path;
    fileEntry.rewindCompressInfo = entry->rewindCompressInfo;
    int invalidNumber = -1;
    long int before = invalidNumber;
    while (datapagemap_next(iter, &blkno)) {
        fprintf(file, "  block %u\n", blkno);
        if (before == -1) {
            fileEntry.rewindCompressInfo.oldBlockNumber = blkno;
            before = blkno;
        } else {
            if (before == blkno - 1) {
                before = blkno;
            } else {
                fileEntry.rewindCompressInfo.newBlockNumber = before + 1;
                CompressedFileCopy(&fileEntry, false);
                fileEntry.rewindCompressInfo.oldBlockNumber = blkno;
                before = blkno;
            }
        }
    }
    if (before != invalidNumber) {
        fileEntry.rewindCompressInfo.newBlockNumber = before + 1;
        CompressedFileCopy(&fileEntry, false);
    }
}
static BuildErrorCode execute_pagemap(file_entry_t* entry, FILE* file)
{
    datapagemap_iterator_t* iter = NULL;
    BlockNumber blkno;
    off_t offset;

    datapagemap_t* pagemap = &entry->pagemap;
    char* path = entry->path;
    iter = datapagemap_iterate(pagemap);
    while (datapagemap_next(iter, &blkno)) {
        fprintf(file, "  block %u\n", blkno);
        offset = blkno * BLCKSZ;
        fetch_file_range(path, offset, offset + BLCKSZ);
    if (entry->rewindCompressInfo.compressed) {
        CompressedFileCopy(entry, file);
    } else {
        while (datapagemap_next(iter, &blkno)) {
            fprintf(file, "  block %u\n", blkno);
            offset = blkno * BLCKSZ;
            fetch_file_range(path, offset, offset + BLCKSZ);
        }
    }
    pg_free(iter);
    iter = NULL;

@@ -789,9 +966,19 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p
        struct stat fst;
        char fullpath[MAXPGPATH];
        char path[MAXPGPATH];
        const size_t MINPCANAMESIZE = 4;

        if (strcmp(xlde->d_name, ".") == 0 || strcmp(xlde->d_name, "..") == 0)
            continue;
        /* Skip compressed page files */
        size_t dirNamePath = strlen(xlde->d_name);
        if (dirNamePath >= MINPCANAMESIZE) {
            const char* suffix = xlde->d_name + dirNamePath - MINPCANAMESIZE;
            if (strncmp(suffix, "_pca", MINPCANAMESIZE) == 0 || strncmp(suffix, "_pcd", MINPCANAMESIZE) == 0) {
                continue;
            }
        }

        ss_c = snprintf_s(fullpath, MAXPGPATH, MAXPGPATH - 1, "%s/%s", fullparentpath, xlde->d_name);
        securec_check_ss_c(ss_c, "\0", "\0");

@@ -822,8 +1009,15 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p
            continue;

        if (S_ISREG(fst.st_mode)) {
            if ((uint64)fst.st_size <= MAX_FILE_SIZE) {
                callback(path, FILE_TYPE_REGULAR, fst.st_size, NULL);
            uint64 fileSize = (uint64)fst.st_size;
            RewindCompressInfo rewindCompressInfo;
            RewindCompressInfo *pointer = NULL;
            if (ProcessLocalPca(path, &rewindCompressInfo)) {
                fileSize = rewindCompressInfo.oldBlockNumber * BLCKSZ;
                pointer = &rewindCompressInfo;
            }
            if (fileSize <= MAX_FILE_SIZE) {
                callback(path, FILE_TYPE_REGULAR, fileSize, NULL, pointer);
                if (increment_return_code != BUILD_SUCCESS) {
                    (void)closedir(xldir);
                }

@@ -832,7 +1026,7 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p
                pg_log(PG_WARNING, "file size of \"%s\" is over %ld\n", fullpath, MAX_FILE_SIZE);
            }
        } else if (S_ISDIR(fst.st_mode)) {
            callback(path, FILE_TYPE_DIRECTORY, 0, NULL);
            callback(path, FILE_TYPE_DIRECTORY, 0, NULL, NULL);
            if (increment_return_code != BUILD_SUCCESS) {
                (void)closedir(xldir);
            }

@@ -857,7 +1051,7 @@ static BuildErrorCode recurse_dir(const char* datadir, const char* parentpath, p
            }
            link_target[len] = '\0';

            callback(path, FILE_TYPE_SYMLINK, 0, link_target, NULL);
            callback(path, FILE_TYPE_SYMLINK, 0, link_target);

            /*
             * If it's a symlink within pg_tblspc, we need to recurse into it,
@@ -42,7 +42,9 @@ extern XLogRecPtr libpqGetCurrentXlogInsertLocation(void);

extern void libpqRequestCheckpoint(void);

typedef void (*process_file_callback_t)(const char* path, file_type_t type, size_t size, const char* link_target);
typedef void (*process_file_callback_t)(const char* path, file_type_t type, size_t oldsize, const char* link_target,
    const RewindCompressInfo* rewindCompressInfo);

extern BuildErrorCode traverse_datadir(const char* datadir, process_file_callback_t callback);

extern void get_source_slotname(void);
@@ -25,6 +25,8 @@

#include "common/fe_memutils.h"
#include "common/build_query/build_query.h"
#include "compressed_rewind.h"
#include "storage/page_compression_impl.h"
#include "replication/replicainternal.h"

#define BLOCKSIZE (8 * 1024)

@@ -36,6 +38,8 @@ static int dstfd = -1;
static char dstpath[MAXPGPATH] = "";
static bool g_isRelDataFile = false;

static CompressedPcaInfo g_compressedPcaInfo;

static void create_target_dir(const char* path);
static void remove_target_dir(const char* path);
static void create_target_symlink(const char* path, const char* slink);

@@ -100,7 +104,7 @@ void close_target_file(void)
    dstfd = -1;
}

void write_target_range(char* buf, off_t begin, size_t size, int space)
void write_target_range(char* buf, off_t begin, size_t size, int space, bool compressed)
{
    int writeleft;
    char* p = NULL;

@@ -111,7 +115,7 @@ void write_target_range(char* buf, off_t begin, size_t size, int space)
    if (dry_run)
        return;

    if (begin % BLOCKSIZE != 0) {
    if (!compressed && begin % BLOCKSIZE != 0) {
        (void)close(dstfd);
        dstfd = -1;
        pg_fatal("seek position %ld in target file \"%s\" is not in BLOCKSIZEs\n", size, dstpath);
@@ -1225,3 +1229,142 @@ bool tablespaceDataIsValid(const char* path)

    return true;
}

void CompressedFileTruncate(const char *path, const RewindCompressInfo *rewindCompressInfo)
{
    if (dry_run) {
        return;
    }

    uint16 chunkSize = rewindCompressInfo->chunkSize;

    BlockNumber oldBlockNumber = rewindCompressInfo->oldBlockNumber;
    BlockNumber newBlockNumber = rewindCompressInfo->newBlockNumber;

    Assert(oldBlockNumber > newBlockNumber);
    char pcaPath[MAXPGPATH];
    FormatPathToPca(path, pcaPath, MAXPGPATH, true);

    int pcaFd = open(pcaPath, O_RDWR | PG_BINARY, 0600);
    if (pcaFd < 0) {
        pg_fatal("CompressedFileTruncate: could not open file \"%s\": %s\n", pcaPath, strerror(errno));
        return;
    }

    PageCompressHeader* map = pc_mmap(pcaFd, chunkSize, false);
    if (map == MAP_FAILED) {
        pg_fatal("CompressedFileTruncate: Failed to mmap file \"%s\": %s\n", pcaPath, strerror(errno));
        return;
    }
    /* write zero to truncated addr */
    for (BlockNumber blockNumber = newBlockNumber; blockNumber < oldBlockNumber; ++blockNumber) {
        PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(map, chunkSize, blockNumber);
        for (size_t i = 0; i < addr->allocated_chunks; ++i) {
            addr->chunknos[i] = 0;
        }
        addr->nchunks = 0;
        addr->allocated_chunks = 0;
        addr->checksum = 0;
    }
    map->last_synced_nblocks = map->nblocks = newBlockNumber;

    /* find the max used chunk number */
    pc_chunk_number_t beforeUsedChunks = map->allocated_chunks;
    pc_chunk_number_t max_used_chunkno = 0;
    for (BlockNumber blockNumber = 0; blockNumber < newBlockNumber; ++blockNumber) {
        PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(map, chunkSize, blockNumber);
        for (uint8 i = 0; i < addr->allocated_chunks; i++) {
            if (addr->chunknos[i] > max_used_chunkno) {
                max_used_chunkno = addr->chunknos[i];
            }
        }
    }
    map->allocated_chunks = map->last_synced_allocated_chunks = max_used_chunkno;

    /* truncate pcd qfile */
    if (beforeUsedChunks > max_used_chunkno) {
        char pcdPath[MAXPGPATH];
        FormatPathToPcd(path, pcdPath, MAXPGPATH, false);
        truncate_target_file(pcdPath, max_used_chunkno * chunkSize);
    }
    pc_munmap(map);
    pg_log(PG_PROGRESS, "CompressedFileTruncate: %s\n", path);
}

void OpenCompressedPcaFile(const char* fileName, int32 chunkSize, int32 algorithm, bool rebuild)
{
    if (dry_run) {
        return;
    }
    if (g_compressedPcaInfo.pcaFd != -1 && strcmp(fileName, &g_compressedPcaInfo.path[strlen(pg_data) + 1]) == 0) {
        /* already open */
        return;
    }
    CloseCompressedPcaFile();
    int rc = snprintf_s(g_compressedPcaInfo.path, sizeof(g_compressedPcaInfo.path),
        sizeof(g_compressedPcaInfo.path) - 1,
        "%s/%s", pg_data, fileName);
    securec_check_ss_c(rc, "\0", "\0");

    int mode = O_RDWR | PG_BINARY;
    mode = rebuild ? (mode | O_TRUNC | O_CREAT) : mode;

    g_compressedPcaInfo.pcaFd = open(g_compressedPcaInfo.path, mode, S_IRUSR | S_IWUSR);
    if (g_compressedPcaInfo.pcaFd < 0) {
        pg_fatal("could not open compressed pca file \"%s\": %s\n", g_compressedPcaInfo.path, strerror(errno));
        return;
    }
    g_compressedPcaInfo.algorithm = algorithm;
    g_compressedPcaInfo.chunkSize = chunkSize;
    g_compressedPcaInfo.pcaMap = (char*) pc_mmap(g_compressedPcaInfo.pcaFd, chunkSize, false);
    if ((void*)g_compressedPcaInfo.pcaMap == MAP_FAILED) {
        pg_fatal("OpenCompressedPcaFile: Failed to mmap file \"%s\": %s\n", g_compressedPcaInfo.path, strerror(errno));
        return;
    }
}

void CloseCompressedPcaFile()
{
    if (g_compressedPcaInfo.pcaFd == -1) {
        return;
    }
    pc_munmap((PageCompressHeader*)g_compressedPcaInfo.pcaMap);
    if (close(g_compressedPcaInfo.pcaFd) != 0) {
        pg_fatal("could not close target file \"%s\": %s\n", g_compressedPcaInfo.path, gs_strerror(errno));
    }
    g_compressedPcaInfo.pcaFd = -1;
    g_compressedPcaInfo.pcaMap = NULL;
    g_compressedPcaInfo.chunkSize = 0;
    g_compressedPcaInfo.algorithm = 0;
}

void FetchCompressedFile(char* buf, BlockNumber blockNumber, int32 size)
{
    int32 chunkSize = g_compressedPcaInfo.chunkSize;
    int needChunks = size / chunkSize;

    PageCompressHeader* pcMap = (PageCompressHeader*) g_compressedPcaInfo.pcaMap;
    PageCompressAddr* pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunkSize, blockNumber);

    // 2. allocate chunks
    if (pcAddr->allocated_chunks < needChunks) {
        auto chunkno = pg_atomic_fetch_add_u32(&pcMap->allocated_chunks, needChunks - pcAddr->allocated_chunks);
        for (int i = pcAddr->allocated_chunks; i < needChunks; i++) {
            pcAddr->chunknos[i] = ++chunkno;
        }
        pcAddr->allocated_chunks = needChunks;
    }
    for (int32 i = 0; i < needChunks; ++i) {
        auto buffer_pos = buf + chunkSize * i;
        off_t seekpos = (off_t) OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, pcAddr->chunknos[i]);
        int32 start = i;
        while (i < needChunks - 1 && pcAddr->chunknos[i + 1] == pcAddr->chunknos[i] + 1) {
            i++;
        }
        int write_amount = chunkSize * (i - start + 1);
        // open file dstfd
        write_target_range(buffer_pos, seekpos, write_amount, 0, true);
    }
    pcAddr->nchunks = pcAddr->allocated_chunks;
    pcAddr->checksum = AddrChecksum32(blockNumber, pcAddr, chunkSize);
}
@@ -11,10 +11,11 @@
#define FILE_OPS_H

#include "filemap.h"
#include "compressed_common.h"
extern char* pg_data;

extern void open_target_file(const char* path, bool trunc);
extern void write_target_range(char* buf, off_t begin, size_t size, int space);
extern void write_target_range(char* buf, off_t begin, size_t size, int space, bool compressed = false);
extern void close_target_file(void);
extern void truncate_target_file(const char* path, off_t newsize);
extern void create_target(file_entry_t* t);

@@ -41,6 +42,9 @@ extern void delete_target_file(const char* file);
extern bool isPathInFilemap(const char* path);
extern bool tablespaceDataIsValid(const char* path);
extern void copy_file(const char* fromfile, char* tofile);

extern void CompressedFileTruncate(const char* path, const RewindCompressInfo* rewindCompressInfo);
void FetchCompressedFile(char* buf, BlockNumber begin, int32 size);
void OpenCompressedPcaFile(const char* fileName, int32 chunkSize, int32 algorithm, bool rebuild);
void CloseCompressedPcaFile();
#endif /* FILE_OPS_H */
@@ -19,6 +19,7 @@
#include "catalog/catalog.h"
#include "catalog/pg_tablespace.h"
#include "common/fe_memutils.h"
#include "compressed_rewind.h"
#include "storage/cu.h"
#include "storage/smgr/fd.h"

@@ -147,7 +148,8 @@ void filemapInit(void)
    filemaptarget = filemap_create();
}

void processTargetFileMap(const char* path, file_type_t type, size_t oldsize, const char* link_target)
void processTargetFileMap(const char* path, file_type_t type, size_t oldsize, const char* link_target,
    const RewindCompressInfo* info)
{
    file_entry_t* entry = NULL;
    filemap_t* map = filemaptarget;

@@ -163,6 +165,8 @@ void processTargetFileMap(const char* path, file_type_t type, size_t oldsize, co
    entry->pagemap.bitmap = NULL;
    entry->pagemap.bitmapsize = 0;

    COPY_REWIND_COMPRESS_INFO(entry, info, info == NULL ? 0 : info->oldBlockNumber, 0)

    if (map->last != NULL) {
        map->last->next = entry;
        map->last = entry;

@@ -231,7 +235,7 @@ BuildErrorCode targetFilemapProcess(void)
    filemap_t* map = filemaptarget;
    for (i = 0; i < map->narray; i++) {
        entry = map->array[i];
        process_target_file(entry->path, entry->type, entry->oldsize, entry->link_target);
        process_target_file(entry->path, entry->type, entry->oldsize, entry->link_target, &entry->rewindCompressInfo);
    }
    return BUILD_SUCCESS;
}

@@ -342,7 +346,8 @@ static bool process_source_file_sanity_check(const char* path, file_type_t type)
 * action needs to be taken for the file, depending on whether the file
 * exists in the target and whether the size matches.
 */
void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target)
void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target,
    RewindCompressInfo* info)
{
    bool exists = false;
    char localpath[MAXPGPATH];

@@ -350,6 +355,7 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con
    filemap_t* map = filemap;
    file_action_t action = FILE_ACTION_NONE;
    size_t oldsize = 0;
    BlockNumber oldBlockNumber = 0;
    file_entry_t* entry = NULL;
    int ss_c = 0;
    bool isreldatafile = false;
@@ -500,7 +506,21 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con
     * replayed.
     */
    /* mod blocksize 8k to avoid half page write */
    oldsize = statbuf.oldsize;
    RewindCompressInfo oldRewindCompressInfo;
    bool sourceCompressed = info != NULL;
    bool targetCompressed = ProcessLocalPca(path, &oldRewindCompressInfo);
    if (sourceCompressed && !targetCompressed) {
        info->compressed = false;
        action = FILE_ACTION_REMOVE;
        break;
    } else if (!sourceCompressed && targetCompressed) {
        info = &oldRewindCompressInfo;
        action = FILE_ACTION_REMOVE;
        break;
    } else if (sourceCompressed && targetCompressed) {
        oldBlockNumber = oldRewindCompressInfo.oldBlockNumber;
        oldsize = oldBlockNumber * BLCKSZ;
    }
    if (oldsize % BLOCKSIZE != 0) {
        oldsize = oldsize - (oldsize % BLOCKSIZE);
        pg_log(PG_PROGRESS, "target file size mod BLOCKSIZE not equal 0 %s %ld \n", path, statbuf.oldsize);
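The three branches above reduce to a small decision table on whether the source and target copies of the file are compressed. A hedged restatement as a standalone helper, purely for illustration (the enum and function names are not part of the patch):

enum class RewindFileDecision {
    RemoveTarget,   /* compression state differs: drop the target copy and refetch */
    DiffCompressed, /* both compressed: diff using the target's old block count */
    DiffPlain       /* neither compressed: fall through to the ordinary size-based diff */
};

static RewindFileDecision DecideCompressedRewind(bool sourceCompressed, bool targetCompressed)
{
    if (sourceCompressed != targetCompressed) {
        return RewindFileDecision::RemoveTarget;
    }
    return sourceCompressed ? RewindFileDecision::DiffCompressed : RewindFileDecision::DiffPlain;
}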
|
@ -531,6 +551,8 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con
|
|||
entry->pagemap.bitmapsize = 0;
|
||||
entry->isrelfile = isreldatafile;
|
||||
|
||||
COPY_REWIND_COMPRESS_INFO(entry, info, oldBlockNumber, info == NULL ? 0 : info->newBlockNumber)
|
||||
|
||||
if (map->last != NULL) {
|
||||
map->last->next = entry;
|
||||
map->last = entry;
|
||||
|
@ -546,7 +568,8 @@ void process_source_file(const char* path, file_type_t type, size_t newsize, con
|
|||
* marks target data directory's files that didn't exist in the source for
|
||||
* deletion.
|
||||
*/
|
||||
void process_target_file(const char* path, file_type_t type, size_t oldsize, const char* link_target)
|
||||
void process_target_file(const char* path, file_type_t type, size_t oldsize, const char* link_target,
|
||||
const RewindCompressInfo* info)
|
||||
{
|
||||
bool exists = false;
|
||||
file_entry_t key;
|
||||
|
@ -575,7 +598,7 @@ void process_target_file(const char* path, file_type_t type, size_t oldsize, con
|
|||
*/
|
||||
for (int excludeIdx = 0; excludeFiles[excludeIdx] != NULL; excludeIdx++) {
|
||||
if (strstr(path, excludeFiles[excludeIdx]) != NULL) {
|
||||
pg_log(PG_DEBUG, "entry \"%s\" excluded from target file list", path);
|
||||
pg_log(PG_DEBUG, "entry \"%s\" excluded from target file list\n", path);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -627,6 +650,9 @@ void process_target_file(const char* path, file_type_t type, size_t oldsize, con
|
|||
entry->pagemap.bitmapsize = 0;
|
||||
entry->isrelfile = isRelDataFile(path);
|
||||
|
||||
COPY_REWIND_COMPRESS_INFO(entry, info, info == NULL ? 0 : info->oldBlockNumber, 0)
|
||||
RewindCompressInfo *rewindCompressInfo = NULL;
|
||||
COPY_REWIND_COMPRESS_INFO(entry, rewindCompressInfo, 0, 0)
|
||||
if (map->last == NULL)
|
||||
map->first = entry;
|
||||
else
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#ifndef FILEMAP_H
|
||||
#define FILEMAP_H
|
||||
|
||||
#include "compressed_common.h"
|
||||
#include "storage/smgr/relfilenode.h"
|
||||
#include "storage/buf/block.h"
|
||||
|
||||
|
@ -42,6 +43,9 @@ typedef struct file_entry_t {
|
|||
|
||||
file_action_t action;
|
||||
|
||||
/* for compressed table */
|
||||
RewindCompressInfo rewindCompressInfo;
|
||||
|
||||
/* for a regular file */
|
||||
size_t oldsize;
|
||||
size_t newsize;
|
||||
|
@ -96,8 +100,10 @@ extern void print_filemap(void);
|
|||
extern void print_filemap_to_file(FILE* file);
|
||||
|
||||
/* Functions for populating the filemap */
|
||||
extern void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target);
|
||||
extern void process_target_file(const char* path, file_type_t type, size_t newsize, const char* link_target);
|
||||
extern void process_source_file(const char* path, file_type_t type, size_t newsize, const char* link_target,
|
||||
RewindCompressInfo* rewindCompressInfo = nullptr);
|
||||
extern void process_target_file(const char* path, file_type_t type, size_t newsize, const char* link_target,
|
||||
const RewindCompressInfo* rewindCompressInfo = nullptr);
|
||||
extern void process_block_change(ForkNumber forknum, RelFileNode rnode, BlockNumber blkno);
|
||||
extern void filemap_finalize(void);
|
||||
extern int targetFilemapSearch(const char* path, file_entry_t* entry);
|
||||
|
|
|
@ -161,7 +161,7 @@ BuildErrorCode findCommonCheckpoint(const char* datadir, TimeLineID tli, XLogRec
|
|||
pg_fatal("find max lsn fail, errmsg:%s\n", returnmsg);
|
||||
return BUILD_FATAL;
|
||||
}
|
||||
pg_log(PG_PROGRESS, "find max lsn success, %s\n", returnmsg);
|
||||
pg_log(PG_PROGRESS, "find max lsn success, %s", returnmsg);
|
||||
|
||||
readprivate.datadir = datadir;
|
||||
readprivate.tli = tli;
|
||||
|
|
|
@ -3626,8 +3626,9 @@
|
|||
AddBuiltinFunc(_0(9038), _1("gs_query_standby_cluster_barrier_id_exist"), _2(1), _3(true), _4(false), _5(gs_query_standby_cluster_barrier_id_exist), _6(16), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(1, 25), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_query_standby_cluster_barrier_id_exist"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0))
|
||||
),
|
||||
AddFuncGroup(
|
||||
"gs_read_block_from_remote", 1,
|
||||
AddBuiltinFunc(_0(4767), _1("gs_read_block_from_remote"), _2(10), _3(true), _4(false), _5(gs_read_block_from_remote), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(10, 26, 26, 26, 21, 23, 28, 23, 28, 16, 23), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_read_block_from_remote"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0))
|
||||
"gs_read_block_from_remote", 2,
|
||||
AddBuiltinFunc(_0(4767), _1("gs_read_block_from_remote"), _2(10), _3(true), _4(false), _5(gs_read_block_from_remote), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(10, 26, 26, 26, 21, 23, 28, 23, 28, 16, 23), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_read_block_from_remote"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)),
|
||||
AddBuiltinFunc(_0(5843), _1("gs_read_block_from_remote"), _2(11), _3(true), _4(false), _5(gs_read_block_from_remote_compress), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(11, 23, 23, 23, 21, 21, 23, 28, 23, 28, 16, 23), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("gs_read_block_from_remote_compress"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0))
|
||||
),
|
||||
AddFuncGroup(
|
||||
"gs_respool_exception_info", 1,
|
||||
|
@ -8082,6 +8083,10 @@
|
|||
AddBuiltinFunc(_0(3827), _1("pg_read_binary_file"), _2(4), _3(true), _4(false), _5(pg_read_binary_file), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(4, 25, 20, 20, 16), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_read_binary_file"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("read bytea from a file"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)),
|
||||
AddBuiltinFunc(_0(3828), _1("pg_read_binary_file"), _2(1), _3(true), _4(false), _5(pg_read_binary_file_all), _6(17), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(1, 25), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_read_binary_file_all"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("read bytea from a file"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0))
|
||||
),
|
||||
AddFuncGroup(
|
||||
"pg_read_binary_file_blocks", 1,
|
||||
AddBuiltinFunc(_0(8413), _1("pg_read_binary_file_blocks"), _2(3), _3(true), _4(true), _5(pg_read_binary_file_blocks), _6(2249), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(100), _11(20), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(3, 25, 20, 20), _21(7, 25, 20, 20, 25, 23, 23, 17), _22(7, 'i', 'i', 'i', 'o', 'o', 'o', 'o'), _23(7, "input", "blocknum", "blockcount", "path", "blocknum", "len", "data"), _24(NULL), _25("pg_read_binary_file_blocks"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'))
|
||||
),
|
||||
AddFuncGroup(
|
||||
"pg_read_file", 2,
|
||||
AddBuiltinFunc(_0(2624), _1("pg_read_file"), _2(3), _3(true), _4(false), _5(pg_read_file), _6(25), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('v'), _19(0), _20(3, 25, 20, 20), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_read_file"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("read text from a file - old version for adminpack 1.0"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)),
|
||||
|
|
|
@ -87,6 +87,7 @@
|
|||
#include "pgxc/groupmgr.h"
|
||||
#include "storage/buf/buf.h"
|
||||
#include "storage/predicate.h"
|
||||
#include "storage/page_compression.h"
|
||||
#include "storage/buf/bufmgr.h"
|
||||
#include "storage/lmgr.h"
|
||||
#include "storage/smgr/smgr.h"
|
||||
|
@ -514,8 +515,9 @@ static void InitSubPartitionDef(Partition newPartition, Oid partOid, char strate
|
|||
*/
|
||||
Relation heap_create(const char* relname, Oid relnamespace, Oid reltablespace, Oid relid, Oid relfilenode,
|
||||
Oid bucketOid, TupleDesc tupDesc, char relkind, char relpersistence, bool partitioned_relation, bool rowMovement,
|
||||
bool shared_relation, bool mapped_relation, bool allow_system_table_mods, int8 row_compress, Oid ownerid,
|
||||
bool skip_create_storage, TableAmType tam_type, int8 relindexsplit, StorageType storage_type, bool newcbi)
|
||||
bool shared_relation, bool mapped_relation, bool allow_system_table_mods, int8 row_compress, Datum reloptions,
|
||||
Oid ownerid, bool skip_create_storage, TableAmType tam_type, int8 relindexsplit, StorageType storage_type,
|
||||
bool newcbi, Oid accessMethodObjectId)
|
||||
{
|
||||
bool create_storage = false;
|
||||
Relation rel;
|
||||
|
@ -626,9 +628,11 @@ Relation heap_create(const char* relname, Oid relnamespace, Oid reltablespace, O
|
|||
relpersistence,
|
||||
relkind,
|
||||
row_compress,
|
||||
reloptions,
|
||||
tam_type,
|
||||
relindexsplit,
|
||||
storage_type
|
||||
storage_type,
|
||||
accessMethodObjectId
|
||||
);
|
||||
|
||||
if (partitioned_relation) {
|
||||
|
@ -2712,6 +2716,7 @@ Oid heap_create_with_catalog(const char *relname, Oid relnamespace, Oid reltable
|
|||
mapped_relation,
|
||||
allow_system_table_mods,
|
||||
row_compress,
|
||||
reloptions,
|
||||
ownerid,
|
||||
false,
|
||||
tam,
|
||||
|
@ -5248,7 +5253,7 @@ void dropDeltaTableOnPartition(Oid partId)
|
|||
*
|
||||
*/
|
||||
Partition heapCreatePartition(const char* part_name, bool for_partitioned_table, Oid part_tablespace, Oid part_id,
|
||||
Oid partFileNode, Oid bucketOid, Oid ownerid, StorageType storage_type, bool newcbi)
|
||||
Oid partFileNode, Oid bucketOid, Oid ownerid, StorageType storage_type, bool newcbi, Datum reloptions)
|
||||
{
|
||||
Partition new_part_desc = NULL;
|
||||
bool createStorage = false;
|
||||
|
@ -5301,7 +5306,8 @@ Partition heapCreatePartition(const char* part_name, bool for_partitioned_table,
|
|||
part_id, /* partition oid */
|
||||
partFileNode, /* partition's file node, same as partition oid*/
|
||||
part_tablespace,
|
||||
for_partitioned_table ? HEAP_DISK : storage_type);
|
||||
for_partitioned_table ? HEAP_DISK : storage_type,
|
||||
reloptions);
|
||||
|
||||
/*
|
||||
* Save newcbi as a context indicator to
|
||||
|
@ -5805,7 +5811,9 @@ Oid heapAddRangePartition(Relation pgPartRel, Oid partTableOid, Oid partTablespa
|
|||
newPartrelfileOid,
|
||||
bucketOid,
|
||||
ownerid,
|
||||
storage_type);
|
||||
storage_type,
|
||||
false,
|
||||
reloptions);
|
||||
|
||||
Assert(newPartitionOid == PartitionGetPartid(newPartition));
|
||||
if (isSubpartition) {
|
||||
|
@ -6012,7 +6020,9 @@ Oid HeapAddIntervalPartition(Relation pgPartRel, Relation rel, Oid partTableOid,
|
|||
partrelfileOid,
|
||||
bucketOid,
|
||||
ownerid,
|
||||
storage_type);
|
||||
storage_type,
|
||||
false,
|
||||
reloptions);
|
||||
pfree(partName);
|
||||
|
||||
Assert(newPartitionOid == PartitionGetPartid(newPartition));
|
||||
|
@ -6100,7 +6110,8 @@ Oid HeapAddListPartition(Relation pgPartRel, Oid partTableOid, Oid partTablespac
|
|||
forPartitionTable = false;
|
||||
}
|
||||
newListPartition = heapCreatePartition(newListPartDef->partitionName, forPartitionTable, newPartitionTableSpaceOid,
|
||||
newListPartitionOid, partrelfileOid, bucketOid, ownerid, storage_type);
|
||||
newListPartitionOid, partrelfileOid, bucketOid, ownerid, storage_type,false,
|
||||
reloptions);
|
||||
|
||||
Assert(newListPartitionOid == PartitionGetPartid(newListPartition));
|
||||
|
||||
|
@ -6386,7 +6397,9 @@ Oid HeapAddHashPartition(Relation pgPartRel, Oid partTableOid, Oid partTablespac
|
|||
partrelfileOid,
|
||||
bucketOid,
|
||||
ownerid,
|
||||
storage_type);
|
||||
storage_type,
|
||||
false,
|
||||
reloptions);
|
||||
|
||||
Assert(newHashPartitionOid == PartitionGetPartid(newHashPartition));
|
||||
if (isSubpartition) {
|
||||
|
@ -6561,7 +6574,9 @@ static void addNewPartitionTupleForTable(Relation pg_partition_rel, const char*
|
|||
new_partition_rfoid,
|
||||
InvalidOid,
|
||||
ownerid,
|
||||
HEAP_DISK);
|
||||
HEAP_DISK,
|
||||
false,
|
||||
reloptions);
|
||||
|
||||
Assert(new_partition_oid == PartitionGetPartid(new_partition));
|
||||
new_partition->pd_part->parttype = PART_OBJ_TYPE_PARTED_TABLE;
|
||||
|
|
|
@ -912,9 +912,9 @@ Oid index_create(Relation heapRelation, const char *indexRelationName, Oid index
|
|||
indexRelation = heap_create(indexRelationName, namespaceId, tableSpaceId, indexRelationId, relFileNode,
|
||||
RELATION_CREATE_BUCKET(heapRelation) ? heapRelation->rd_bucketoid : InvalidOid, indexTupDesc, relKind,
|
||||
relpersistence, isLocalPart, false, shared_relation, mapped_relation, allow_system_table_mods,
|
||||
REL_CMPRS_NOT_SUPPORT, heapRelation->rd_rel->relowner, skip_create_storage,
|
||||
REL_CMPRS_NOT_SUPPORT, (Datum)reloptions, heapRelation->rd_rel->relowner, skip_create_storage,
|
||||
isUstore ? TAM_USTORE : TAM_HEAP, /* XXX: Index tables are by default HEAP Table Type */
|
||||
relindexsplit, storage_type, extra->crossBucket);
|
||||
relindexsplit, storage_type, extra->crossBucket, accessMethodObjectId);
|
||||
|
||||
Assert(indexRelationId == RelationGetRelid(indexRelation));
|
||||
|
||||
|
@ -932,7 +932,6 @@ Oid index_create(Relation heapRelation, const char *indexRelationName, Oid index
|
|||
* XXX should have a cleaner way to create cataloged indexes
|
||||
*/
|
||||
indexRelation->rd_rel->relowner = heapRelation->rd_rel->relowner;
|
||||
indexRelation->rd_rel->relam = accessMethodObjectId;
|
||||
indexRelation->rd_rel->relhasoids = false;
|
||||
|
||||
if (accessMethodObjectId == PSORT_AM_OID) {
|
||||
|
@ -1244,7 +1243,8 @@ Oid partition_index_create(const char* partIndexName, /* the name of partition i
|
|||
parentIndex->rd_bucketoid,
|
||||
parentIndex->rd_rel->relowner,
|
||||
RelationGetStorageType(parentIndex),
|
||||
extra->crossbucket);
|
||||
extra->crossbucket,
|
||||
indexRelOptions);
|
||||
partitionIndex->pd_part->parttype = PART_OBJ_TYPE_INDEX_PARTITION;
|
||||
partitionIndex->pd_part->rangenum = 0;
|
||||
partitionIndex->pd_part->parentid = parentIndexId;
|
||||
|
@ -1282,9 +1282,13 @@ Oid partition_index_create(const char* partIndexName, /* the name of partition i
|
|||
partitionIndex->pd_part->relfrozenxid = (ShortTransactionId)InvalidTransactionId;
|
||||
|
||||
/* insert into pg_partition */
|
||||
#ifndef ENABLE_MULTIPLE_NODES
|
||||
insertPartitionEntry(pg_partition_rel, partitionIndex, partitionIndex->pd_id, NULL, NULL, 0, 0, 0, indexRelOptions,
|
||||
PART_OBJ_TYPE_INDEX_PARTITION);
|
||||
#else
|
||||
insertPartitionEntry(
|
||||
pg_partition_rel, partitionIndex, partitionIndex->pd_id, NULL, NULL, 0, 0, 0, 0, PART_OBJ_TYPE_INDEX_PARTITION);
|
||||
|
||||
#endif
|
||||
/* Make the above change visible */
|
||||
CommandCounterIncrement();
|
||||
|
||||
|
|
|
@@ -319,17 +319,30 @@ void log_smgrcreate(RelFileNode* rnode, ForkNumber forkNum)
    if (IsSegmentFileNode(*rnode)) {
        return;
    }

    xl_smgr_create_compress xlrec;
    uint size;
    uint8 info = XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE;
    /*
     * compressOptions Copy
     */
    if (rnode->opt != 0) {
        xlrec.pageCompressOpts = rnode->opt;
        size = sizeof(xl_smgr_create_compress);
        info |= XLR_REL_COMPRESS;
    } else {
        size = sizeof(xl_smgr_create);
    }

    /*
     * Make an XLOG entry reporting the file creation.
     */
    xl_smgr_create xlrec;
    xlrec.forkNum = forkNum;
    RelFileNodeRelCopy(xlrec.rnode, *rnode);
    xlrec.xlrec.forkNum = forkNum;
    RelFileNodeRelCopy(xlrec.xlrec.rnode, *rnode);

    XLogBeginInsert();
    XLogRegisterData((char*)&xlrec, sizeof(xlrec));
    XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE, rnode->bucketNode);
    XLogRegisterData((char*)&xlrec, size);
    XLogInsert(RM_SMGR_ID, info, rnode->bucketNode);
}

static void CStoreRelDropStorage(Relation rel, RelFileNode* rnode, Oid ownerid)
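The record built above embeds the legacy xl_smgr_create and appends the compression options, which is why the code fills xlrec.xlrec.forkNum and only registers the larger size (and sets XLR_REL_COMPRESS) when rnode->opt is non-zero. The real definition lives in the smgr xlog headers, which this diff does not show; a hedged sketch of the layout those accesses imply:

/* Sketch only: field width assumed, inferred from how log_smgrcreate() fills the record. */
typedef struct xl_smgr_create_compress {
    xl_smgr_create xlrec;    /* original payload: rnode + forkNum */
    uint2 pageCompressOpts;  /* copied from RelFileNode::opt when compression is on */
} xl_smgr_create_compress;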
|
@ -691,14 +704,24 @@ void RelationTruncate(Relation rel, BlockNumber nblocks)
|
|||
* Make an XLOG entry reporting the file truncation.
|
||||
*/
|
||||
XLogRecPtr lsn;
|
||||
xl_smgr_truncate xlrec;
|
||||
xl_smgr_truncate_compress xlrec;
|
||||
uint size;
|
||||
uint8 info = XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE;
|
||||
|
||||
xlrec.blkno = nblocks;
|
||||
RelFileNodeRelCopy(xlrec.rnode, rel->rd_node);
|
||||
xlrec.xlrec.blkno = nblocks;
|
||||
|
||||
if (rel->rd_node.opt != 0) {
|
||||
xlrec.pageCompressOpts = rel->rd_node.opt;
|
||||
size = sizeof(xl_smgr_truncate_compress);
|
||||
info |= XLR_REL_COMPRESS;
|
||||
} else {
|
||||
size = sizeof(xl_smgr_truncate);
|
||||
}
|
||||
|
||||
RelFileNodeRelCopy(xlrec.xlrec.rnode, rel->rd_node);
|
||||
|
||||
XLogBeginInsert();
|
||||
XLogRegisterData((char*)&xlrec, sizeof(xlrec));
|
||||
|
||||
XLogRegisterData((char*)&xlrec, size);
|
||||
lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE, rel->rd_node.bucketNode);
|
||||
|
||||
/*
|
||||
|
@ -1213,7 +1236,7 @@ void smgr_redo(XLogReaderState* record)
|
|||
{
|
||||
XLogRecPtr lsn = record->EndRecPtr;
|
||||
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
|
||||
|
||||
bool compress = XLogRecGetInfo(record) & XLR_REL_COMPRESS;
|
||||
/* Backup blocks are not used in smgr records */
|
||||
Assert(!XLogRecHasAnyBlockRefs(record));
|
||||
|
||||
|
@ -1222,14 +1245,14 @@ void smgr_redo(XLogReaderState* record)
|
|||
|
||||
RelFileNode rnode;
|
||||
RelFileNodeCopy(rnode, xlrec->rnode, XLogRecGetBucketId(record));
|
||||
smgr_redo_create(rnode, xlrec->forkNum, (char *)xlrec);
|
||||
/* Redo column file, attid is hidden in forkNum */
|
||||
|
||||
rnode.opt = compress ? ((xl_smgr_create_compress*)XLogRecGetData(record))->pageCompressOpts : 0;
|
||||
smgr_redo_create(rnode, xlrec->forkNum, (char *)xlrec);
|
||||
/* Redo column file, attid is hidden in forkNum */
|
||||
} else if (info == XLOG_SMGR_TRUNCATE) {
|
||||
xl_smgr_truncate* xlrec = (xl_smgr_truncate*)XLogRecGetData(record);
|
||||
RelFileNode rnode;
|
||||
RelFileNodeCopy(rnode, xlrec->rnode, XLogRecGetBucketId(record));
|
||||
|
||||
rnode.opt = compress ? ((xl_smgr_truncate_compress*)XLogRecGetData(record))->pageCompressOpts : 0;
|
||||
/*
|
||||
* Forcibly create relation if it doesn't exist (which suggests that
|
||||
* it was dropped somewhere later in the WAL sequence). As in
|
||||
|
|
|
@@ -3695,12 +3695,21 @@ IndexStmt* transformIndexStmt(Oid relid, IndexStmt* stmt, const char* queryStrin
    if (!isColStore && (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_INDEX_TYPE)) &&
        (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIN_INDEX_TYPE)) &&
        (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIST_INDEX_TYPE)) &&
        (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_USTORE_INDEX_TYPE))) {
        /* row store only support btree/ubtree/gin/gist index */
        (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_USTORE_INDEX_TYPE)) &&
        (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE))) {
        /* row store only support btree/ubtree/gin/gist/hash index */
        ereport(ERROR,
            (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                errmsg("access method \"%s\" does not support row store", stmt->accessMethod)));
    }

    if (0 == pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE) &&
        t_thrd.proc->workingVersionNum < SUPPORT_HASH_XLOG_VERSION_NUM) {
        ereport(ERROR,
            (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                errmsg("access method \"%s\" does not support row store", stmt->accessMethod)));
    }

    if (isColStore && (!isPsortMothed && !isCBtreeMethod && !isCGinBtreeMethod)) {
        /* column store support psort/cbtree/gin index */
        ereport(ERROR,
|
|
|
@ -70,6 +70,7 @@
|
|||
#include "storage/custorage.h"
|
||||
#include "storage/smgr/segment.h"
|
||||
#include "storage/cstore/cstore_compress.h"
|
||||
#include "storage/page_compression.h"
|
||||
#include "vecexecutor/vecnodes.h"
|
||||
|
||||
#ifdef PGXC
|
||||
|
@ -792,6 +793,7 @@ int64 calculate_relation_size(RelFileNode* rfn, BackendId backend, ForkNumber fo
|
|||
|
||||
relationpath = relpathbackend(*rfn, backend, forknum);
|
||||
|
||||
bool rowCompress = IS_COMPRESSED_RNODE((*rfn), forknum);
|
||||
for (segcount = 0;; segcount++) {
|
||||
struct stat fst;
|
||||
|
||||
|
@ -808,7 +810,7 @@ int64 calculate_relation_size(RelFileNode* rfn, BackendId backend, ForkNumber fo
|
|||
else
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", pathname)));
|
||||
}
|
||||
totalsize += fst.st_size;
|
||||
totalsize += rowCompress ? CalculateMainForkSize((char*)pathname, rfn, forknum) : fst.st_size;
|
||||
}
|
||||
|
||||
pfree_ext(relationpath);
|
||||
|
|
|
@ -316,6 +316,132 @@ Datum pg_read_binary_file_all(PG_FUNCTION_ARGS)
|
|||
|
||||
PG_RETURN_BYTEA_P(read_binary_file(filename, 0, -1, false));
|
||||
}
|
||||
struct CompressAddressItemState {
|
||||
uint32 blkno;
|
||||
int segmentNo;
|
||||
ReadBlockChunksStruct rbStruct;
|
||||
FILE *pcaFile;
|
||||
};
|
||||
|
||||
static void ReadBinaryFileBlocksFirstCall(PG_FUNCTION_ARGS, int32 startBlockNum, int32 blockCount)
|
||||
{
|
||||
char* path = convert_and_check_filename(PG_GETARG_TEXT_PP(0));
|
||||
int segmentNo = 0;
|
||||
UndoFileType undoFileType = UNDO_INVALID;
|
||||
if (!is_row_data_file(path, &segmentNo, &undoFileType)) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("%s is not a relation file.", path)));
|
||||
}
|
||||
/* create a function context for cross-call persistence */
|
||||
FuncCallContext* fctx = SRF_FIRSTCALL_INIT();
|
||||
|
||||
/* switch to memory context appropriate for multiple function calls */
|
||||
MemoryContext mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
|
||||
|
||||
/* initialize file scanning code */
|
||||
CompressAddressItemState* itemState = (CompressAddressItemState*)palloc(sizeof(CompressAddressItemState));
|
||||
|
||||
/* save mmap to inter_call_data->pcMap */
|
||||
char pcaFilePath[MAXPGPATH];
|
||||
errno_t rc = snprintf_s(pcaFilePath, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, path);
|
||||
securec_check_ss(rc, "\0", "\0");
|
||||
FILE* pcaFile = AllocateFile((const char*)pcaFilePath, "rb");
|
||||
if (pcaFile == NULL) {
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", pcaFilePath)));
|
||||
}
|
||||
PageCompressHeader* map = pc_mmap(fileno(pcaFile), ReadChunkSize(pcaFile, pcaFilePath, MAXPGPATH), true);
|
||||
if (map == MAP_FAILED) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to mmap %s: %m", pcaFilePath)));
|
||||
}
|
||||
if ((BlockNumber)startBlockNum + (BlockNumber)blockCount > map->nblocks) {
|
||||
auto blockNum = map->nblocks;
|
||||
ReleaseMap(map, pcaFilePath);
|
||||
ereport(ERROR,
|
||||
(ERRCODE_INVALID_PARAMETER_VALUE,
|
||||
errmsg("invalid blocknum \"%d\" and block count \"%d\", the max blocknum is \"%u\"",
|
||||
startBlockNum,
|
||||
blockCount,
|
||||
blockNum)));
|
||||
}
|
||||
/* construct ReadBlockChunksStruct */
|
||||
char* pcdFilePath = (char*)palloc0(MAXPGPATH);
|
||||
rc = snprintf_s(pcdFilePath, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, path);
|
||||
securec_check_ss(rc, "\0", "\0");
|
||||
FILE* fp = AllocateFile(pcdFilePath, "rb");
|
||||
if (fp == NULL) {
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", pcdFilePath)));
|
||||
}
|
||||
char* pageBuffer = (char*)palloc(BLCKSZ);
|
||||
itemState->pcaFile = pcaFile;
|
||||
itemState->rbStruct.header = map;
|
||||
itemState->rbStruct.pageBuffer = pageBuffer;
|
||||
itemState->rbStruct.pageBufferLen = BLCKSZ;
|
||||
itemState->rbStruct.fp = fp;
|
||||
itemState->rbStruct.segmentNo = segmentNo;
|
||||
itemState->rbStruct.fileName = pcdFilePath;
|
||||
|
||||
/*
|
||||
* build tupdesc for result tuples. This must match this function's
|
||||
* pg_proc entry!
|
||||
*/
|
||||
TupleDesc tupdesc = CreateTemplateTupleDesc(4, false, TAM_HEAP);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber)1, "path", TEXTOID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber)2, "blocknum", INT4OID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber)3, "len", INT4OID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber)4, "data", BYTEAOID, -1, 0);
|
||||
fctx->tuple_desc = BlessTupleDesc(tupdesc);
|
||||
|
||||
itemState->blkno = startBlockNum;
|
||||
fctx->max_calls = blockCount;
|
||||
fctx->user_fctx = itemState;
|
||||
|
||||
MemoryContextSwitchTo(mctx);
|
||||
}
|
||||
|
||||
Datum pg_read_binary_file_blocks(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 startBlockNum = PG_GETARG_INT32(1);
|
||||
int32 blockCount = PG_GETARG_INT32(2);
|
||||
|
||||
if (startBlockNum < 0 || blockCount <= 0 || startBlockNum + blockCount > RELSEG_SIZE) {
|
||||
ereport(ERROR, (ERRCODE_INVALID_PARAMETER_VALUE,
|
||||
errmsg("invalid blocknum \"%d\" or block count \"%d\"", startBlockNum, blockCount)));
|
||||
}
|
||||
|
||||
/* stuff done only on the first call of the function */
|
||||
if (SRF_IS_FIRSTCALL()) {
|
||||
ReadBinaryFileBlocksFirstCall(fcinfo, startBlockNum, blockCount);
|
||||
}
|
||||
|
||||
/* stuff done on every call of the function */
|
||||
FuncCallContext *fctx = SRF_PERCALL_SETUP();
|
||||
CompressAddressItemState *itemState = (CompressAddressItemState *)fctx->user_fctx;
|
||||
|
||||
if (fctx->call_cntr < fctx->max_calls) {
|
||||
bytea *buf = (bytea *)palloc(BLCKSZ + VARHDRSZ);
|
||||
size_t len = ReadAllChunkOfBlock(VARDATA(buf), BLCKSZ, itemState->blkno, itemState->rbStruct);
|
||||
SET_VARSIZE(buf, len + VARHDRSZ);
|
||||
Datum values[4];
|
||||
values[0] = PG_GETARG_DATUM(0);
|
||||
values[1] = Int32GetDatum(itemState->blkno);
|
||||
values[2] = Int32GetDatum(len);
|
||||
values[3] = PointerGetDatum(buf);
|
||||
|
||||
/* Build and return the result tuple. */
|
||||
bool nulls[4];
|
||||
securec_check(memset_s(nulls, sizeof(nulls), 0, sizeof(nulls)), "\0", "\0");
|
||||
HeapTuple tuple = heap_form_tuple(fctx->tuple_desc, (Datum*)values, (bool*)nulls);
|
||||
Datum result = HeapTupleGetDatum(tuple);
|
||||
itemState->blkno++;
|
||||
SRF_RETURN_NEXT(fctx, result);
|
||||
} else {
|
||||
if (itemState->rbStruct.header != NULL) {
|
||||
pc_munmap(itemState->rbStruct.header);
|
||||
}
|
||||
FreeFile(itemState->pcaFile);
|
||||
FreeFile(itemState->rbStruct.fp);
|
||||
SRF_RETURN_DONE(fctx);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* stat a file
|
||||
|
|
|
@@ -664,3 +664,281 @@ void pglz_decompress(const PGLZ_Header* source, char* dest)
     * That's it.
     */
}

/* ----------
 * lz_compress -
 *
 * Compresses source into dest using strategy. Returns the number of
 * bytes written in buffer dest, or -1 if compression fails.
 * ----------
 */
int32 lz_compress(const char* source, int32 slen, char* dest)
{
unsigned char* bp = (unsigned char*) dest;
|
||||
unsigned char* bstart = bp;
|
||||
int hist_next = 0;
|
||||
bool hist_recycle = false;
|
||||
const char* dp = source;
|
||||
const char* dend = source + slen;
|
||||
unsigned char ctrl_dummy = 0;
|
||||
unsigned char* ctrlp = &ctrl_dummy;
|
||||
unsigned char ctrlb = 0;
|
||||
unsigned char ctrl = 0;
|
||||
bool found_match = false;
|
||||
int32 match_len;
|
||||
int32 match_off;
|
||||
int32 good_match;
|
||||
int32 good_drop;
|
||||
int32 result_size;
|
||||
int32 result_max;
|
||||
int32 need_rate;
|
||||
errno_t rc;
|
||||
|
||||
const PGLZ_Strategy* strategy = PGLZ_strategy_always;
|
||||
/*
|
||||
* Our fallback strategy is the default.
|
||||
*/
|
||||
if (strategy == NULL) {
|
||||
strategy = PGLZ_strategy_default;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the strategy forbids compression (at all or if source chunk size out
|
||||
* of range), fail.
|
||||
*/
|
||||
if (strategy->match_size_good <= 0 || slen < strategy->min_input_size || slen > strategy->max_input_size) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Limit the match parameters to the supported range.
|
||||
*/
|
||||
good_match = strategy->match_size_good;
|
||||
if (good_match > PGLZ_MAX_MATCH) {
|
||||
good_match = PGLZ_MAX_MATCH;
|
||||
} else if (good_match < 17) {
|
||||
good_match = 17;
|
||||
}
|
||||
|
||||
good_drop = strategy->match_size_drop;
|
||||
if (good_drop < 0) {
|
||||
good_drop = 0;
|
||||
} else if (good_drop > 100) {
|
||||
good_drop = 100;
|
||||
}
|
||||
|
||||
need_rate = strategy->min_comp_rate;
|
||||
if (need_rate < 0) {
|
||||
need_rate = 0;
|
||||
} else if (need_rate > 99) {
|
||||
need_rate = 99;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute the maximum result size allowed by the strategy, namely the
|
||||
* input size minus the minimum wanted compression rate. This had better
|
||||
* be <= slen, else we might overrun the provided output buffer.
|
||||
*/
|
||||
if (slen > (INT_MAX / 100)) {
|
||||
/* Approximate to avoid overflow */
|
||||
result_max = (slen / 100) * (100 - need_rate);
|
||||
} else {
|
||||
result_max = (slen * (100 - need_rate)) / 100;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the history lists to empty. We do not need to zero the
|
||||
* hist_entries[] array; its entries are initialized as they are used.
|
||||
*/
|
||||
rc = memset_s(u_sess->utils_cxt.hist_start, HIST_START_LEN, 0, HIST_START_LEN);
|
||||
securec_check(rc, "\0", "\0");
|
||||
|
||||
/*
|
||||
* Compress the source directly into the output buffer.
|
||||
*/
|
||||
while (dp < dend) {
|
||||
/*
|
||||
* If we already exceeded the maximum result size, fail.
|
||||
*
|
||||
* We check once per loop; since the loop body could emit as many as 4
|
||||
* bytes (a control byte and 3-byte tag), PGLZ_MAX_OUTPUT() had better
|
||||
* allow 4 slop bytes.
|
||||
*/
|
||||
if (bp - bstart >= result_max) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we've emitted more than first_success_by bytes without finding
|
||||
* anything compressible at all, fail. This lets us fall out
|
||||
* reasonably quickly when looking at incompressible input (such as
|
||||
* pre-compressed data).
|
||||
*/
|
||||
if (!found_match && bp - bstart >= strategy->first_success_by) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to find a match in the history
|
||||
*/
|
||||
if (pglz_find_match(u_sess->utils_cxt.hist_start, dp, dend, &match_len, &match_off, good_match, good_drop)) {
|
||||
/*
|
||||
* Create the tag and add history entries for all matched
|
||||
* characters.
|
||||
*/
|
||||
pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off);
|
||||
while (match_len--) {
|
||||
pglz_hist_add(
|
||||
u_sess->utils_cxt.hist_start, u_sess->utils_cxt.hist_entries, hist_next, hist_recycle, dp,
|
||||
dend);
|
||||
dp++; /* Do not do this ++ in the line above! */
|
||||
/* The macro would do it four times - Jan. */
|
||||
}
|
||||
found_match = true;
|
||||
} else {
|
||||
/*
|
||||
* No match found. Copy one literal byte.
|
||||
*/
|
||||
pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
|
||||
pglz_hist_add(
|
||||
u_sess->utils_cxt.hist_start, u_sess->utils_cxt.hist_entries, hist_next, hist_recycle, dp, dend);
|
||||
dp++; /* Do not do this ++ in the line above! */
|
||||
/* The macro would do it four times - Jan. */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out the last control byte and check that we haven't overrun the
|
||||
* output size allowed by the strategy.
|
||||
*/
|
||||
*ctrlp = ctrlb;
|
||||
result_size = bp - bstart;
|
||||
if (result_size >= result_max) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* success */
|
||||
return result_size;
|
||||
}
|
||||
|
||||
/* ----------
|
||||
* lz_decompress -
|
||||
*
|
||||
* Decompresses source into dest. Returns the number of bytes
|
||||
* decompressed in the destination buffer, and *optionally*
|
||||
* checks that both the source and dest buffers have been
|
||||
* fully read and written to, respectively.
|
||||
* ----------
|
||||
*/
|
||||
int32 lz_decompress(const char* source, int32 slen, char* dest, int32 rawsize, bool check_complete)
|
||||
{
|
||||
const unsigned char* sp;
|
||||
const unsigned char* srcend;
|
||||
unsigned char* dp;
|
||||
unsigned char* destend;
|
||||
errno_t rc = 0;
|
||||
|
||||
sp = (const unsigned char*) source;
|
||||
srcend = ((const unsigned char*) source) + slen;
|
||||
dp = (unsigned char*) dest;
|
||||
destend = dp + rawsize;
|
||||
|
||||
while (sp < srcend && dp < destend) {
|
||||
/*
|
||||
* Read one control byte and process the next 8 items (or as many as
|
||||
* remain in the compressed input).
|
||||
*/
|
||||
unsigned char ctrl = *sp++;
|
||||
int ctrlc;
|
||||
|
||||
for (ctrlc = 0; ctrlc < 8 && sp < srcend && dp < destend; ctrlc++) {
|
||||
if (ctrl & 1) {
|
||||
/*
|
||||
* Set control bit means we must read a match tag. The match
|
||||
* is coded with two bytes. First byte uses lower nibble to
|
||||
* code length - 3. Higher nibble contains upper 4 bits of the
|
||||
* offset. The next following byte contains the lower 8 bits
|
||||
* of the offset. If the length is coded as 18, another
|
||||
* extension tag byte tells how much longer the match really
|
||||
* was (0-255).
|
||||
*/
|
||||
int32 len;
|
||||
int32 off;
|
||||
|
||||
len = (sp[0] & 0x0f) + 3;
|
||||
off = ((sp[0] & 0xf0) << 4) | sp[1];
|
||||
sp += 2;
|
||||
if (len == 18) {
|
||||
len += *sp++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we copy the bytes specified by the tag from OUTPUT to
|
||||
* OUTPUT (copy len bytes from dp - off to dp). The copied
|
||||
* areas could overlap, to prevent possible uncertainty, we
|
||||
* copy only non-overlapping regions.
|
||||
*/
|
||||
len = Min(len, destend - dp);
|
||||
while (off < len) {
|
||||
/*---------
|
||||
* When offset is smaller than length - source and
|
||||
* destination regions overlap. memmove() is resolving
|
||||
* this overlap in an incompatible way with pglz. Thus we
|
||||
* resort to memcpy()-ing non-overlapping regions.
|
||||
*
|
||||
* Consider input: 112341234123412341234
|
||||
* At byte 5 here ^ we have match with length 16 and
|
||||
* offset 4. 11234M(len=16, off=4)
|
||||
* We are decoding first period of match and rewrite match
|
||||
* 112341234M(len=12, off=8)
|
||||
*
|
||||
* The same match is now at position 9, it points to the
|
||||
* same start byte of output, but from another position:
|
||||
* the offset is doubled.
|
||||
*
|
||||
* We iterate through this offset growth until we can
|
||||
* proceed to usual memcpy(). If we would try to decode
|
||||
* the match at byte 5 (len=16, off=4) by memmove() we
|
||||
* would issue memmove(5, 1, 16) which would produce
|
||||
* 112341234XXXXXXXXXXXX, where series of X is 12
|
||||
* undefined bytes, that were at bytes [5:17].
|
||||
* ---------
|
||||
*/
|
||||
errno_t rc = memcpy_s(dp, off + 1, dp - off, off);
|
||||
securec_check(rc, "", "");
|
||||
len -= off;
|
||||
dp += off;
|
||||
off += off;
|
||||
}
|
||||
rc = memcpy_s(dp, len + 1, dp - off, len);
|
||||
securec_check(rc, "", "");
|
||||
dp += len;
|
||||
} else {
|
||||
/*
|
||||
* An unset control bit means LITERAL BYTE. So we just copy
|
||||
* one from INPUT to OUTPUT.
|
||||
*/
|
||||
*dp++ = *sp++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Advance the control bit
|
||||
*/
|
||||
ctrl >>= 1;
|
||||
        }
    }

    /*
     * Check we decompressed the right amount. If we are slicing, then we
     * won't necessarily be at the end of the source or dest buffers when we
     * hit a stop, so we don't test them.
     */
    if (check_complete && (dp != destend || sp != srcend)) {
        return -1;
    }

    /*
     * That's it.
     */
    return (char*) dp - dest;
}
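A hedged round-trip sketch of how the two routines pair up: lz_compress returns -1 when the input does not shrink by the required rate, in which case the caller is expected to keep the raw page, and check_complete = true makes lz_decompress verify that both buffers were fully consumed. Buffer sizes here are illustrative (a few slop bytes on top of BLCKSZ, per the compressor's own comment), and the snippet assumes it is compiled inside the server tree where BLCKSZ and int32 are available.

/* Illustrative round trip; not part of this commit. */
static bool CompressRoundTrip(const char* page, int32 pageLen)
{
    char compressed[BLCKSZ + 4];        /* worst case needs a few slop bytes */
    char restored[BLCKSZ];

    int32 clen = lz_compress(page, pageLen, compressed);
    if (clen < 0) {
        return false;                   /* incompressible: store the raw page instead */
    }

    /* check_complete = true: both buffers must be fully consumed/produced */
    int32 rawLen = lz_decompress(compressed, clen, restored, pageLen, true);
    return rawLen == pageLen;
}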
|
|
|
@ -59,6 +59,7 @@
|
|||
#include "rewrite/rewriteDefine.h"
|
||||
#include "rewrite/rewriteHandler.h"
|
||||
#include "storage/lmgr.h"
|
||||
#include "storage/page_compression.h"
|
||||
#include "storage/smgr/smgr.h"
|
||||
#include "storage/smgr/segment.h"
|
||||
#include "catalog/storage.h"
|
||||
|
@ -326,6 +327,7 @@ Partition PartitionBuildDesc(Oid targetPartId, StorageType storage_type, bool in
|
|||
return partition;
|
||||
}
|
||||
|
||||
|
||||
void PartitionInitPhysicalAddr(Partition partition)
|
||||
{
|
||||
partition->pd_node.spcNode = ConvertToRelfilenodeTblspcOid(partition->pd_part->reltablespace);
|
||||
|
@ -350,6 +352,12 @@ void PartitionInitPhysicalAddr(Partition partition)
|
|||
partition->pd_id)));
|
||||
}
|
||||
}
|
||||
|
||||
partition->pd_node.opt = 0;
|
||||
if (partition->rd_options) {
|
||||
SetupPageCompressForRelation(&partition->pd_node, &((StdRdOptions*)(partition->rd_options))->compress,
|
||||
PartitionGetPartitionName(partition));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -441,7 +449,7 @@ void PartitionClose(Partition partition)
|
|||
}
|
||||
|
||||
Partition PartitionBuildLocalPartition(const char *relname, Oid partid, Oid partfilenode, Oid parttablespace,
|
||||
StorageType storage_type)
|
||||
StorageType storage_type, Datum reloptions)
|
||||
{
|
||||
Partition part;
|
||||
MemoryContext oldcxt;
|
||||
|
@ -490,6 +498,11 @@ Partition PartitionBuildLocalPartition(const char *relname, Oid partid, Oid part
|
|||
|
||||
if (partfilenode != InvalidOid) {
|
||||
PartitionInitPhysicalAddr(part);
|
||||
/* compressed option was set by PartitionInitPhysicalAddr if part->rd_options != NULL */
|
||||
if (part->rd_options == NULL && reloptions) {
|
||||
StdRdOptions* options = (StdRdOptions*)default_reloptions(reloptions, false, RELOPT_KIND_HEAP);
|
||||
SetupPageCompressForRelation(&part->pd_node, &options->compress, PartitionGetPartitionName(part));
|
||||
}
|
||||
}
|
||||
|
||||
if (storage_type == SEGMENT_PAGE) {
|
||||
|
|
|
@ -176,6 +176,7 @@
|
|||
#include "rewrite/rewriteDefine.h"
|
||||
#include "rewrite/rewriteRlsPolicy.h"
|
||||
#include "storage/lmgr.h"
|
||||
#include "storage/page_compression.h"
|
||||
#include "storage/smgr/smgr.h"
|
||||
#include "storage/smgr/segment.h"
|
||||
#include "threadpool/threadpool.h"
|
||||
|
@ -1284,7 +1285,6 @@ static void IndexSupportInitialize(Relation relation, oidvector* indclass, Strat
|
|||
static OpClassCacheEnt* LookupOpclassInfo(Relation relation, Oid operatorClassOid, StrategyNumber numSupport);
|
||||
static void RelationCacheInitFileRemoveInDir(const char* tblspcpath);
|
||||
static void unlink_initfile(const char* initfilename);
|
||||
|
||||
/*
|
||||
* ScanPgRelation
|
||||
*
|
||||
|
@ -2499,6 +2499,12 @@ void RelationInitPhysicalAddr(Relation relation)
|
|||
if (!RelationIsPartitioned(relation) && relation->storage_type == SEGMENT_PAGE) {
|
||||
relation->rd_node.bucketNode = SegmentBktId;
|
||||
}
|
||||
|
||||
// setup page compression options
|
||||
relation->rd_node.opt = 0;
|
||||
if (relation->rd_options && REL_SUPPORT_COMPRESSED(relation)) {
|
||||
SetupPageCompressForRelation(&relation->rd_node, &((StdRdOptions*)(relation->rd_options))->compress, RelationGetRelationName(relation));
|
||||
}
|
||||
}
|
||||
|
||||
static void IndexRelationInitKeyNums(Relation relation)
|
||||
|
@ -4335,8 +4341,9 @@ void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, SubTrans
|
|||
* and enter it into the relcache.
|
||||
*/
|
||||
Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, TupleDesc tupDesc, Oid relid,
|
||||
Oid relfilenode, Oid reltablespace, bool shared_relation, bool mapped_relation, char relpersistence, char relkind,
|
||||
int8 row_compress, TableAmType tam_type, int8 relindexsplit, StorageType storage_type)
|
||||
Oid relfilenode, Oid reltablespace, bool shared_relation, bool mapped_relation, char relpersistence,
|
||||
char relkind, int8 row_compress, Datum reloptions, TableAmType tam_type, int8 relindexsplit,
|
||||
StorageType storage_type, Oid accessMethodObjectId)
|
||||
{
|
||||
Relation rel;
|
||||
MemoryContext oldcxt;
|
||||
|
@ -4452,6 +4459,7 @@ Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, Tuple
|
|||
rel->rd_rel->relowner = BOOTSTRAP_SUPERUSERID;
|
||||
rel->rd_rel->parttype = PARTTYPE_NON_PARTITIONED_RELATION;
|
||||
rel->rd_rel->relrowmovement = false;
|
||||
rel->rd_rel->relam = accessMethodObjectId;
|
||||
|
||||
/* set up persistence and relcache fields dependent on it */
|
||||
rel->rd_rel->relpersistence = relpersistence;
|
||||
|
@ -4508,6 +4516,13 @@ Relation RelationBuildLocalRelation(const char* relname, Oid relnamespace, Tuple
|
|||
|
||||
RelationInitPhysicalAddr(rel);
|
||||
|
||||
/* compressed option was set by RelationInitPhysicalAddr if rel->rd_options != NULL */
|
||||
if (rel->rd_options == NULL && reloptions && SUPPORT_COMPRESSED(relkind, rel->rd_rel->relam)) {
|
||||
StdRdOptions *options = (StdRdOptions *) default_reloptions(reloptions, false, RELOPT_KIND_HEAP);
|
||||
SetupPageCompressForRelation(&rel->rd_node, &options->compress, RelationGetRelationName(rel));
|
||||
}
|
||||
|
||||
|
||||
/* materialized view not initially scannable */
|
||||
if (relkind == RELKIND_MATVIEW)
|
||||
rel->rd_isscannable = false;
|
||||
|
@@ -8106,6 +8121,45 @@ void GetTdeInfoFromRel(Relation rel, TdeInfo *tde_info)
    }
}

void SetupPageCompressForRelation(RelFileNode* node, PageCompressOpts* compress_options, const char* relationName)
{
    uint1 algorithm = compress_options->compressType;
    if (algorithm == COMPRESS_TYPE_NONE) {
        node->opt = 0;
    } else {
        if (!SUPPORT_PAGE_COMPRESSION) {
            ereport(ERROR, (errmsg("unsupported page compression on this platform")));
        }

        uint1 compressLevel;
        bool symbol = false;
        if (compress_options->compressLevel >= 0) {
            symbol = true;
            compressLevel = compress_options->compressLevel;
        } else {
            symbol = false;
            compressLevel = -compress_options->compressLevel;
        }
        bool success = false;
        uint1 chunkSize = ConvertChunkSize(compress_options->compressChunkSize, &success);
        if (!success) {
            ereport(ERROR, (errmsg("invalid compress_chunk_size %d , must be one of %d, %d, %d or %d for %s",
                compress_options->compressChunkSize, BLCKSZ / 16, BLCKSZ / 8, BLCKSZ / 4, BLCKSZ / 2,
                relationName)));
        }
        uint1 preallocChunks;
        if (compress_options->compressPreallocChunks >= BLCKSZ / compress_options->compressChunkSize) {
            ereport(ERROR, (errmsg("invalid compress_prealloc_chunks %d , must be less than %d for %s",
                compress_options->compressPreallocChunks,
                BLCKSZ / compress_options->compressChunkSize, relationName)));
        } else {
            preallocChunks = (uint1)(compress_options->compressPreallocChunks);
        }
        node->opt = 0;
        SET_COMPRESS_OPTION((*node), compress_options->compressByteConvert, compress_options->compressDiffConvert,
            preallocChunks, symbol, compressLevel, algorithm, chunkSize);
    }
}
char RelationGetRelReplident(Relation r)
{
    bool isNull = false;
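ConvertChunkSize is not shown in this diff; judging from the error message above it accepts only BLCKSZ/16, BLCKSZ/8, BLCKSZ/4 or BLCKSZ/2 and folds the choice into a small code that fits the packed opt field. A hedged sketch of that mapping, where the numeric codes are assumptions and only the accepted sizes come from the message:

/* Sketch only: encode a chunk size as a small code; the exact codes are assumed. */
static uint1 ConvertChunkSizeSketch(uint32 chunkSize, bool* success)
{
    *success = true;
    switch (chunkSize) {
        case BLCKSZ / 2:
            return 0;
        case BLCKSZ / 4:
            return 1;
        case BLCKSZ / 8:
            return 2;
        case BLCKSZ / 16:
            return 3;
        default:
            *success = false;
            return 0;
    }
}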
|
|
|
@@ -59,7 +59,7 @@ bool open_join_children = true;
bool will_shutdown = false;

/* hard-wired binary version number */
const uint32 GRAND_VERSION_NUM = 92602;
const uint32 GRAND_VERSION_NUM = 92603;

const uint32 PREDPUSH_SAME_LEVEL_VERSION_NUM = 92522;
const uint32 UPSERT_WHERE_VERSION_NUM = 92514;

@@ -108,6 +108,7 @@ const uint32 V5R2C00_START_VERSION_NUM = 92350;
const uint32 V5R2C00_BACKEND_VERSION_NUM = 92412;

const uint32 ANALYZER_HOOK_VERSION_NUM = 92592;
const uint32 SUPPORT_HASH_XLOG_VERSION_NUM = 92603;

/* This variable indicates whether the instance is in progress of upgrade as a whole */
uint32 volatile WorkingGrandVersionNum = GRAND_VERSION_NUM;
|
|
|
@ -971,6 +971,7 @@ const char* const config_group_names[] = {
|
|||
/* INSTRUMENTS_OPTIONS */
|
||||
gettext_noop("Instruments Options"),
|
||||
gettext_noop("Column Encryption"),
|
||||
gettext_noop("Compress Options"),
|
||||
#ifdef PGXC
|
||||
/* DATA_NODES */
|
||||
gettext_noop("Datanodes and Connection Pooling"),
|
||||
|
|
|
@ -114,6 +114,7 @@ bool gs_memory_enjection(void)
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* check if the node is on heavy memory status now?
|
||||
* is strict is true, we'll do some pre-judgement.
|
||||
|
@@ -907,6 +908,36 @@ int MemoryProtectFunctions::gs_posix_memalign(void** memptr, Size alignment, Siz
    return ENOMEM; /* insufficient memory */
}

/**
 * reserve memory for mmap of compressed table
 * @tparam mem_type MEM_SHRD is supported only
 * @param sz reserved size(bytes)
 * @param needProtect
 * @return success or not
 */
template <MemType type>
bool MemoryProtectFunctions::gs_memprot_reserve(Size sz, bool needProtect)
{
    if (type != MEM_SHRD) {
        return false;
    }
    return memTracker_ReserveMem<type>(sz, needProtect);
}

/**
 * release the memory allocated by gs_memprot_reserve
 * @tparam type MEM_SHRD is supported only
 * @param sz free size(bytes)
 */
template <MemType type>
void MemoryProtectFunctions::gs_memprot_release(Size sz)
{
    if (type != MEM_SHRD) {
        return;
    }
    memTracker_ReleaseMem<type>(sz);
}

/* thread level initialization */
void gs_memprot_thread_init(void)
{
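A hedged sketch of the intended pairing: reserve against the shared-memory budget before mmap-ing a compressed table's address file, and release after unmapping or on failure. The mmap call, file descriptor and sizes are placeholders, not code from this commit.

#include <sys/mman.h>

/* Illustrative pairing only; error reporting trimmed. */
static void* MapPcaWithAccounting(int fd, Size mapSize)
{
    if (!MemoryProtectFunctions::gs_memprot_reserve<MEM_SHRD>(mapSize, true)) {
        return nullptr; /* shared-memory budget exhausted */
    }
    void* addr = mmap(nullptr, mapSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        MemoryProtectFunctions::gs_memprot_release<MEM_SHRD>(mapSize);
        return nullptr;
    }
    return addr;
}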
|
|
|
@ -452,9 +452,6 @@ static void ResourceOwnerReleaseInternal(
|
|||
MemoryContextDelete(memContext);
|
||||
ResourceOwnerForgetGMemContext(t_thrd.utils_cxt.TopTransactionResourceOwner, memContext);
|
||||
}
|
||||
|
||||
/* Clean up index scans too */
|
||||
ReleaseResources_hash();
|
||||
}
|
||||
|
||||
t_thrd.utils_cxt.CurrentResourceOwner = save;
|
||||
|
|
|
@ -110,6 +110,7 @@
|
|||
#include <limits.h>
|
||||
|
||||
#include "access/nbtree.h"
|
||||
#include "access/hash.h"
|
||||
#include "access/tableam.h"
|
||||
#include "access/ustore/knl_utuple.h"
|
||||
#include "access/tableam.h"
|
||||
|
@ -415,6 +416,7 @@ struct Tuplesortstate {
|
|||
* These variables are specific to the IndexTuple case; they are set by
|
||||
* tuplesort_begin_index_xxx and used only by the IndexTuple routines.
|
||||
*/
|
||||
Relation heapRel; /* table the index is being built on */
|
||||
Relation indexRel; /* index being built */
|
||||
|
||||
/* These are specific to the index_btree subcase: */
|
||||
|
@ -422,7 +424,9 @@ struct Tuplesortstate {
|
|||
bool enforceUnique; /* complain if we find duplicate tuples */
|
||||
|
||||
/* These are specific to the index_hash subcase: */
|
||||
uint32 hash_mask; /* mask for sortable part of hash code */
|
||||
uint32 high_mask; /* masks for sortable part of hash code */
|
||||
uint32 low_mask;
|
||||
uint32 max_buckets;
|
||||
|
||||
/*
|
||||
* These variables are specific to the Datum case; they are set by
|
||||
|
@ -970,7 +974,8 @@ Tuplesortstate* tuplesort_begin_index_btree(
|
|||
}
|
||||
|
||||
Tuplesortstate* tuplesort_begin_index_hash(
|
||||
Relation indexRel, uint32 hash_mask, int workMem, bool randomAccess, int maxMem)
|
||||
Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask,
|
||||
uint32 max_buckets, int workMem, bool randomAccess, int maxMem)
|
||||
{
|
||||
Tuplesortstate* state = tuplesort_begin_common(workMem, randomAccess);
|
||||
MemoryContext oldcontext;
|
||||
|
@ -980,11 +985,12 @@ Tuplesortstate* tuplesort_begin_index_hash(
|
|||
#ifdef TRACE_SORT
|
||||
if (u_sess->attr.attr_common.trace_sort) {
|
||||
elog(LOG,
|
||||
"begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c, maxMem = %d",
|
||||
hash_mask,
|
||||
workMem,
|
||||
randomAccess ? 't' : 'f',
|
||||
maxMem);
|
||||
"begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
|
||||
"max_buckets = 0x%x, workMem = %d, randomAccess = %c",
|
||||
high_mask,
|
||||
low_mask,
|
||||
max_buckets,
|
||||
workMem, randomAccess ? 't' : 'f');
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -999,9 +1005,12 @@ Tuplesortstate* tuplesort_begin_index_hash(
|
|||
#endif
|
||||
state->reversedirection = reversedirection_index_hash;
|
||||
|
||||
state->heapRel = heapRel;
|
||||
state->indexRel = indexRel;
|
||||
|
||||
state->hash_mask = hash_mask;
|
||||
state->high_mask = high_mask;
|
||||
state->low_mask = low_mask;
|
||||
state->max_buckets = max_buckets;
|
||||
state->maxMem = maxMem * 1024L;
|
||||
|
||||
(void)MemoryContextSwitchTo(oldcontext);
|
||||
|
@@ -3810,8 +3819,8 @@ static int comparetup_index_btree(const SortTuple* a, const SortTuple* b, Tuples

static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tuplesortstate* state)
{
    uint32 hash1;
    uint32 hash2;
    Bucket bucket1;
    Bucket bucket2;
    IndexTuple tuple1;
    IndexTuple tuple2;

@@ -3820,13 +3829,17 @@ static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tupleso
     * that the first column of the index tuple is the hash key.
     */
    Assert(!a->isnull1);
    hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
    bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
        state->max_buckets, state->high_mask,
        state->low_mask);
    Assert(!b->isnull1);
    hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
    bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
        state->max_buckets, state->high_mask,
        state->low_mask);

    if (hash1 > hash2) {
    if (bucket1 > bucket2) {
        return 1;
    } else if (hash1 < hash2) {
    } else if (bucket1 < bucket2) {
        return -1;
    }
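The comparison now orders tuples by target bucket rather than by masked hash value, matching how the hash AM assigns buckets during a sorted index build. _hash_hashkey2bucket itself comes from the hash AM headers (access/hash.h is newly included at the top of this file); for reference, the standard PostgreSQL mask-and-wrap mapping it implements looks like the sketch below, reproduced from memory rather than quoted from this tree.

/* Map a hash key to a bucket: mask with the high mask, and if that lands
 * past the highest bucket that currently exists, wrap with the low mask. */
Bucket
_hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)
{
    Bucket bucket;

    bucket = hashkey & highmask;
    if (bucket > maxbucket)
        bucket = bucket & lowmask;

    return bucket;
}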
|
||||
|
|
|
@ -696,6 +696,7 @@ ifneq ($(with_openeuler_os), yes)
|
|||
cp -d $(with_3rd)/$(BINARYPATH)/event/$(LIB_SUPPORT_LLT)/lib/libevent* '$(DESTDIR)$(libdir)/'
|
||||
cp $(SECUREDYNAMICLIB_HOME)/libsecurec* '$(DESTDIR)$(libdir)/'
|
||||
ifneq (, $(findstring __USE_NUMA, $(CFLAGS)))
|
||||
cp $(ZSTD_LIB_PATH)/libzstd* '$(DESTDIR)$(libdir)/'
|
||||
cp $(NUMA_LIB_PATH)/* '$(DESTDIR)$(libdir)/'
|
||||
endif
|
||||
ifeq ($(enable_mot), yes)
|
||||
|
|
|
@ -234,6 +234,7 @@ Boot_CreateStmt:
|
|||
mapped_relation,
|
||||
true,
|
||||
REL_CMPRS_NOT_SUPPORT,
|
||||
(Datum)0,
|
||||
BOOTSTRAP_SUPERUSERID,
|
||||
false,
|
||||
TAM_HEAP,
|
||||
|
|
|
@ -246,6 +246,8 @@ int GetRemoteConnInfo(char* remoteAddress, char* remoteReadConnInfo, int len)
|
|||
* @IN spcnode: tablespace id
|
||||
* @IN dbnode: database id
|
||||
* @IN relnode: relfilenode
|
||||
* @IN bucketnode: bucketnode
|
||||
* @IN opt: compressed table options
|
||||
* @IN/OUT forknum: forknum
|
||||
* @IN/OUT blocknum: block number
|
||||
* @IN/OUT blocksize: block size
|
||||
|
@ -284,7 +286,7 @@ extern int RemoteGetPage(char* remoteAddress, RepairBlockKey *key, uint32 blocks
|
|||
tnRet = snprintf_s(sqlCommands, MAX_PATH_LEN, MAX_PATH_LEN - 1,
|
||||
"SELECT gs_read_block_from_remote(%u, %u, %u, %d, %d, '%lu', %u, '%lu', false, %d);",
|
||||
key->relfilenode.spcNode, key->relfilenode.dbNode, key->relfilenode.relNode,
|
||||
key->relfilenode.bucketNode, key->forknum, key->blocknum, blocksize, lsn, timeout);
|
||||
key->relfilenode.bucketNode, key->relfilenode.opt, key->forknum, key->blocknum, blocksize, lsn, timeout);
|
||||
}
|
||||
|
||||
securec_check_ss(tnRet, "", "");
|
||||
|
|
|
@ -1054,6 +1054,14 @@ Oid DefineIndex(Oid relationId, IndexStmt* stmt, Oid indexRelationId, bool is_al
|
|||
}
|
||||
}
|
||||
|
||||
TableCreateSupport indexCreateSupport{false,false,false,false,false,false};
|
||||
ListCell* cell = NULL;
|
||||
foreach (cell, stmt->options) {
|
||||
DefElem* defElem = (DefElem*)lfirst(cell);
|
||||
SetOneOfCompressOption(defElem->defname, &indexCreateSupport);
|
||||
}
|
||||
|
||||
CheckCompressOption(&indexCreateSupport);
|
||||
/*
|
||||
* Parse AM-specific options, convert to text array form, validate.
|
||||
*/
|
||||
|
|
|
@ -125,6 +125,7 @@
|
|||
#include "storage/freespace.h"
|
||||
#include "storage/lmgr.h"
|
||||
#include "storage/lock/lock.h"
|
||||
#include "storage/page_compression.h"
|
||||
#include "storage/predicate.h"
|
||||
#include "storage/remote_read.h"
|
||||
#include "storage/smgr/segment.h"
|
||||
|
@ -1090,10 +1091,10 @@ static bool isOrientationSet(List* options, bool* isCUFormat, bool isDfsTbl)
|
|||
* @Param [IN] relkind: table's kind(ordinary table or other database object).
|
||||
* @return: option with defalut options.
|
||||
*/
|
||||
static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 relcmprs, Oid relnamespace)
|
||||
static List* AddDefaultOptionsIfNeed(List* options, const char relkind, CreateStmt* stmt, Oid relnamespace)
|
||||
{
|
||||
List* res = options;
|
||||
|
||||
int8 relcmprs = stmt->row_compress;
|
||||
ListCell* cell = NULL;
|
||||
bool isCStore = false;
|
||||
bool isTsStore = false;
|
||||
|
@ -1102,6 +1103,7 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel
|
|||
bool isUstore = false;
|
||||
bool assignedStorageType = false;
|
||||
|
||||
TableCreateSupport tableCreateSupport{false,false,false,false,false,false};
|
||||
(void)isOrientationSet(options, NULL, false);
|
||||
foreach (cell, options) {
|
||||
DefElem* def = (DefElem*)lfirst(cell);
|
||||
|
@ -1131,6 +1133,8 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel
|
|||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_OPTION),
|
||||
errmsg("It is not allowed to assign version option for non-dfs table.")));
|
||||
} else {
|
||||
SetOneOfCompressOption(def->defname, &tableCreateSupport);
|
||||
}
|
||||
|
||||
if (pg_strcasecmp(def->defname, "orientation") == 0 && pg_strcasecmp(defGetString(def), ORIENTATION_ORC) == 0) {
|
||||
|
@ -1156,6 +1160,15 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel
|
|||
res = lappend(options, def);
|
||||
}
|
||||
|
||||
bool noSupportTable = isCStore || isTsStore || relkind != RELKIND_RELATION ||
|
||||
stmt->relation->relpersistence == RELPERSISTENCE_UNLOGGED ||
|
||||
stmt->relation->relpersistence == RELPERSISTENCE_TEMP ||
|
||||
stmt->relation->relpersistence == RELPERSISTENCE_GLOBAL_TEMP;
|
||||
if (noSupportTable && tableCreateSupport.compressType) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION), errmsg("only row orientation table support compresstype.")));
|
||||
}
|
||||
CheckCompressOption(&tableCreateSupport);
|
||||
|
||||
if (isUstore && !isCStore && !hasCompression) {
|
||||
DefElem* def = makeDefElem("compression", (Node *)makeString(COMPRESSION_NO));
|
||||
res = lappend(options, def);
|
||||
|
@ -1191,7 +1204,7 @@ static List* AddDefaultOptionsIfNeed(List* options, const char relkind, int8 rel
|
|||
DefElem *def1 = makeDefElem("orientation", (Node *)makeString(ORIENTATION_ROW));
|
||||
res = lcons(def1, options);
|
||||
}
|
||||
if (!hasCompression) {
|
||||
if (!hasCompression && !tableCreateSupport.compressType) {
|
||||
DefElem *def2 = makeDefElem("compression", (Node *)rowCmprOpt);
|
||||
res = lappend(options, def2);
|
||||
}
|
||||
|
@ -2124,7 +2137,7 @@ Oid DefineRelation(CreateStmt* stmt, char relkind, Oid ownerId, bool isCTAS)
|
|||
/* Add default options for relation if need. */
|
||||
if (!dfsTablespace) {
|
||||
if (!u_sess->attr.attr_common.IsInplaceUpgrade) {
|
||||
stmt->options = AddDefaultOptionsIfNeed(stmt->options, relkind, stmt->row_compress, namespaceId);
|
||||
stmt->options = AddDefaultOptionsIfNeed(stmt->options, relkind, stmt, namespaceId);
|
||||
}
|
||||
} else {
|
||||
checkObjectCreatedinHDFSTblspc(stmt, relkind);
|
||||
|
@ -2364,10 +2377,13 @@ Oid DefineRelation(CreateStmt* stmt, char relkind, Oid ownerId, bool isCTAS)
|
|||
ereport(LOG, (errmodule(MOD_TIMESERIES), errmsg("use implicit distribution column method.")));
|
||||
}
|
||||
} else if (pg_strcasecmp(storeChar, TABLE_ACCESS_METHOD_USTORE) == 0) {
|
||||
if (pg_strcasecmp(COMPRESSION_NO, StdRdOptionsGetStringData(std_opt, compression, COMPRESSION_NO)) != 0 ||
|
||||
auto compression = StdRdOptionsGetStringData(std_opt, compression, COMPRESSION_NO);
|
||||
auto orientation = StdRdOptionsGetStringData(std_opt, orientation, ORIENTATION_ROW);
|
||||
if ((pg_strcasecmp(COMPRESSION_NO, compression) != 0 &&
|
||||
pg_strcasecmp(ORIENTATION_COLUMN, orientation) == 0) ||
|
||||
IsCompressedByCmprsInPgclass((RelCompressType)stmt->row_compress)) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("UStore tables do not support compression.")));
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("UStore tables do not support compression.")));
|
||||
}
|
||||
ForbidToSetOptionsForRowTbl(stmt->options);
|
||||
ForbidToSetOptionsForUstoreTbl(stmt->options);
|
||||
|
@ -14428,6 +14444,67 @@ static void ATExecSetRelOptionsToast(Oid toastid, List* defList, AlterTableType
|
|||
heap_close(pgclass, RowExclusiveLock);
|
||||
}
|
||||
|
||||
/**
|
||||
* Forbid ALTER TABLE from modifying compression parameters in unsupported ways.
|
||||
*/
|
||||
void static CheckSupportModifyCompression(Relation rel, bytea* relOoption, List* defList)
|
||||
{
|
||||
if (!relOoption) {
|
||||
return;
|
||||
}
|
||||
if (!REL_SUPPORT_COMPRESSED(rel) || rel->rd_node.opt == 0) {
|
||||
ForbidUserToSetCompressedOptions(defList);
|
||||
return;
|
||||
}
|
||||
PageCompressOpts* newCompressOpt = &(((StdRdOptions*)relOoption)->compress);
|
||||
RelFileCompressOption current;
|
||||
TransCompressOptions(rel->rd_node, ¤t);
|
||||
if (newCompressOpt) {
|
||||
if (newCompressOpt->compressType != (int)current.compressAlgorithm) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("change compresstype OPTION is not supported")));
|
||||
}
|
||||
if ((int)current.compressAlgorithm != COMPRESS_TYPE_NONE &&
|
||||
newCompressOpt->compressChunkSize != CHUNK_SIZE_LIST[current.compressChunkSize]) {
|
||||
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("change compress_chunk_size OPTION is not supported")));
|
||||
}
|
||||
if (!newCompressOpt->compressByteConvert && newCompressOpt->compressDiffConvert) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION),
|
||||
errmsg("compress_diff_convert should be used with compress_byte_convert.")));
|
||||
}
|
||||
if (current.compressAlgorithm == COMPRESS_TYPE_PGLZ) {
|
||||
ListCell *opt = NULL;
|
||||
foreach (opt, defList) {
|
||||
DefElem *def = (DefElem *)lfirst(opt);
|
||||
if (pg_strcasecmp(def->defname, "compress_level") == 0) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION),
|
||||
errmsg("compress_level should be used with ZSTD algorithm.")));
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if ((int)current.compressAlgorithm != COMPRESS_TYPE_NONE) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("change compresstype OPTION is not supported")));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* forbid modify partition CompressOption
|
||||
*/
|
||||
if (HEAP_IS_PARTITIONED(rel)) {
|
||||
if ((int)current.compressLevel != newCompressOpt->compressLevel) {
|
||||
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("change partition compressLevel OPTION is not supported")));
|
||||
}
|
||||
if ((int)current.compressPreallocChunks != newCompressOpt->compressPreallocChunks) {
|
||||
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("change partition compress_prealloc_chunks OPTION is not supported")));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
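CheckSupportModifyCompression above boils down to a small rule set: compresstype can never be changed; once a relation is compressed its compress_chunk_size is frozen too; compress_diff_convert still requires compress_byte_convert; compress_level only makes sense with zstd; and partitioned tables additionally refuse changes to compress_level and compress_prealloc_chunks. The standalone sketch below restates that decision logic with plain structs in place of the kernel's StdRdOptions/RelFileCompressOption (all names and the simplified chunk-size comparison are inventions of the sketch, not the committed code):

    #include <stdbool.h>
    #include <stdio.h>

    enum { COMPRESS_NONE = 0, COMPRESS_PGLZ = 1, COMPRESS_ZSTD = 2 };

    typedef struct {            /* simplified stand-in for the relation's current options */
        int algorithm;
        int chunk_size;
        int prealloc_chunks;
        int level;
    } Current;

    typedef struct {            /* simplified stand-in for the requested new options */
        int  algorithm;
        int  chunk_size;
        int  prealloc_chunks;
        int  level;
        bool byte_convert;
        bool diff_convert;
        bool sets_level;        /* was compress_level named in the ALTER statement? */
    } Requested;

    /* Returns NULL if the change is allowed, otherwise the reason for rejecting it. */
    static const char *check_modify(const Current *cur, const Requested *req, bool partitioned)
    {
        if (req->algorithm != cur->algorithm)
            return "change compresstype OPTION is not supported";
        if (cur->algorithm != COMPRESS_NONE && req->chunk_size != cur->chunk_size)
            return "change compress_chunk_size OPTION is not supported";
        if (!req->byte_convert && req->diff_convert)
            return "compress_diff_convert should be used with compress_byte_convert";
        if (cur->algorithm == COMPRESS_PGLZ && req->sets_level)
            return "compress_level should be used with ZSTD algorithm";
        if (partitioned) {
            if (req->level != cur->level)
                return "change partition compressLevel OPTION is not supported";
            if (req->prealloc_chunks != cur->prealloc_chunks)
                return "change partition compress_prealloc_chunks OPTION is not supported";
        }
        return NULL;
    }

    int main(void)
    {
        Current   cur = { COMPRESS_ZSTD, 4096, 2, 1 };
        Requested req = { COMPRESS_ZSTD, 2048, 2, 1, true, false, false };  /* tries to shrink chunks */
        const char *why = check_modify(&cur, &req, false);
        printf("%s\n", why ? why : "allowed");
        return 0;
    }

In the kernel function the current chunk size is looked up through CHUNK_SIZE_LIST; the sketch compares raw sizes for brevity.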
||||
/*
|
||||
* Set, reset, or replace reloptions.
|
||||
*/
|
||||
|
@ -14567,6 +14644,7 @@ static void ATExecSetRelOptions(Relation rel, List* defList, AlterTableType oper
|
|||
}
|
||||
|
||||
/* Validate */
|
||||
bytea* relOpt = NULL;
|
||||
switch (rel->rd_rel->relkind) {
|
||||
case RELKIND_RELATION: {
|
||||
/* this options only can be used when define a new relation.
|
||||
|
@ -14575,6 +14653,7 @@ static void ATExecSetRelOptions(Relation rel, List* defList, AlterTableType oper
|
|||
ForbidUserToSetDefinedOptions(defList);
|
||||
|
||||
bytea* heapRelOpt = heap_reloptions(rel->rd_rel->relkind, newOptions, true);
|
||||
relOpt = heapRelOpt;
|
||||
const char* algo = RelationGetAlgo(rel);
|
||||
newRelHasUids = StdRdOptionsHasUids(heapRelOpt, RELKIND_RELATION);
|
||||
if (rel->rd_rel->relhasoids && newRelHasUids) {
|
||||
|
@ -14617,18 +14696,21 @@ static void ATExecSetRelOptions(Relation rel, List* defList, AlterTableType oper
|
|||
break;
|
||||
}
|
||||
case RELKIND_INDEX:
|
||||
case RELKIND_GLOBAL_INDEX:
|
||||
case RELKIND_GLOBAL_INDEX: {
|
||||
ForbidUserToSetDefinedIndexOptions(defList);
|
||||
Assert(oldRelHasUids == false);
|
||||
(void)index_reloptions(rel->rd_am->amoptions, newOptions, true);
|
||||
relOpt = index_reloptions(rel->rd_am->amoptions, newOptions, true);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
|
||||
errmsg("\"%s\" is not a table, view, materialized view, index, or TOAST table", RelationGetRelationName(rel))));
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
CheckSupportModifyCompression(rel, relOpt, defList);
|
||||
|
||||
/*
|
||||
* All we need do here is update the pg_class row; the new options will be
|
||||
* propagated into relcaches during post-commit cache inval.
|
||||
|
@ -22257,6 +22339,11 @@ static void checkCompressForExchange(Relation partTableRel, Relation ordTableRel
|
|||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("tables in ALTER TABLE EXCHANGE PARTITION must have the same type of compress")));
|
||||
}
|
||||
if (partTableRel->rd_node.opt != ordTableRel->rd_node.opt) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("tables in ALTER TABLE EXCHANGE PARTITION must have the same type of compress")));
|
||||
}
|
||||
}
|
||||
|
||||
// Description : Check number, type of column
|
||||
|
@ -24317,9 +24404,16 @@ static char* GenTemporaryPartitionName(Relation partTableRel, int sequence)
|
|||
return pstrdup(tmpName);
|
||||
}
|
||||
|
||||
#ifndef ENABLE_MULTIPLE_NODES
|
||||
static Oid GetNewPartitionOid(Relation pgPartRel, Relation partTableRel, Node *partDef, Oid bucketOid,
|
||||
bool *isTimestamptz, StorageType stype, Datum new_reloptions)
|
||||
{
|
||||
#else
|
||||
static Oid GetNewPartitionOid(Relation pgPartRel, Relation partTableRel, Node *partDef,
|
||||
Oid bucketOid, bool *isTimestamptz, StorageType stype)
|
||||
{
|
||||
Datum new_reloptions = (Datum)0;
|
||||
#endif
|
||||
Oid newPartOid = InvalidOid;
|
||||
switch (nodeTag(partDef)) {
|
||||
case T_RangePartitionDefState:
|
||||
|
@ -24412,9 +24506,13 @@ static Oid AddTemporaryPartition(Relation partTableRel, Node* partDef)
|
|||
}
|
||||
|
||||
/* Temporary tables do not use segment-page */
|
||||
#ifndef ENABLE_MULTIPLE_NODES
|
||||
newPartOid = GetNewPartitionOid(pgPartRel, partTableRel, partDef, bucketOid,
|
||||
isTimestamptz, RelationGetStorageType(partTableRel), (Datum)new_reloptions);
|
||||
|
||||
isTimestamptz, RelationGetStorageType(partTableRel), new_reloptions);
|
||||
#else
|
||||
newPartOid = GetNewPartitionOid(
|
||||
pgPartRel, partTableRel, partDef, bucketOid, isTimestamptz, RelationGetStorageType(partTableRel));
|
||||
#endif
|
||||
// We must bump the command counter to make the newly-created
|
||||
// partition tuple visible for opening.
|
||||
CommandCounterIncrement();
|
||||
|
@ -24736,6 +24834,7 @@ static void fastAddPartition(Relation partTableRel, List* destPartDefList, List*
|
|||
|
||||
pgPartRel = relation_open(PartitionRelationId, RowExclusiveLock);
|
||||
|
||||
#ifndef ENABLE_MULTIPLE_NODES
|
||||
bool isNull = false;
|
||||
HeapTuple tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(partTableRel->rd_id));
|
||||
Datum relOptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions, &isNull);
|
||||
|
@ -24743,6 +24842,7 @@ static void fastAddPartition(Relation partTableRel, List* destPartDefList, List*
|
|||
Datum newRelOptions = transformRelOptions((Datum)0, oldRelOptions, NULL, NULL, false, false);
|
||||
ReleaseSysCache(tuple);
|
||||
list_free_ext(oldRelOptions);
|
||||
#endif
|
||||
|
||||
foreach (cell, destPartDefList) {
|
||||
RangePartitionDefState* partDef = (RangePartitionDefState*)lfirst(cell);
|
||||
|
@ -24753,7 +24853,11 @@ static void fastAddPartition(Relation partTableRel, List* destPartDefList, List*
|
|||
bucketOid,
|
||||
partDef,
|
||||
partTableRel->rd_rel->relowner,
|
||||
#ifndef ENABLE_MULTIPLE_NODES
|
||||
(Datum)newRelOptions,
|
||||
#else
|
||||
(Datum)0,
|
||||
#endif
|
||||
isTimestamptz,
|
||||
RelationGetStorageType(partTableRel),
|
||||
AccessExclusiveLock);
|
||||
|
|
|
@ -301,10 +301,12 @@ void PrepForRead(char* path, int64 blocknum, bool is_segment, RelFileNode *relno
|
|||
char* bucketNodestr = strstr(path, "_b");
|
||||
if (NULL != bucketNodestr) {
|
||||
bucketNodestr += 2; /* delete first two chars: _b */
|
||||
flag = StrToInt32(bucketNodestr, &(relfilenode.rnode.node.bucketNode));
|
||||
int _bucketNode;
|
||||
flag = StrToInt32(bucketNodestr, &_bucketNode); // carrottodo
|
||||
if (!flag) {
|
||||
ereport(ERROR, (errmsg("Can not covert %s to int32 type. \n", bucketNodestr)));
|
||||
}
|
||||
relfilenode.rnode.node.bucketNode = (int2)_bucketNode;
|
||||
rc = strncpy_s(pathFirstpart, MAXFNAMELEN, path, strlen(path) - strlen(bucketNodestr));
|
||||
securec_check(rc, "\0", "\0");
|
||||
}
|
||||
|
@ -852,13 +854,14 @@ Datum gs_read_segment_block_from_remote(PG_FUNCTION_ARGS)
|
|||
uint32 dbNode = PG_GETARG_UINT32(1);
|
||||
uint32 relNode = PG_GETARG_UINT32(2);
|
||||
int16 bucketNode = PG_GETARG_INT16(3);
|
||||
int32 forkNum = PG_GETARG_INT32(4);
|
||||
uint64 blockNum = (uint64)PG_GETARG_TRANSACTIONID(5);
|
||||
uint32 blockSize = PG_GETARG_UINT32(6);
|
||||
uint64 lsn = (uint64)PG_GETARG_TRANSACTIONID(7);
|
||||
uint32 seg_relNode = PG_GETARG_UINT32(8);
|
||||
uint32 seg_block = PG_GETARG_UINT32(9);
|
||||
int32 timeout = PG_GETARG_INT32(10);
|
||||
uint16 opt = PG_GETARG_INT16(4);
|
||||
int32 forkNum = PG_GETARG_INT32(5);
|
||||
uint64 blockNum = (uint64)PG_GETARG_TRANSACTIONID(6);
|
||||
uint32 blockSize = PG_GETARG_UINT32(7);
|
||||
uint64 lsn = (uint64)PG_GETARG_TRANSACTIONID(8);
|
||||
uint32 seg_relNode = PG_GETARG_UINT32(9);
|
||||
uint32 seg_block = PG_GETARG_UINT32(10);
|
||||
int32 timeout = PG_GETARG_INT32(11);
|
||||
|
||||
XLogPhyBlock pblk = {
|
||||
.relNode = seg_relNode,
|
||||
|
@ -871,6 +874,7 @@ Datum gs_read_segment_block_from_remote(PG_FUNCTION_ARGS)
|
|||
key.relfilenode.dbNode = dbNode;
|
||||
key.relfilenode.relNode = relNode;
|
||||
key.relfilenode.bucketNode = bucketNode;
|
||||
key.relfilenode.opt = opt;
|
||||
key.forknum = forkNum;
|
||||
key.blocknum = blockNum;
|
||||
|
||||
|
|
|
@ -626,7 +626,7 @@ static uint32 ckpt_qsort_dirty_page_for_flush(bool *is_new_relfilenode, uint32 f
|
|||
item->bucketNode = buf_desc->tag.rnode.bucketNode;
|
||||
item->forkNum = buf_desc->tag.forkNum;
|
||||
item->blockNum = buf_desc->tag.blockNum;
|
||||
if(IsSegmentFileNode(buf_desc->tag.rnode)) {
|
||||
if(IsSegmentFileNode(buf_desc->tag.rnode) || buf_desc->tag.rnode.opt != 0) {
|
||||
*is_new_relfilenode = true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4521,6 +4521,12 @@ const char* pgstat_get_wait_io(WaitEventIO w)
|
|||
case WAIT_EVENT_LOGCTRL_SLEEP:
|
||||
event_name = "LOGCTRL_SLEEP";
|
||||
break;
|
||||
case WAIT_EVENT_COMPRESS_ADDRESS_FILE_FLUSH:
|
||||
event_name = "PCA_FLUSH";
|
||||
break;
|
||||
case WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC:
|
||||
event_name = "PCA_SYNC";
|
||||
break;
|
||||
/* no default case, so that compiler will warn */
|
||||
case IO_EVENT_NUM:
|
||||
break;
|
||||
|
|
|
@ -2156,6 +2156,8 @@ int PostmasterMain(int argc, char* argv[])
|
|||
ngroup_info_hash_create();
|
||||
/*init Role id hash table*/
|
||||
InitRoleIdHashTable();
|
||||
/* pcmap */
|
||||
RealInitialMMapLockArray();
|
||||
/* init unique sql */
|
||||
InitUniqueSQL();
|
||||
/* init hypo index */
|
||||
|
|
|
@ -113,6 +113,10 @@ static relopt_bool boolRelOpts[] = {
|
|||
{{ "crossbucket", "Enables cross bucket index creation in this index relation", RELOPT_KIND_BTREE}, false },
|
||||
{{ "enable_tde", "enable table's level transparent data encryption", RELOPT_KIND_HEAP }, false },
|
||||
{{ "hasuids", "Enables uids in this relation", RELOPT_KIND_HEAP }, false },
|
||||
{{ "compress_byte_convert", "Whether do byte convert in compression", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE},
|
||||
false },
|
||||
{{ "compress_diff_convert", "Whether do diiffer convert in compression", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE},
|
||||
false },
|
||||
/* list terminator */
|
||||
{{NULL}}
|
||||
};
|
||||
|
@ -233,6 +237,16 @@ static relopt_int intRelOpts[] = {
|
|||
},
|
||||
0, 1, 32
|
||||
},
|
||||
{{ "compress_level", "Level of page compression.", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, 0, -31, 31},
|
||||
{{ "compresstype", "compress type (none, pglz or zstd).", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE}, 0, 0, 2},
|
||||
{{ "compress_chunk_size", "Size of chunk to store compressed page.", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE},
|
||||
BLCKSZ / 2,
|
||||
BLCKSZ / 16,
|
||||
BLCKSZ / 2},
|
||||
{{ "compress_prealloc_chunks", "Number of prealloced chunks for each block.", RELOPT_KIND_HEAP | RELOPT_KIND_BTREE},
|
||||
0,
|
||||
0,
|
||||
7},
|
||||
/* list terminator */
|
||||
{{NULL}}
|
||||
};
|
||||
|
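For reference, the integer reloptions registered above accept: compresstype 0..2 (none, pglz, zstd; default 0), compress_level -31..31 (default 0), compress_chunk_size from BLCKSZ/16 up to BLCKSZ/2 (default BLCKSZ/2), and compress_prealloc_chunks 0..7 (default 0). A minimal range check over those bounds, assuming the usual 8 kB BLCKSZ, could look like this (illustrative only; the real limits are enforced when the options are parsed):

    #include <stdbool.h>
    #include <stdio.h>

    #define BLCKSZ 8192   /* assumed default page size */

    static bool in_range(int v, int lo, int hi) { return v >= lo && v <= hi; }

    /* Mirror of the bounds declared in the intRelOpts entries above. */
    static bool compress_reloptions_valid(int type, int level, int chunk, int prealloc)
    {
        return in_range(type, 0, 2) &&                     /* none, pglz, zstd           */
               in_range(level, -31, 31) &&                 /* per the compress_level entry */
               in_range(chunk, BLCKSZ / 16, BLCKSZ / 2) &&
               in_range(prealloc, 0, 7);
    }

    int main(void)
    {
        printf("%d\n", compress_reloptions_valid(2, 3, BLCKSZ / 2, 2));  /* 1: ok       */
        printf("%d\n", compress_reloptions_valid(3, 0, BLCKSZ / 2, 0));  /* 0: bad type */
        return 0;
    }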
@ -1948,7 +1962,20 @@ bytea *default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
|
|||
{ "cmk_id", RELOPT_TYPE_STRING, offsetof(StdRdOptions, cmk_id)},
|
||||
{ "encrypt_algo", RELOPT_TYPE_STRING, offsetof(StdRdOptions, encrypt_algo)},
|
||||
{ "enable_tde", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, enable_tde)},
|
||||
{ "hasuids", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, hasuids) }
|
||||
{ "hasuids", RELOPT_TYPE_BOOL, offsetof(StdRdOptions, hasuids) },
|
||||
{ "compresstype", RELOPT_TYPE_INT,
|
||||
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressType)},
|
||||
{ "compress_level", RELOPT_TYPE_INT,
|
||||
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressLevel)},
|
||||
{ "compress_chunk_size", RELOPT_TYPE_INT,
|
||||
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressChunkSize)},
|
||||
{"compress_prealloc_chunks", RELOPT_TYPE_INT,
|
||||
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressPreallocChunks)},
|
||||
{ "compress_byte_convert", RELOPT_TYPE_BOOL,
|
||||
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressByteConvert)},
|
||||
{ "compress_diff_convert", RELOPT_TYPE_BOOL,
|
||||
offsetof(StdRdOptions, compress) + offsetof(PageCompressOpts, compressDiffConvert)},
|
||||
|
||||
};
|
||||
|
||||
options = parseRelOptions(reloptions, validate, kind, &numoptions);
|
||||
|
@ -2594,6 +2621,25 @@ void ForbidUserToSetDefinedOptions(List *options)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* @Description: compressed parameter cannot be changed by ALTER TABLE statement if table is uncompressed table.
|
||||
* this function do the checking work.
|
||||
* @Param[IN] options: input user options
|
||||
* @See also:
|
||||
*/
|
||||
void ForbidUserToSetCompressedOptions(List *options)
|
||||
{
|
||||
static const char *unSupportOptions[] = {"compresstype", "compress_chunk_size", "compress_prealloc_chunks",
|
||||
"compress_level", "compress_byte_convert", "compress_diff_convert"};
|
||||
int firstInvalidOpt = -1;
|
||||
if (FindInvalidOption(options, unSupportOptions, lengthof(unSupportOptions), &firstInvalidOpt)) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
(errmsg("Un-support feature"), errdetail("Option \"%s\" doesn't allow ALTER on uncompressed table",
|
||||
unSupportOptions[firstInvalidOpt]))));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* @Description: forbid to change inner option
|
||||
* inner options only can be used by system itself.
|
||||
|
@ -2888,3 +2934,33 @@ bool is_cstore_option(char relkind, Datum reloptions)
|
|||
pfree_ext(std_opt);
|
||||
return result;
|
||||
}
|
||||
|
||||
void SetOneOfCompressOption(const char* defname, TableCreateSupport* tableCreateSupport)
|
||||
{
|
||||
if (pg_strcasecmp(defname, "compresstype") == 0) {
|
||||
tableCreateSupport->compressType = true;
|
||||
} else if (pg_strcasecmp(defname, "compress_chunk_size") == 0) {
|
||||
tableCreateSupport->compressChunkSize = true;
|
||||
} else if (pg_strcasecmp(defname, "compress_prealloc_chunks") == 0) {
|
||||
tableCreateSupport->compressPreAllocChunks = true;
|
||||
} else if (pg_strcasecmp(defname, "compress_level") == 0) {
|
||||
tableCreateSupport->compressLevel = true;
|
||||
} else if (pg_strcasecmp(defname, "compress_byte_convert") == 0) {
|
||||
tableCreateSupport->compressByteConvert = true;
|
||||
} else if (pg_strcasecmp(defname, "compress_diff_convert") == 0) {
|
||||
tableCreateSupport->compressDiffConvert = true;
|
||||
}
|
||||
}
|
||||
|
||||
void CheckCompressOption(TableCreateSupport *tableCreateSupport)
|
||||
{
|
||||
if (!tableCreateSupport->compressType && HasCompressOption(tableCreateSupport)) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION),
|
||||
errmsg("compress_chunk_size/compress_prealloc_chunks/compress_level/compress_byte_convert/"
|
||||
"compress_diff_convert should be used with compresstype.")));
|
||||
}
|
||||
if (!tableCreateSupport->compressByteConvert && tableCreateSupport->compressDiffConvert) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INVALID_OPTION),
|
||||
errmsg("compress_diff_convert should be used with compress_byte_convert.")));
|
||||
}
|
||||
}
|
|
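CheckCompressOption above leans on HasCompressOption, which is not part of this hunk. Judging from the TableCreateSupport fields it sets and tests, the helper presumably just reports whether any dependent option was supplied; a plausible reconstruction (a guess, written against the kernel's TableCreateSupport type as used above, not the committed definition) is:

    /* Hypothetical reconstruction -- the real HasCompressOption lives elsewhere in the tree. */
    static bool HasCompressOption(TableCreateSupport *tableCreateSupport)
    {
        return tableCreateSupport->compressChunkSize || tableCreateSupport->compressPreAllocChunks ||
               tableCreateSupport->compressLevel || tableCreateSupport->compressByteConvert ||
               tableCreateSupport->compressDiffConvert;
    }

Read that way, the pair of checks rejects any dependent option given without compresstype, and compress_diff_convert given without compress_byte_convert.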
@ -9,7 +9,7 @@ ifneq "$(MAKECMDGOALS)" "clean"
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
|
||||
hashsearch.o hashsort.o hashutil.o
|
||||
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o\
|
||||
hashsort.o hashutil.o hash_xlog.o
|
||||
|
||||
include $(top_srcdir)/src/gausskernel/common.mk
|
||||
|
|
|
@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to
|
|||
move primary bucket pages around after they are created.
|
||||
|
||||
Primary bucket pages (henceforth just "bucket pages") are allocated in
|
||||
power-of-2 groups, called "split points" in the code. Buckets 0 and 1
|
||||
are created when the index is initialized. At the first split, buckets 2
|
||||
and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated;
|
||||
when bucket 8 is needed, buckets 8-15 are allocated; etc. All the bucket
|
||||
pages of a power-of-2 group appear consecutively in the index. This
|
||||
addressing scheme allows the physical location of a bucket page to be
|
||||
computed from the bucket number relatively easily, using only a small
|
||||
amount of control information. We take the log2() of the bucket number
|
||||
to determine which split point S the bucket belongs to, and then simply
|
||||
add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
|
||||
metapage) to compute the physical address. hashm_spares[S] can be
|
||||
interpreted as the total number of overflow pages that have been allocated
|
||||
before the bucket pages of splitpoint S. hashm_spares[0] is always 0,
|
||||
so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
|
||||
block numbers 1 and 2, just after the meta page. We always have
|
||||
hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
|
||||
former. The difference between the two represents the number of overflow
|
||||
pages appearing between the bucket page groups of splitpoints N and N+1.
|
||||
|
||||
power-of-2 groups, called "split points" in the code. That means at every new
|
||||
splitpoint we double the existing number of buckets. Allocating huge chunks
|
||||
of bucket pages all at once isn't optimal and we will take ages to consume
|
||||
those. To avoid this exponential growth of index size, we did use a trick to
|
||||
break up allocation of buckets at the splitpoint into 4 equal phases. If
|
||||
(2 ^ x) is the total number of buckets to be allocated at a splitpoint (from now on
we shall call this a splitpoint group), then we allocate 1/4th (2 ^ (x - 2))
|
||||
of total buckets at each phase of splitpoint group. Next quarter of allocation
|
||||
will only happen if buckets of the previous phase have been already consumed.
|
||||
For the initial splitpoint groups (< 10) we allocate all of their buckets in a
single phase only, as the number of buckets allocated at the initial groups is
small. For groups >= 10 the allocation process is distributed
|
||||
among four equal phases. At group 10 we allocate (2 ^ 9) buckets in 4
|
||||
different phases {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, the numbers in curly braces
|
||||
indicate the number of buckets allocated within each phase of splitpoint group
|
||||
10. And, for splitpoint group 11 and 12 allocation phases will be
|
||||
{2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9} respectively. We
|
||||
can see that at each splitpoint group we double the total number of buckets
|
||||
from the previous group but in an incremental phase. The bucket pages
|
||||
allocated within one phase of a splitpoint group will appear consecutively in
|
||||
the index. This addressing scheme allows the physical location of a bucket
|
||||
page to be computed from the bucket number relatively easily, using only a
|
||||
small amount of control information. If we look at the function
|
||||
_hash_spareindex, for a given bucket number we first compute the
splitpoint group it belongs to and then the phase within that group to which the
bucket belongs. Adding them we get the global splitpoint phase number S to which the
|
||||
bucket belongs and then simply add "hashm_spares[S] + 1" (where hashm_spares[]
|
||||
is an array stored in the metapage) with given bucket number to compute its
|
||||
physical address. The hashm_spares[S] can be interpreted as the total number
|
||||
of overflow pages that have been allocated before the bucket pages of
|
||||
splitpoint phase S. The hashm_spares[0] is always 0, so that buckets 0 and 1
|
||||
always appear at block numbers 1 and 2, just after the meta page. We always
|
||||
have hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
|
||||
former. The difference between the two represents the number of overflow pages
|
||||
appearing between the bucket page groups of splitpoints phase N and N+1.
|
||||
(Note: the above describes what happens when filling an initially minimally
|
||||
sized hash index. In practice, we try to estimate the required index size
|
||||
and allocate a suitable number of splitpoints immediately, to avoid
|
||||
sized hash index. In practice, we try to estimate the required index size and
|
||||
allocate a suitable number of splitpoints phases immediately, to avoid
|
||||
expensive re-splitting during initial index build.)
|
||||
|
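A rough sketch of the addressing arithmetic just described, as a self-contained C model rather than the kernel's _hash_spareindex itself (the constants 10 and 4 mirror the text above; rounding details may differ from the real function):

    #include <stdint.h>
    #include <stdio.h>

    #define ONE_PHASE_GROUPS 10   /* groups below this are allocated in a single phase */
    #define PHASES_PER_GROUP 4

    /* Smallest g such that the bucket fits in groups 0..g, i.e. ceil(log2(bucket + 1)). */
    static uint32_t splitpoint_group(uint32_t bucket)
    {
        uint32_t g = 0;
        while (((uint64_t) 1 << g) < (uint64_t) bucket + 1)
            g++;
        return g;
    }

    /* Global splitpoint phase S used to index hashm_spares[]. */
    static uint32_t splitpoint_phase(uint32_t bucket)
    {
        uint32_t g = splitpoint_group(bucket);

        if (g < ONE_PHASE_GROUPS)
            return g;                                   /* whole group in one phase */

        /* Group g holds 2^(g-1) buckets, cut into 4 phases of 2^(g-3) each. */
        uint32_t first_in_group = (uint32_t) 1 << (g - 1);
        uint32_t phase_size     = (uint32_t) 1 << (g - 3);
        uint32_t phase          = (bucket - first_in_group) / phase_size;

        return ONE_PHASE_GROUPS + (g - ONE_PHASE_GROUPS) * PHASES_PER_GROUP + phase;
    }

    int main(void)
    {
        uint32_t probes[] = { 0, 1, 3, 512, 700, 1023 };
        for (int i = 0; i < 6; i++)
            printf("bucket %u -> phase %u\n",
                   (unsigned) probes[i], (unsigned) splitpoint_phase(probes[i]));
        return 0;
    }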
||||
When S splitpoints exist altogether, the array entries hashm_spares[0]
|
||||
through hashm_spares[S] are valid; hashm_spares[S] records the current
|
||||
total number of overflow pages. New overflow pages are created as needed
|
||||
at the end of the index, and recorded by incrementing hashm_spares[S].
|
||||
When it is time to create a new splitpoint's worth of bucket pages, we
|
||||
When it is time to create a new splitpoint phase's worth of bucket pages, we
|
||||
copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
|
||||
stored in the hashm_ovflpoint field of the meta page). This has the
|
||||
effect of reserving the correct number of bucket pages at the end of the
|
||||
|
@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during
|
|||
an index extension we crash after allocating filesystem space and before
|
||||
updating the metapage. Note that on filesystems that allow "holes" in
|
||||
files, it's entirely likely that pages before the logical EOF are not yet
|
||||
allocated: when we allocate a new splitpoint's worth of bucket pages, we
|
||||
allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
|
||||
physically zero the last such page to force the EOF up, and the first such
|
||||
page will be used immediately, but the intervening pages are not written
|
||||
until needed.
|
||||
|
@ -126,61 +142,98 @@ the initially created buckets.
|
|||
Lock Definitions
|
||||
----------------
|
||||
|
||||
We use both lmgr locks ("heavyweight" locks) and buffer context locks
|
||||
(LWLocks) to control access to a hash index. lmgr locks are needed for
|
||||
long-term locking since there is a (small) risk of deadlock, which we must
|
||||
be able to detect. Buffer context locks are used for short-term access
|
||||
control to individual pages of the index.
|
||||
Concurrency control for hash indexes is provided using buffer content
|
||||
locks, buffer pins, and cleanup locks. Here as elsewhere in PostgreSQL,
|
||||
cleanup lock means that we hold an exclusive lock on the buffer and have
|
||||
observed at some point after acquiring the lock that we hold the only pin
|
||||
on that buffer. For hash indexes, a cleanup lock on a primary bucket page
|
||||
represents the right to perform an arbitrary reorganization of the entire
|
||||
bucket. Therefore, scans retain a pin on the primary bucket page for the
|
||||
bucket they are currently scanning. Splitting a bucket requires a cleanup
|
||||
lock on both the old and new primary bucket pages. VACUUM therefore takes
|
||||
a cleanup lock on every bucket page in order to remove tuples. It can also
|
||||
remove tuples copied to a new bucket by any previous split operation, because
|
||||
the cleanup lock taken on the primary bucket page guarantees that no scans
|
||||
which started prior to the most recent split can still be in progress. After
|
||||
cleaning each page individually, it attempts to take a cleanup lock on the
|
||||
primary bucket page in order to "squeeze" the bucket down to the minimum
|
||||
possible number of pages.
|
||||
|
||||
We define the following lmgr locks for a hash index:
|
||||
To avoid deadlocks, we must be consistent about the lock order in which we
|
||||
lock the buckets for operations that require locks on two different buckets.
|
||||
We choose to always lock the lower-numbered bucket first. The metapage is
|
||||
only ever locked after all bucket locks have been taken.
|
||||
|
||||
LockPage(rel, 0) represents the right to modify the hash-code-to-bucket
|
||||
mapping. A process attempting to enlarge the hash table by splitting a
|
||||
bucket must exclusive-lock this lock before modifying the metapage data
|
||||
representing the mapping. Processes intending to access a particular
|
||||
bucket must share-lock this lock until they have acquired lock on the
|
||||
correct target bucket.
|
||||
|
||||
LockPage(rel, page), where page is the page number of a hash bucket page,
|
||||
represents the right to split or compact an individual bucket. A process
|
||||
splitting a bucket must exclusive-lock both old and new halves of the
|
||||
bucket until it is done. A process doing VACUUM must exclusive-lock the
|
||||
bucket it is currently purging tuples from. Processes doing scans or
|
||||
insertions must share-lock the bucket they are scanning or inserting into.
|
||||
(It is okay to allow concurrent scans and insertions.)
|
||||
Metapage Caching
|
||||
----------------
|
||||
|
||||
The lmgr lock IDs corresponding to overflow pages are currently unused.
|
||||
These are available for possible future refinements.
|
||||
Both scanning the index and inserting tuples require locating the bucket
|
||||
where a given tuple ought to be located. To do this, we need the bucket
|
||||
count, highmask, and lowmask from the metapage; however, it's undesirable
|
||||
for performance reasons to have to lock and pin the metapage for
|
||||
every such operation. Instead, we retain a cached copy of the metapage
|
||||
in each backend's relcache entry. This will produce the correct
|
||||
bucket mapping as long as the target bucket hasn't been split since the
|
||||
last cache refresh.
|
||||
|
||||
Note that these lock definitions are conceptually distinct from any sort
|
||||
of lock on the pages whose numbers they share. A process must also obtain
|
||||
read or write buffer lock on the metapage or bucket page before accessing
|
||||
said page.
|
||||
To guard against the possibility that such a split has occurred, the
|
||||
primary page of each bucket chain stores the number of buckets that
|
||||
existed as of the time the bucket was last split, or if never split as
|
||||
of the time it was created, in the space normally used for the
|
||||
previous block number (that is, hasho_prevblkno). This doesn't cost
|
||||
anything because the primary bucket page is always the first page in
|
||||
the chain, and the previous block number is therefore always, in
|
||||
reality, InvalidBlockNumber.
|
||||
|
||||
Processes performing hash index scans must hold share lock on the bucket
|
||||
they are scanning throughout the scan. This seems to be essential, since
|
||||
there is no reasonable way for a scan to cope with its bucket being split
|
||||
underneath it. This creates a possibility of deadlock external to the
|
||||
hash index code, since a process holding one of these locks could block
|
||||
waiting for an unrelated lock held by another process. If that process
|
||||
then does something that requires exclusive lock on the bucket, we have
|
||||
deadlock. Therefore the bucket locks must be lmgr locks so that deadlock
|
||||
can be detected and recovered from. This also forces the page-zero lock
|
||||
to be an lmgr lock, because as we'll see below it is held while attempting
|
||||
to acquire a bucket lock, and so it could also participate in a deadlock.
|
||||
After computing the ostensibly-correct bucket number based on our cached
|
||||
copy of the metapage, we lock the corresponding primary bucket page and
|
||||
check whether the bucket count stored in hasho_prevblkno is greater than
|
||||
the number of buckets stored in our cached copy of the metapage. If
|
||||
so, the bucket has certainly been split, because the count must originally
|
||||
have been less than the number of buckets that existed at that time and
|
||||
can't have increased except due to a split. If not, the bucket can't have
|
||||
been split, because a split would have created a new bucket with a higher
|
||||
bucket number than any we'd seen previously. In the latter case, we've
|
||||
locked the correct bucket and can proceed; in the former case, we must
|
||||
release the lock on this bucket, lock the metapage, update our cache,
|
||||
unlock the metapage, and retry.
|
||||
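In code form, the check-and-retry loop above is short. The sketch below is a toy model: MetaCache, BucketPage and the helper functions are stand-ins invented for the illustration, and a single shared counter plays the role of the real metapage.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-ins for the real structures: only the fields the algorithm needs. */
    typedef struct { uint32_t nbuckets; uint32_t lowmask; uint32_t highmask; } MetaCache;
    typedef struct { uint32_t prevblkno_as_nbuckets; } BucketPage;

    static MetaCache cache = { 4, 1, 3 };   /* stale: thinks there are 4 buckets    */
    static uint32_t shared_nbuckets = 8;    /* the index has since been split to 8  */

    static uint32_t bucket_for(uint32_t hash, const MetaCache *m)
    {
        uint32_t b = hash & m->highmask;
        return (b > m->nbuckets - 1) ? (b & m->lowmask) : b;
    }

    static void lock_primary_page(uint32_t bucket, BucketPage *page)
    {
        /* the primary page remembers how many buckets existed when it was last split */
        (void) bucket;
        page->prevblkno_as_nbuckets = shared_nbuckets;
    }

    static void refresh_cache(MetaCache *m)
    {
        m->nbuckets = shared_nbuckets;
        m->lowmask  = shared_nbuckets / 2 - 1;
        m->highmask = shared_nbuckets - 1;
    }

    int main(void)
    {
        uint32_t hash = 0x2a;
        for (;;) {
            uint32_t bucket = bucket_for(hash, &cache);
            BucketPage page;
            lock_primary_page(bucket, &page);
            if (page.prevblkno_as_nbuckets <= cache.nbuckets) {
                printf("bucket %u is correct\n", (unsigned) bucket);
                break;
            }
            /* bucket was split since our snapshot: unlock, refresh the cache, retry */
            printf("stale cache (saw %u buckets, page says %u); retrying\n",
                   (unsigned) cache.nbuckets, (unsigned) page.prevblkno_as_nbuckets);
            refresh_cache(&cache);
        }
        return 0;
    }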
|
||||
Processes must obtain read (share) buffer context lock on any hash index
|
||||
page while reading it, and write (exclusive) lock while modifying it.
|
||||
To prevent deadlock we enforce these coding rules: no buffer lock may be
|
||||
held long term (across index AM calls), nor may any buffer lock be held
|
||||
while waiting for an lmgr lock, nor may more than one buffer lock
|
||||
be held at a time by any one process. (The third restriction is probably
|
||||
stronger than necessary, but it makes the proof of no deadlock obvious.)
|
||||
Needing to retry occasionally might seem expensive, but the number of times
|
||||
any given bucket can be split is limited to a few dozen no matter how
|
||||
many times the hash index is accessed, because the total number of
|
||||
buckets is limited to less than 2^32. On the other hand, the number of
|
||||
times we access a bucket is unbounded and will be several orders of
|
||||
magnitude larger even in unsympathetic cases.
|
||||
|
||||
(The metapage cache is new in v10. Older hash indexes had the primary
|
||||
bucket page's hasho_prevblkno initialized to InvalidBuffer.)
|
||||
|
||||
Pseudocode Algorithms
|
||||
---------------------
|
||||
|
||||
Various flags that are used in hash index operations are described as below:
|
||||
|
||||
The bucket-being-split and bucket-being-populated flags indicate that a split
operation is in progress for a bucket. During a split operation, a
|
||||
bucket-being-split flag is set on the old bucket and bucket-being-populated
|
||||
flag is set on new bucket. These flags are cleared once the split operation
|
||||
is finished.
|
||||
|
||||
The split-cleanup flag indicates that a bucket which has been recently split
|
||||
still contains tuples that were also copied to the new bucket; it essentially
|
||||
marks the split as incomplete. Once we're certain that no scans which
|
||||
started before the new bucket was fully populated are still in progress, we
|
||||
can remove the copies from the old bucket and clear the flag. We insist that
|
||||
this flag must be clear before splitting a bucket; thus, a bucket can't be
|
||||
split again until the previous split is totally complete.
|
||||
|
||||
The moved-by-split flag on a tuple indicates that tuple is moved from old to
|
||||
new bucket. Concurrent scans will skip such tuples until the split operation
|
||||
is finished. Once the tuple is marked as moved-by-split, it will remain so
|
||||
forever but that does no harm. We have intentionally not cleared it as that
|
||||
can generate an additional I/O which is not necessary.
|
||||
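These conditions are just flag bits on the bucket's primary page (plus one bit in the index tuple). Purely as an illustration -- the names and bit values below are invented for the sketch, not the on-disk definitions -- the "no new split until the old one is finished" rule reads:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative flag bits only; the real values live in the hash AM's headers. */
    #define BKT_BEING_POPULATED     0x0001  /* new bucket: still receiving tuples from a split */
    #define BKT_BEING_SPLIT         0x0002  /* old bucket: its tuples are being copied out     */
    #define BKT_NEEDS_SPLIT_CLEANUP 0x0004  /* old bucket: copied tuples not yet removed       */
    #define TUP_MOVED_BY_SPLIT      0x0008  /* index tuple: this copy was created by a split   */

    /* A bucket may not be split again until the previous split is fully cleaned up. */
    static bool can_start_new_split(uint16_t old_bucket_flags)
    {
        return (old_bucket_flags & (BKT_BEING_SPLIT | BKT_NEEDS_SPLIT_CLEANUP)) == 0;
    }

    int main(void)
    {
        printf("%d\n", can_start_new_split(0));                        /* 1: idle bucket     */
        printf("%d\n", can_start_new_split(BKT_NEEDS_SPLIT_CLEANUP));  /* 0: cleanup pending */
        return 0;
    }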
|
||||
The operations we need to support are: readers scanning the index for
|
||||
entries of a particular hash code (which by definition are all in the same
|
||||
bucket); insertion of a new tuple into the correct bucket; enlarging the
|
||||
|
@ -195,57 +248,75 @@ track of available overflow pages.
|
|||
|
||||
The reader algorithm is:
|
||||
|
||||
share-lock page 0 (to prevent active split)
|
||||
read/sharelock meta page
|
||||
compute bucket number for target hash key
|
||||
release meta page
|
||||
share-lock bucket page (to prevent split/compact of this bucket)
|
||||
release page 0 share-lock
|
||||
lock the primary bucket page of the target bucket
|
||||
if the target bucket is still being populated by a split:
|
||||
release the buffer content lock on current bucket page
|
||||
pin and acquire the buffer content lock on old bucket in shared mode
|
||||
release the buffer content lock on old bucket, but not pin
|
||||
retake the buffer content lock on new bucket
|
||||
arrange to scan the old bucket normally and the new bucket for
|
||||
tuples which are not moved-by-split
|
||||
-- then, per read request:
|
||||
read/sharelock current page of bucket
|
||||
step to next page if necessary (no chaining of locks)
|
||||
reacquire content lock on current page
|
||||
step to next page if necessary (no chaining of content locks, but keep
|
||||
the pin on the primary bucket throughout the scan; we also maintain
|
||||
a pin on the page currently being scanned)
|
||||
get tuple
|
||||
release current page
|
||||
release content lock
|
||||
-- at scan shutdown:
|
||||
release bucket share-lock
|
||||
release all pins still held
|
||||
|
||||
By holding the page-zero lock until lock on the target bucket is obtained,
|
||||
the reader ensures that the target bucket calculation is valid (otherwise
|
||||
the bucket might be split before the reader arrives at it, and the target
|
||||
entries might go into the new bucket). Holding the bucket sharelock for
|
||||
the remainder of the scan prevents the reader's current-tuple pointer from
|
||||
being invalidated by splits or compactions. Notice that the reader's lock
|
||||
does not prevent other buckets from being split or compacted.
|
||||
Holding the buffer pin on the primary bucket page for the whole scan prevents
|
||||
the reader's current-tuple pointer from being invalidated by splits or
|
||||
compactions. (Of course, other buckets can still be split or compacted.)
|
||||
|
||||
To keep concurrency reasonably good, we require readers to cope with
|
||||
concurrent insertions, which means that they have to be able to re-find
|
||||
their current scan position after re-acquiring the page sharelock. Since
|
||||
deletion is not possible while a reader holds the bucket sharelock, and
|
||||
we assume that heap tuple TIDs are unique, this can be implemented by
|
||||
their current scan position after re-acquiring the buffer content lock on
|
||||
page. Since deletion is not possible while a reader holds the pin on bucket,
|
||||
and we assume that heap tuple TIDs are unique, this can be implemented by
|
||||
searching for the same heap tuple TID previously returned. Insertion does
|
||||
not move index entries across pages, so the previously-returned index entry
|
||||
should always be on the same page, at the same or higher offset number,
|
||||
as it was before.
|
||||
|
||||
To allow for scans during a bucket split, if at the start of the scan, the
|
||||
bucket is marked as bucket-being-populated, it scans all the tuples in that
|
||||
bucket except for those that are marked as moved-by-split. Once it finishes
|
||||
the scan of all the tuples in the current bucket, it scans the old bucket from
|
||||
which this bucket is formed by split.
|
||||
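Spelled out, the scan-time handling of an in-progress split is a two-pass loop: scan the new bucket while skipping moved-by-split copies, then scan the old bucket normally. A toy version (the tuple layout and match() are inventions of the sketch):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint32_t hash; bool moved_by_split; } ToyTuple;

    static bool match(const ToyTuple *t, uint32_t wanted) { return t->hash == wanted; }

    /* Scan the target bucket; if it is still being populated by a split, skip the
     * copies flagged moved-by-split and read the originals from the old bucket. */
    static int scan_bucket(const ToyTuple *new_bkt, int n_new, bool being_populated,
                           const ToyTuple *old_bkt, int n_old, uint32_t wanted)
    {
        int hits = 0;

        for (int i = 0; i < n_new; i++) {
            if (being_populated && new_bkt[i].moved_by_split)
                continue;                       /* original is still visible in the old bucket */
            if (match(&new_bkt[i], wanted))
                hits++;
        }
        if (being_populated) {
            for (int i = 0; i < n_old; i++)     /* old bucket is scanned normally */
                if (match(&old_bkt[i], wanted))
                    hits++;
        }
        return hits;
    }

    int main(void)
    {
        ToyTuple new_bkt[] = { { 7, true }, { 7, false } };   /* one moved copy, one fresh insert */
        ToyTuple old_bkt[] = { { 7, false } };                /* the original of the moved tuple  */
        printf("%d\n", scan_bucket(new_bkt, 2, true, old_bkt, 1, 7));  /* 2: no double counting */
        return 0;
    }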
|
||||
The insertion algorithm is rather similar:
|
||||
|
||||
share-lock page 0 (to prevent active split)
|
||||
read/sharelock meta page
|
||||
compute bucket number for target hash key
|
||||
release meta page
|
||||
share-lock bucket page (to prevent split/compact of this bucket)
|
||||
release page 0 share-lock
|
||||
-- (so far same as reader)
|
||||
read/exclusive-lock current page of bucket
|
||||
if full, release, read/exclusive-lock next page; repeat as needed
|
||||
lock the primary bucket page of the target bucket
|
||||
-- (so far same as reader, except for acquisition of buffer content lock in
|
||||
exclusive mode on primary bucket page)
|
||||
if the bucket-being-split flag is set for a bucket and pin count on it is
|
||||
one, then finish the split
|
||||
release the buffer content lock on current bucket
|
||||
get the "new" bucket which was being populated by the split
|
||||
scan the new bucket and form the hash table of TIDs
|
||||
conditionally get the cleanup lock on old and new buckets
|
||||
if we get the lock on both the buckets
|
||||
finish the split using algorithm mentioned below for split
|
||||
release the pin on old bucket and restart the insert from beginning.
|
||||
if current page is full, first check if this page contains any dead tuples.
|
||||
if yes, remove dead tuples from the current page and again check for the
|
||||
availability of the space. If enough space found, insert the tuple else
|
||||
release lock but not pin, read/exclusive-lock
|
||||
next page; repeat as needed
|
||||
>> see below if no space in any page of bucket
|
||||
take buffer content lock in exclusive mode on metapage
|
||||
insert tuple at appropriate place in page
|
||||
write/release current page
|
||||
release bucket share-lock
|
||||
read/exclusive-lock meta page
|
||||
mark current page dirty
|
||||
increment tuple count, decide if split needed
|
||||
write/release meta page
|
||||
done if no split needed, else enter Split algorithm below
|
||||
mark meta page dirty
|
||||
write WAL for insertion of tuple
|
||||
release the buffer content lock on metapage
|
||||
release buffer content lock on current page
|
||||
if current page is not a bucket page, release the pin on bucket page
|
||||
if split is needed, enter Split algorithm below
|
||||
release the pin on metapage
|
||||
|
||||
To speed searches, the index entries within any individual index page are
|
||||
kept sorted by hash code; the insertion code must take care to insert new
|
||||
|
@ -254,11 +325,13 @@ bucket that is being actively scanned, because readers can cope with this
|
|||
as explained above. We only need the short-term buffer locks to ensure
|
||||
that readers do not see a partially-updated page.
|
||||
|
||||
It is clearly impossible for readers and inserters to deadlock, and in
|
||||
fact this algorithm allows them a very high degree of concurrency.
|
||||
(The exclusive metapage lock taken to update the tuple count is stronger
|
||||
than necessary, since readers do not care about the tuple count, but the
|
||||
lock is held for such a short time that this is probably not an issue.)
|
||||
To avoid deadlock between readers and inserters, whenever there is a need
|
||||
to lock multiple buckets, we always take in the order suggested in Lock
|
||||
Definitions above. This algorithm allows them a very high degree of
|
||||
concurrency. (The exclusive metapage lock taken to update the tuple count
|
||||
is stronger than necessary, since readers do not care about the tuple count,
|
||||
but the lock is held for such a short time that this is probably not an
|
||||
issue.)
|
||||
|
||||
When an inserter cannot find space in any existing page of a bucket, it
|
||||
must obtain an overflow page and add that page to the bucket's chain.
|
||||
|
@ -269,82 +342,95 @@ index is overfull (has a higher-than-wanted ratio of tuples to buckets).
|
|||
The algorithm attempts, but does not necessarily succeed, to split one
|
||||
existing bucket in two, thereby lowering the fill ratio:
|
||||
|
||||
exclusive-lock page 0 (assert the right to begin a split)
|
||||
read/exclusive-lock meta page
|
||||
check split still needed
|
||||
if split not needed anymore, drop locks and exit
|
||||
decide which bucket to split
|
||||
Attempt to X-lock old bucket number (definitely could fail)
|
||||
Attempt to X-lock new bucket number (shouldn't fail, but...)
|
||||
if above fail, drop locks and exit
|
||||
update meta page to reflect new number of buckets
|
||||
write/release meta page
|
||||
release X-lock on page 0
|
||||
-- now, accesses to all other buckets can proceed.
|
||||
Perform actual split of bucket, moving tuples as needed
|
||||
>> see below about acquiring needed extra space
|
||||
Release X-locks of old and new buckets
|
||||
pin meta page and take buffer content lock in exclusive mode
|
||||
check split still needed
|
||||
if split not needed anymore, drop buffer content lock and pin and exit
|
||||
decide which bucket to split
|
||||
try to take a cleanup lock on that bucket; if fail, give up
|
||||
if that bucket is still being split or has split-cleanup work:
|
||||
try to finish the split and the cleanup work
|
||||
if that succeeds, start over; if it fails, give up
|
||||
mark the old and new buckets indicating split is in progress
|
||||
mark both old and new buckets as dirty
|
||||
write WAL for allocation of new page for split
|
||||
copy the tuples that belongs to new bucket from old bucket, marking
|
||||
them as moved-by-split
|
||||
write WAL record for moving tuples to new page once the new page is full
|
||||
or all the pages of old bucket are finished
|
||||
release lock but not pin for primary bucket page of old bucket,
|
||||
read/shared-lock next page; repeat as needed
|
||||
clear the bucket-being-split and bucket-being-populated flags
|
||||
mark the old bucket indicating split-cleanup
|
||||
write WAL for changing the flags on both old and new buckets
|
||||
|
||||
Note the page zero and metapage locks are not held while the actual tuple
|
||||
rearrangement is performed, so accesses to other buckets can proceed in
|
||||
parallel; in fact, it's possible for multiple bucket splits to proceed
|
||||
in parallel.
|
||||
|
||||
Split's attempt to X-lock the old bucket number could fail if another
|
||||
process holds S-lock on it. We do not want to wait if that happens, first
|
||||
because we don't want to wait while holding the metapage exclusive-lock,
|
||||
and second because it could very easily result in deadlock. (The other
|
||||
process might be out of the hash AM altogether, and could do something
|
||||
that blocks on another lock this process holds; so even if the hash
|
||||
algorithm itself is deadlock-free, a user-induced deadlock could occur.)
|
||||
So, this is a conditional LockAcquire operation, and if it fails we just
|
||||
abandon the attempt to split. This is all right since the index is
|
||||
overfull but perfectly functional. Every subsequent inserter will try to
|
||||
split, and eventually one will succeed. If multiple inserters failed to
|
||||
split, the index might still be overfull, but eventually, the index will
|
||||
The split operation's attempt to acquire cleanup-lock on the old bucket number
|
||||
could fail if another process holds any lock or pin on it. We do not want to
|
||||
wait if that happens, because we don't want to wait while holding the metapage
|
||||
exclusive-lock. So, this is a conditional LWLockAcquire operation, and if
|
||||
it fails we just abandon the attempt to split. This is all right since the
|
||||
index is overfull but perfectly functional. Every subsequent inserter will
|
||||
try to split, and eventually one will succeed. If multiple inserters failed
|
||||
to split, the index might still be overfull, but eventually, the index will
|
||||
not be overfull and split attempts will stop. (We could make a successful
|
||||
splitter loop to see if the index is still overfull, but it seems better to
|
||||
distribute the split overhead across successive insertions.)
|
||||
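The key property of the "conditional" acquisition above is that it never waits while the metapage lock is held; in PostgreSQL-style buffer-manager terms the primitive is a try-lock along the lines of ConditionalLockBufferForCleanup(). A toy model of the give-up-instead-of-waiting behaviour:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy stand-in: a cleanup lock is granted only when nobody else holds a pin. */
    typedef struct { int pin_count; bool exclusive; } ToyBuffer;

    static bool conditional_cleanup_lock(ToyBuffer *buf)
    {
        if (buf->pin_count > 1 || buf->exclusive)
            return false;          /* somebody else is using the bucket: do not wait */
        buf->exclusive = true;
        return true;
    }

    static bool try_split(ToyBuffer *old_bucket)
    {
        if (!conditional_cleanup_lock(old_bucket)) {
            printf("split skipped\n");   /* overfull but functional; a later inserter retries */
            return false;
        }
        printf("split proceeds\n");
        old_bucket->exclusive = false;
        return true;
    }

    int main(void)
    {
        ToyBuffer busy = { 2, false };   /* a scan still holds a pin */
        ToyBuffer idle = { 1, false };   /* only our own pin          */
        try_split(&busy);
        try_split(&idle);
        return 0;
    }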
|
||||
A problem is that if a split fails partway through (eg due to insufficient
|
||||
disk space) the index is left corrupt. The probability of that could be
|
||||
made quite low if we grab a free page or two before we update the meta
|
||||
page, but the only real solution is to treat a split as a WAL-loggable,
|
||||
must-complete action. I'm not planning to teach hash about WAL in this
|
||||
go-round.
|
||||
If a split fails partway through (e.g. due to insufficient disk space or an
|
||||
interrupt), the index will not be corrupted. Instead, we'll retry the split
|
||||
every time a tuple is inserted into the old bucket prior to inserting the new
|
||||
tuple; eventually, we should succeed. The fact that a split is left
|
||||
unfinished doesn't prevent subsequent buckets from being split, but we won't
|
||||
try to split the bucket again until the prior split is finished. In other
|
||||
words, a bucket can be in the middle of being split for some time, but it can't
|
||||
be in the middle of two splits at the same time.
|
||||
|
||||
The fourth operation is garbage collection (bulk deletion):
|
||||
|
||||
next bucket := 0
|
||||
read/sharelock meta page
|
||||
pin metapage and take buffer content lock in exclusive mode
|
||||
fetch current max bucket number
|
||||
release meta page
|
||||
release meta page buffer content lock and pin
|
||||
while next bucket <= max bucket do
|
||||
Acquire X lock on target bucket
|
||||
Scan and remove tuples, compact free space as needed
|
||||
Release X lock
|
||||
acquire cleanup lock on primary bucket page
|
||||
loop:
|
||||
scan and remove tuples
|
||||
mark the target page dirty
|
||||
write WAL for deleting tuples from target page
|
||||
if this is the last bucket page, break out of loop
|
||||
pin and x-lock next page
|
||||
release prior lock and pin (except keep pin on primary bucket page)
|
||||
if the page we have locked is not the primary bucket page:
|
||||
release lock and take exclusive lock on primary bucket page
|
||||
if there are no other pins on the primary bucket page:
|
||||
squeeze the bucket to remove free space
|
||||
release the pin on primary bucket page
|
||||
next bucket ++
|
||||
end loop
|
||||
exclusive-lock meta page
|
||||
pin metapage and take buffer content lock in exclusive mode
|
||||
check if number of buckets changed
|
||||
if so, release lock and return to for-each-bucket loop
|
||||
if so, release content lock and pin and return to for-each-bucket loop
|
||||
else update metapage tuple count
|
||||
write/release meta page
|
||||
mark meta page dirty and write WAL for update of metapage
|
||||
release buffer content lock and pin
|
||||
|
||||
Note that this is designed to allow concurrent splits. If a split occurs,
|
||||
tuples relocated into the new bucket will be visited twice by the scan,
|
||||
but that does no harm. (We must however be careful about the statistics
|
||||
Note that this is designed to allow concurrent splits and scans. If a split
|
||||
occurs, tuples relocated into the new bucket will be visited twice by the
|
||||
scan, but that does no harm. As we release the lock on bucket page during
|
||||
cleanup scan of a bucket, it will allow concurrent scan to start on a bucket
|
||||
and ensures that the scan will always stay behind cleanup. It is essential to keep scans
|
||||
behind cleanup, else vacuum could decrease the TIDs that are required to
|
||||
complete the scan. Now, as the scan that returns multiple tuples from the
|
||||
same bucket page always expects the next valid TID to be greater than or equal to
|
||||
the current TID, it might miss the tuples. This holds true for backward scans
|
||||
as well (backward scans first traverse each bucket starting from first bucket
|
||||
to last overflow page in the chain). We must be careful about the statistics
|
||||
reported by the VACUUM operation. What we can do is count the number of
|
||||
tuples scanned, and believe this in preference to the stored tuple count
|
||||
if the stored tuple count and number of buckets did *not* change at any
|
||||
time during the scan. This provides a way of correcting the stored tuple
|
||||
count if it gets out of sync for some reason. But if a split or insertion
|
||||
does occur concurrently, the scan count is untrustworthy; instead,
|
||||
subtract the number of tuples deleted from the stored tuple count and
|
||||
use that.)
|
||||
|
||||
The exclusive lock request could deadlock in some strange scenarios, but
|
||||
we can just error out without any great harm being done.
|
||||
tuples scanned, and believe this in preference to the stored tuple count if
|
||||
the stored tuple count and number of buckets did *not* change at any time
|
||||
during the scan. This provides a way of correcting the stored tuple count if
|
||||
it gets out of sync for some reason. But if a split or insertion does occur
|
||||
concurrently, the scan count is untrustworthy; instead, subtract the number of
|
||||
tuples deleted from the stored tuple count and use that.
|
||||
|
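The statistics rule reduces to a small piece of arithmetic: trust the count taken during the cleanup scan only if neither the stored tuple count nor the bucket count moved while scanning; otherwise fall back to stored-count-minus-deleted. As a sketch, with every input passed in explicitly:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t vacuum_new_tuple_count(uint64_t stored_before, uint64_t stored_after,
                                           uint32_t buckets_before, uint32_t buckets_after,
                                           uint64_t counted_remaining, uint64_t deleted)
    {
        bool quiescent = (stored_before == stored_after) && (buckets_before == buckets_after);

        if (quiescent)
            return counted_remaining;         /* scan count is trustworthy: use it        */
        return stored_after - deleted;        /* concurrent activity: be conservative     */
    }

    int main(void)
    {
        printf("%llu\n", (unsigned long long) vacuum_new_tuple_count(100, 100, 4, 4, 97, 3));
        printf("%llu\n", (unsigned long long) vacuum_new_tuple_count(100, 120, 4, 8, 97, 3));
        return 0;
    }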
||||
|
||||
Free Space Management
|
||||
|
@ -360,25 +446,23 @@ overflow page to the free pool.
|
|||
|
||||
Obtaining an overflow page:
|
||||
|
||||
read/exclusive-lock meta page
|
||||
take metapage content lock in exclusive mode
|
||||
determine next bitmap page number; if none, exit loop
|
||||
release meta page lock
|
||||
read/exclusive-lock bitmap page
|
||||
release meta page content lock
|
||||
pin bitmap page and take content lock in exclusive mode
|
||||
search for a free page (zero bit in bitmap)
|
||||
if found:
|
||||
set bit in bitmap
|
||||
write/release bitmap page
|
||||
read/exclusive-lock meta page
|
||||
mark bitmap page dirty
|
||||
take metapage buffer content lock in exclusive mode
|
||||
if first-free-bit value did not change,
|
||||
update it and write meta page
|
||||
release meta page
|
||||
return page number
|
||||
update it and mark meta page dirty
|
||||
else (not found):
|
||||
release bitmap page
|
||||
release bitmap page buffer content lock
|
||||
loop back to try next bitmap page, if any
|
||||
-- here when we have checked all bitmap pages; we hold meta excl. lock
|
||||
extend index to add another overflow page; update meta information
|
||||
write/release meta page
|
||||
mark meta page dirty
|
||||
return page number
|
||||
|
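The bitmap search in the pseudocode above is ordinary bit twiddling: find a zero bit, set it, and hand back its index as the overflow page's bit number. A standalone sketch over a single 32-bit bitmap word:

    #include <stdint.h>
    #include <stdio.h>

    /* Find the first zero bit in a bitmap word and set it; returns -1 if the word is full. */
    static int claim_free_bit(uint32_t *word)
    {
        for (int bit = 0; bit < 32; bit++) {
            if ((*word & ((uint32_t) 1 << bit)) == 0) {
                *word |= (uint32_t) 1 << bit;   /* mark the overflow page as in use */
                return bit;
            }
        }
        return -1;                              /* no free page tracked by this word */
    }

    int main(void)
    {
        uint32_t word = 0x0000000Fu;            /* bits 0-3 already in use */
        printf("claimed bit %d\n", claim_free_bit(&word));   /* 4 */
        printf("word is now 0x%08X\n", word);
        return 0;
    }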
||||
It is slightly annoying to release and reacquire the metapage lock
|
||||
|
@ -398,12 +482,17 @@ like this:
|
|||
|
||||
-- having determined that no space is free in the target bucket:
|
||||
remember last page of bucket, drop write lock on it
|
||||
call free-page-acquire routine
|
||||
re-write-lock last page of bucket
|
||||
if it is not last anymore, step to the last page
|
||||
update (former) last page to point to new page
|
||||
execute free-page-acquire (obtaining an overflow page) mechanism
|
||||
described above
|
||||
update (former) last page to point to the new page and mark buffer dirty
|
||||
write-lock and initialize new page, with back link to former last page
|
||||
write and release former last page
|
||||
write WAL for addition of overflow page
|
||||
release the locks on meta page and bitmap page acquired in
|
||||
free-page-acquire algorithm
|
||||
release the lock on former last page
|
||||
release the lock on new overflow page
|
||||
insert tuple into new page
|
||||
-- etc.
|
||||
|
||||
|
@ -418,27 +507,27 @@ free page; there can be no other process holding lock on it.
|
|||
|
||||
Bucket splitting uses a similar algorithm if it has to extend the new
|
||||
bucket, but it need not worry about concurrent extension since it has
|
||||
exclusive lock on the new bucket.
|
||||
buffer content lock in exclusive mode on the new bucket.
|
||||
|
||||
Freeing an overflow page is done by garbage collection and by bucket
|
||||
splitting (the old bucket may contain no-longer-needed overflow pages).
|
||||
In both cases, the process holds exclusive lock on the containing bucket,
|
||||
so need not worry about other accessors of pages in the bucket. The
|
||||
algorithm is:
|
||||
Freeing an overflow page requires the process to hold buffer content lock in
|
||||
exclusive mode on the containing bucket, so need not worry about other
|
||||
accessors of pages in the bucket. The algorithm is:
|
||||
|
||||
delink overflow page from bucket chain
|
||||
(this requires read/update/write/release of fore and aft siblings)
|
||||
read/share-lock meta page
|
||||
pin meta page and take buffer content lock in shared mode
|
||||
determine which bitmap page contains the free space bit for page
|
||||
release meta page
|
||||
read/exclusive-lock bitmap page
|
||||
release meta page buffer content lock
|
||||
pin bitmap page and take buffer content lock in exclusive mode
|
||||
retake meta page buffer content lock in exclusive mode
|
||||
move (insert) tuples that belong to the overflow page being freed
|
||||
update bitmap bit
|
||||
write/release bitmap page
|
||||
if page number is less than what we saw as first-free-bit in meta:
|
||||
read/exclusive-lock meta page
|
||||
mark bitmap page dirty
|
||||
if page number is still less than first-free-bit,
|
||||
update first-free-bit field and write meta page
|
||||
release meta page
|
||||
update first-free-bit field and mark meta page dirty
|
||||
write WAL for delinking overflow page operation
|
||||
release buffer content lock and pin
|
||||
release meta page buffer content lock and pin
|
||||
|
||||
We have to do it this way because we must clear the bitmap bit before
|
||||
changing the first-free-bit field (hashm_firstfree). It is possible that
|
||||
|
@ -448,21 +537,96 @@ page acquirer will scan more bitmap bits than he needs to. What must be
|
|||
avoided is having first-free-bit greater than the actual first free bit,
|
||||
because then that free page would never be found by searchers.
|
||||
|
||||
All the freespace operations should be called while holding no buffer
|
||||
locks. Since they need no lmgr locks, deadlock is not possible.
|
||||
The reason for moving tuples from the overflow page while delinking the latter is
|
||||
to make that an atomic operation. Not doing so could lead to spurious reads
|
||||
on standby. Basically, the user might see the same tuple twice.
|
||||
|
||||
|
||||
WAL Considerations
|
||||
------------------
|
||||
|
||||
The hash index operations like create index, insert, delete, bucket split,
|
||||
allocate overflow page, and squeeze in themselves don't guarantee hash index
|
||||
consistency after a crash. To provide robustness, we write WAL for each of
|
||||
these operations.
|
||||
|
||||
CREATE INDEX writes multiple WAL records. First, we write a record to cover
|
||||
the initialization of the metapage, followed by one for each new bucket
|
||||
created, followed by one for the initial bitmap page. It's not important for
|
||||
index creation to appear atomic, because the index isn't yet visible to any
|
||||
other transaction, and the creating transaction will roll back in the event of
|
||||
a crash. It would be difficult to cover the whole operation with a single
|
||||
write-ahead log record anyway, because we can log only a fixed number of
|
||||
pages, as given by XLR_MAX_BLOCK_ID (32), with current XLog machinery.
|
||||
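As a concrete illustration, here is a minimal sketch of how the first of those records (the metapage initialization) would typically be emitted on the logging side; it mirrors the hash_xlog_init_meta_page replay routine added by this commit. The xl_hash_init_meta_page fields and SizeOfHashInitMetaPage used here are assumptions based on the usual PostgreSQL record layout, not a quote of the actual _hash_init code:

    /* hedged sketch: log the metapage initialization during CREATE INDEX */
    if (RelationNeedsWAL(rel)) {
        xl_hash_init_meta_page xlrec;   /* assumed fields: num_tuples, procid, ffactor */
        XLogRecPtr recptr;

        xlrec.num_tuples = num_tuples;
        xlrec.procid = metap->hashm_procid;
        xlrec.ffactor = metap->hashm_ffactor;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
        /* REGBUF_WILL_INIT: replay re-initializes the page, so no full-page image is needed */
        XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }
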
|
||||
Ordinary item insertions (that don't force a page split or need a new overflow
|
||||
page) are single WAL entries. They touch a single bucket page and the
|
||||
metapage. The metapage is updated during replay as it is updated during
|
||||
original operation.
|
||||
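A minimal sketch of that single-record pattern follows; it reuses the XLogBeginInsert/XLogRegisterBuffer idiom visible in the XLOG sections of this commit. The xl_hash_insert record (a single offset number) and IndexTupleDSize are assumptions about the surrounding headers rather than code taken from _hash_doinsert itself:

    /* hedged sketch: one WAL record covering the bucket page and the metapage */
    if (RelationNeedsWAL(rel)) {
        xl_hash_insert xlrec;
        XLogRecPtr recptr;

        xlrec.offnum = itup_off;            /* offset where the tuple was placed */

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);      /* bucket/overflow page */
        XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));
        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);  /* only ntuples is bumped */

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

        PageSetLSN(BufferGetPage(buf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

During replay, hash_xlog_insert (added below) redoes the two blocks independently, which is why the metapage does not need to be locked together with the bucket page at that point.
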
|
||||
If an insertion causes the addition of an overflow page, there will be one
|
||||
WAL entry for the new overflow page and a second entry for the insert itself.
|
||||
|
||||
If an insertion causes a bucket split, there will be one WAL entry for insert
|
||||
itself, followed by a WAL entry for allocating a new bucket, followed by a WAL
|
||||
entry for each overflow bucket page in the new bucket to which the tuples are
|
||||
moved from old bucket, followed by a WAL entry to indicate that split is
|
||||
complete for both old and new buckets. A split operation which requires
|
||||
overflow pages to complete the operation will need to write a WAL record for
|
||||
each new allocation of an overflow page.
|
||||
|
||||
As splitting involves multiple atomic actions, it's possible that the system
|
||||
crashes between moving tuples from bucket pages of the old bucket to new
|
||||
bucket. In such a case, after recovery, the old and new buckets will be
|
||||
marked with bucket-being-split and bucket-being-populated flags respectively
|
||||
which indicates that split is in progress for those buckets. The reader
|
||||
algorithm works correctly, as it will scan both the old and new buckets when
|
||||
the split is in progress as explained in the reader algorithm section above.
|
||||
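For clarity, a small hedged helper showing how that in-progress state is typically detected from a bucket's primary page; H_BUCKET_BEING_SPLIT is used elsewhere in this commit, while H_BUCKET_BEING_POPULATED is assumed to be the matching macro for the new bucket:

    /* hedged sketch: does this primary bucket page belong to an unfinished split? */
    static bool
    hash_bucket_split_in_progress(Page page)
    {
        HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* the old bucket carries the being-split flag, the new bucket the being-populated flag */
        return H_BUCKET_BEING_SPLIT(opaque) || H_BUCKET_BEING_POPULATED(opaque);
    }
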
|
||||
We finish the split at next insert or split operation on the old bucket as
|
||||
explained in insert and split algorithm above. It could be done during
|
||||
searches, too, but it seems best not to put any extra updates in what would
|
||||
otherwise be a read-only operation (updating is not possible in hot standby
|
||||
mode anyway). It would seem natural to complete the split in VACUUM, but since
|
||||
splitting a bucket might require allocating a new page, it might fail if you
|
||||
run out of disk space. That would be bad during VACUUM - the reason for
|
||||
running VACUUM in the first place might be that you ran out of disk space,
|
||||
and now VACUUM won't finish because you're out of disk space. In contrast,
|
||||
an insertion can require enlarging the physical file anyway.
|
||||
|
||||
Deletion of tuples from a bucket is performed for two reasons: to remove dead
|
||||
tuples, and to remove tuples that were moved by a bucket split. A WAL entry
|
||||
is made for each bucket page from which tuples are removed, and then another
|
||||
WAL entry is made when we clear the needs-split-cleanup flag. If dead tuples
|
||||
are removed, a separate WAL entry is made to update the metapage.
|
||||
|
||||
As deletion involves multiple atomic operations, it is quite possible that
|
||||
system crashes after (a) removing tuples from some of the bucket pages, (b)
|
||||
before clearing the garbage flag, or (c) before updating the metapage. If the
|
||||
system crashes before completing (b), it will again try to clean the bucket
|
||||
during next vacuum or insert after recovery which can have some performance
|
||||
impact, but it will work fine. If the system crashes before completing (c),
|
||||
after recovery there could be some additional splits until the next vacuum
|
||||
updates the metapage, but the other operations like insert, delete and scan
|
||||
will work correctly. We can fix this problem by actually updating the
|
||||
metapage based on delete operation during replay, but it's not clear whether
|
||||
it's worth the complication.
|
||||
|
||||
A squeeze operation moves tuples from one of the buckets later in the chain to
|
||||
one of the buckets earlier in the chain and writes a WAL record when either the
|
||||
bucket to which it is writing tuples is filled or the bucket from which it
|
||||
is removing the tuples becomes empty.
|
||||
|
||||
As a squeeze operation involves writing multiple atomic operations, it is
|
||||
quite possible that the system crashes before completing the operation on
|
||||
entire bucket. After recovery, the operations will work correctly, but
|
||||
the index will remain bloated and this can impact performance of read and
|
||||
insert operations until the next vacuum squeezes the bucket completely.
|
||||
|
||||
|
||||
Other Notes
|
||||
-----------
|
||||
|
||||
All the shenanigans with locking prevent a split occurring while *another*
|
||||
process is stopped in a given bucket. They do not ensure that one of
|
||||
our *own* backend's scans is not stopped in the bucket, because lmgr
|
||||
doesn't consider a process's own locks to conflict. So the Split
|
||||
algorithm must check for that case separately before deciding it can go
|
||||
ahead with the split. VACUUM does not have this problem since nothing
|
||||
else can be happening within the vacuuming backend.
|
||||
|
||||
Should we instead try to fix the state of any conflicting local scan?
|
||||
Seems mighty ugly --- got to move the held bucket S-lock as well as lots
|
||||
of other messiness. For now, just punt and don't split.
|
||||
Cleanup locks prevent a split from occurring while *another* process is stopped
|
||||
in a given bucket. They also ensure that one of our *own* backend's scans is not
|
||||
stopped in the bucket.
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
* hash.cpp
|
||||
* Implementation of Margo Seltzer's Hashing package for postgres.
|
||||
*
|
||||
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
|
@ -20,6 +20,8 @@
|
|||
#include "knl/knl_variable.h"
|
||||
|
||||
#include "access/hash.h"
|
||||
#include "access/hash_xlog.h"
|
||||
#include "access/xloginsert.h"
|
||||
#include "access/tableam.h"
|
||||
#include "access/relscan.h"
|
||||
#include "catalog/index.h"
|
||||
|
@ -34,6 +36,7 @@
|
|||
typedef struct {
|
||||
HSpool *spool; /* NULL if not using spooling */
|
||||
double indtuples; /* # tuples accepted into index */
|
||||
Relation heapRel; /* heap relation descriptor */
|
||||
} HashBuildState;
|
||||
|
||||
static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, const bool *isnull, bool tupleIsAlive,
|
||||
|
@ -52,6 +55,7 @@ Datum hashbuild(PG_FUNCTION_ARGS)
|
|||
double reltuples;
|
||||
double allvisfrac;
|
||||
uint32 num_buckets;
|
||||
long sort_threshold;
|
||||
HashBuildState buildstate;
|
||||
|
||||
/*
|
||||
|
@ -66,7 +70,7 @@ Datum hashbuild(PG_FUNCTION_ARGS)
|
|||
estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac, NULL);
|
||||
|
||||
/* Initialize the hash index metadata page and initial buckets */
|
||||
num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);
|
||||
num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
|
||||
/*
|
||||
* If we just insert the tuples into the index in scan order, then
|
||||
* (assuming their hash codes are pretty random) there will be no locality
|
||||
|
@ -74,25 +78,38 @@ Datum hashbuild(PG_FUNCTION_ARGS)
|
|||
* then we'll thrash horribly. To prevent that scenario, we can sort the
|
||||
* tuples by (expected) bucket number. However, such a sort is useless
|
||||
* overhead when the index does fit in RAM. We choose to sort if the
|
||||
* initial index size exceeds NBuffers.
|
||||
* initial index size exceeds maintenance_work_mem, or the number of
|
||||
* buffers usable for the index, whichever is less. (Limiting by the
|
||||
* number of buffers should reduce thrashing between PG buffers and kernel
|
||||
* buffers, which seems useful even if no physical I/O results. Limiting
|
||||
* by maintenance_work_mem is useful to allow easy testing of the sort
|
||||
* code path, and may be useful to DBAs as an additional control knob.)
|
||||
*
|
||||
* NOTE: this test will need adjustment if a bucket is ever different from
|
||||
* one page.
|
||||
* one page. Also, "initial index size" accounting does not include the
|
||||
* metapage, nor the first bitmap page.
|
||||
*/
|
||||
if (num_buckets >= (uint32)g_instance.attr.attr_storage.NBuffers)
|
||||
buildstate.spool = _h_spoolinit(index, num_buckets, &indexInfo->ii_desc);
|
||||
sort_threshold = (u_sess->attr.attr_memory.maintenance_work_mem * 1024L) / BLCKSZ;
|
||||
if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
|
||||
sort_threshold = Min(sort_threshold, g_instance.attr.attr_storage.NBuffers);
|
||||
else
|
||||
sort_threshold = Min(sort_threshold, u_sess->storage_cxt.NLocBuffer);
|
||||
|
||||
if (num_buckets >= (uint32)sort_threshold)
|
||||
buildstate.spool = _h_spoolinit(heap, index, num_buckets, &indexInfo->ii_desc);
|
||||
else
|
||||
buildstate.spool = NULL;
|
||||
|
||||
/* prepare to build the index */
|
||||
buildstate.indtuples = 0;
|
||||
buildstate.heapRel = heap;
|
||||
|
||||
/* do the heap scan */
|
||||
reltuples = tableam_index_build_scan(heap, index, indexInfo, true, hashbuildCallback, (void*)&buildstate, NULL);
|
||||
|
||||
if (buildstate.spool != NULL) {
|
||||
/* sort the tuples and insert them into the index */
|
||||
_h_indexbuild(buildstate.spool);
|
||||
_h_indexbuild(buildstate.spool, buildstate.heapRel);
|
||||
_h_spooldestroy(buildstate.spool);
|
||||
}
|
||||
|
||||
|
@ -114,7 +131,7 @@ Datum hashbuildempty(PG_FUNCTION_ARGS)
|
|||
{
|
||||
Relation index = (Relation)PG_GETARG_POINTER(0);
|
||||
|
||||
_hash_metapinit(index, 0, INIT_FORKNUM);
|
||||
_hash_init(index, 0, INIT_FORKNUM);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
@ -126,21 +143,24 @@ static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, con
|
|||
void *state)
|
||||
{
|
||||
HashBuildState *buildstate = (HashBuildState *)state;
|
||||
Datum index_values[1];
|
||||
bool index_isnull[1];
|
||||
IndexTuple itup;
|
||||
|
||||
/* Hash indexes don't index nulls, see notes in hashinsert */
|
||||
if (isnull[0]) {
|
||||
/* convert data to a hash key; on failure, do not insert anything */
|
||||
if (!_hash_convert_tuple(index,
|
||||
values, isnull,
|
||||
index_values, index_isnull))
|
||||
return;
|
||||
}
|
||||
|
||||
/* Either spool the tuple for sorting, or just put it into the index */
|
||||
if (buildstate->spool != NULL) {
|
||||
_h_spool(buildstate->spool, &htup->t_self, values, isnull);
|
||||
_h_spool(buildstate->spool, &htup->t_self, index_values, index_isnull);
|
||||
} else {
|
||||
/* form an index tuple and point it at the heap tuple */
|
||||
itup = _hash_form_tuple(index, values, isnull);
|
||||
itup = index_form_tuple(RelationGetDescr(index), index_values, index_isnull);
|
||||
itup->t_tid = htup->t_self;
|
||||
_hash_doinsert(index, itup);
|
||||
_hash_doinsert(index, itup, buildstate->heapRel);
|
||||
pfree(itup);
|
||||
}
|
||||
|
||||
|
@ -159,30 +179,22 @@ Datum hashinsert(PG_FUNCTION_ARGS)
|
|||
Datum *values = (Datum *)PG_GETARG_POINTER(1);
|
||||
bool *isnull = (bool *)PG_GETARG_POINTER(2);
|
||||
ItemPointer ht_ctid = (ItemPointer)PG_GETARG_POINTER(3);
|
||||
|
||||
#ifdef NOT_USED
|
||||
Relation heapRel = (Relation)PG_GETARG_POINTER(4);
|
||||
IndexUniqueCheck checkUnique = (IndexUniqueCheck)PG_GETARG_INT32(5);
|
||||
#endif
|
||||
Datum index_values[1];
|
||||
bool index_isnull[1];
|
||||
IndexTuple itup;
|
||||
|
||||
/*
|
||||
* If the single index key is null, we don't insert it into the index.
|
||||
* Hash tables support scans on '='. Relational algebra says that A = B
|
||||
* returns null if either A or B is null. This means that no
|
||||
* qualification used in an index scan could ever return true on a null
|
||||
* attribute. It also means that indices can't be used by ISNULL or
|
||||
* NOTNULL scans, but that's an artifact of the strategy map architecture
|
||||
* chosen in 1986, not of the way nulls are handled here.
|
||||
*/
|
||||
if (isnull[0])
|
||||
PG_RETURN_BOOL(false);
|
||||
/* convert data to a hash key; on failure, do not insert anything */
|
||||
if (!_hash_convert_tuple(rel,
|
||||
values, isnull,
|
||||
index_values, index_isnull))
|
||||
return false;
|
||||
|
||||
/* generate an index tuple */
|
||||
itup = _hash_form_tuple(rel, values, isnull);
|
||||
/* form an index tuple and point it at the heap tuple */
|
||||
itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
|
||||
itup->t_tid = *ht_ctid;
|
||||
|
||||
_hash_doinsert(rel, itup);
|
||||
_hash_doinsert(rel, itup, heapRel);
|
||||
|
||||
pfree(itup);
|
||||
|
||||
|
@ -212,7 +224,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
|
|||
* Reacquire the read lock here.
|
||||
*/
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
|
||||
|
||||
/*
|
||||
* If we've already initialized this scan, we can just advance it in the
|
||||
|
@ -224,16 +236,21 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
|
|||
/*
|
||||
* An insertion into the current index page could have happened while
|
||||
* we didn't have read lock on it. Re-find our position by looking
|
||||
* for the TID we previously returned. (Because we hold share lock on
|
||||
* the bucket, no deletions or splits could have occurred; therefore
|
||||
* we can expect that the TID still exists in the current index page,
|
||||
* at an offset >= where we were.)
|
||||
* for the TID we previously returned. (Because we hold a pin on the
|
||||
* primary bucket page, no deletions or splits could have occurred;
|
||||
* therefore we can expect that the TID still exists in the current
|
||||
* index page, at an offset >= where we were.)
|
||||
*/
|
||||
OffsetNumber maxoffnum;
|
||||
|
||||
buf = so->hashso_curbuf;
|
||||
Assert(BufferIsValid(buf));
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
/*
|
||||
* We don't need test for old snapshot here as the current buffer is
|
||||
* pinned, so vacuum can't clean the page.
|
||||
*/
|
||||
maxoffnum = PageGetMaxOffsetNumber(page);
|
||||
for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) {
|
||||
IndexTuple itup;
|
||||
|
@ -253,14 +270,22 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
|
|||
*/
|
||||
if (scan->kill_prior_tuple) {
|
||||
/*
|
||||
* Yes, so mark it by setting the LP_DEAD state in the item flags.
|
||||
* Yes, so remember it for later. (We'll deal with all such tuples
|
||||
* at once right after leaving the index page or at end of scan.)
|
||||
* If the caller reverses the indexscan direction, it is quite
|
||||
* possible that the same item might get entered multiple times.
|
||||
* But, we don't detect that; instead, we just forget any excess
|
||||
* entries.
|
||||
*/
|
||||
ItemIdMarkDead(PageGetItemId(page, offnum));
|
||||
if (so->killedItems == NULL)
|
||||
so->killedItems = (HashScanPosItem *)palloc(MaxIndexTuplesPerPage * sizeof(HashScanPosItem));
|
||||
|
||||
/*
|
||||
* Since this can be redone later if needed, mark as a hint.
|
||||
*/
|
||||
MarkBufferDirtyHint(buf, true);
|
||||
if (so->numKilled < MaxIndexTuplesPerPage) {
|
||||
so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
|
||||
so->killedItems[so->numKilled].indexOffset =
|
||||
ItemPointerGetOffsetNumber(&(so->hashso_curpos));
|
||||
so->numKilled++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -285,7 +310,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
|
|||
|
||||
/* Release read lock on current buffer, but keep it pinned */
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
/* Return current heap TID on success */
|
||||
scan->xs_ctup.t_self = so->hashso_heappos;
|
||||
|
@ -360,17 +385,20 @@ Datum hashbeginscan(PG_FUNCTION_ARGS)
|
|||
scan = RelationGetIndexScan(rel, nkeys, norderbys);
|
||||
|
||||
so = (HashScanOpaque)palloc(sizeof(HashScanOpaqueData));
|
||||
so->hashso_bucket_valid = false;
|
||||
so->hashso_bucket_blkno = 0;
|
||||
so->hashso_curbuf = InvalidBuffer;
|
||||
so->hashso_bucket_buf = InvalidBuffer;
|
||||
so->hashso_split_bucket_buf = InvalidBuffer;
|
||||
/* set position invalid (this will cause _hash_first call) */
|
||||
ItemPointerSetInvalid(&(so->hashso_curpos));
|
||||
ItemPointerSetInvalid(&(so->hashso_heappos));
|
||||
|
||||
scan->opaque = so;
|
||||
so->hashso_buc_populated = false;
|
||||
so->hashso_buc_split = false;
|
||||
|
||||
/* register scan in case we change pages it's using */
|
||||
_hash_regscan(scan);
|
||||
so->killedItems = NULL;
|
||||
so->numKilled = 0;
|
||||
|
||||
scan->opaque = so;
|
||||
|
||||
PG_RETURN_POINTER(scan);
|
||||
}
|
||||
|
@ -388,14 +416,13 @@ Datum hashrescan(PG_FUNCTION_ARGS)
|
|||
Relation rel = scan->indexRelation;
|
||||
|
||||
/* release any pin we still hold */
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_dropbuf(rel, so->hashso_curbuf);
|
||||
so->hashso_curbuf = InvalidBuffer;
|
||||
if (so->numKilled > 0) {
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
|
||||
_hash_kill_items(scan);
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
|
||||
}
|
||||
|
||||
/* release lock on bucket, too */
|
||||
if (so->hashso_bucket_blkno)
|
||||
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
|
||||
so->hashso_bucket_blkno = 0;
|
||||
_hash_dropscanbuf(rel, so);
|
||||
|
||||
/* set position invalid (this will cause _hash_first call) */
|
||||
ItemPointerSetInvalid(&(so->hashso_curpos));
|
||||
|
@ -407,10 +434,11 @@ Datum hashrescan(PG_FUNCTION_ARGS)
|
|||
rc = memmove_s(scan->keyData, (unsigned)scan->numberOfKeys * sizeof(ScanKeyData), scankey,
|
||||
(unsigned)scan->numberOfKeys * sizeof(ScanKeyData));
|
||||
securec_check(rc, "", "");
|
||||
|
||||
so->hashso_bucket_valid = false;
|
||||
}
|
||||
|
||||
so->hashso_buc_populated = false;
|
||||
so->hashso_buc_split = false;
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
|
@ -423,18 +451,20 @@ Datum hashendscan(PG_FUNCTION_ARGS)
|
|||
HashScanOpaque so = (HashScanOpaque)scan->opaque;
|
||||
Relation rel = scan->indexRelation;
|
||||
|
||||
/* don't need scan registered anymore */
|
||||
_hash_dropscan(scan);
|
||||
/*
|
||||
* Before leaving current page, deal with any killed items. Also, ensure
|
||||
* that we acquire lock on current page before calling _hash_kill_items.
|
||||
*/
|
||||
if (so->numKilled > 0) {
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
|
||||
_hash_kill_items(scan);
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
|
||||
}
|
||||
|
||||
/* release any pin we still hold */
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_dropbuf(rel, so->hashso_curbuf);
|
||||
so->hashso_curbuf = InvalidBuffer;
|
||||
_hash_dropscanbuf(rel, so);
|
||||
|
||||
/* release lock on bucket, too */
|
||||
if (so->hashso_bucket_blkno)
|
||||
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
|
||||
so->hashso_bucket_blkno = 0;
|
||||
if (so->killedItems != NULL)
|
||||
pfree(so->killedItems);
|
||||
|
||||
pfree(so);
|
||||
scan->opaque = NULL;
|
||||
|
@ -465,6 +495,9 @@ Datum hashrestrpos(PG_FUNCTION_ARGS)
|
|||
* The set of target tuples is specified via a callback routine that tells
|
||||
* whether any given heap tuple (identified by ItemPointer) is being deleted.
|
||||
*
|
||||
* This function also deletes the tuples that are moved by split to other
|
||||
* bucket.
|
||||
*
|
||||
* Result: a palloc'd struct containing statistical info for VACUUM displays.
|
||||
*/
|
||||
Datum hashbulkdelete(PG_FUNCTION_ARGS)
|
||||
|
@ -480,29 +513,24 @@ Datum hashbulkdelete(PG_FUNCTION_ARGS)
|
|||
Bucket orig_maxbucket;
|
||||
Bucket cur_maxbucket;
|
||||
Bucket cur_bucket;
|
||||
Buffer metabuf;
|
||||
Buffer metabuf = InvalidBuffer;
|
||||
HashMetaPage metap;
|
||||
HashMetaPageData local_metapage;
|
||||
errno_t rc;
|
||||
HashMetaPage cachedmetap;
|
||||
|
||||
tuples_removed = 0;
|
||||
num_index_tuples = 0;
|
||||
|
||||
/*
|
||||
* Read the metapage to fetch original bucket and tuple counts. Also, we
|
||||
* keep a copy of the last-seen metapage so that we can use its
|
||||
* hashm_spares[] values to compute bucket page addresses. This is a bit
|
||||
* hokey but perfectly safe, since the interesting entries in the spares
|
||||
* array cannot change under us; and it beats rereading the metapage for
|
||||
* each bucket.
|
||||
* We need a copy of the metapage so that we can use its hashm_spares[]
|
||||
* values to compute bucket page addresses, but a cached copy should be
|
||||
* good enough. (If not, we'll detect that further down and refresh the
|
||||
* cache as necessary.)
|
||||
*/
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
|
||||
metap = HashPageGetMeta(BufferGetPage(metabuf));
|
||||
orig_maxbucket = metap->hashm_maxbucket;
|
||||
orig_ntuples = metap->hashm_ntuples;
|
||||
rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage));
|
||||
securec_check(rc, "", "");
|
||||
_hash_relbuf(rel, metabuf);
|
||||
cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
|
||||
Assert(cachedmetap != NULL);
|
||||
|
||||
orig_maxbucket = cachedmetap->hashm_maxbucket;
|
||||
orig_ntuples = cachedmetap->hashm_ntuples;
|
||||
|
||||
/* Scan the buckets that we know exist */
|
||||
cur_bucket = 0;
|
||||
|
@ -512,90 +540,85 @@ loop_top:
|
|||
while (cur_bucket <= cur_maxbucket) {
|
||||
BlockNumber bucket_blkno;
|
||||
BlockNumber blkno;
|
||||
bool bucket_dirty = false;
|
||||
Buffer bucket_buf;
|
||||
Buffer buf;
|
||||
HashPageOpaque bucket_opaque;
|
||||
Page page;
|
||||
bool split_cleanup = false;
|
||||
|
||||
/* Get address of bucket's start page */
|
||||
bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
|
||||
bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
|
||||
|
||||
/* Exclusive-lock the bucket so we can shrink it */
|
||||
_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);
|
||||
|
||||
/* Shouldn't have any active scans locally, either */
|
||||
if (_hash_has_active_scan(rel, cur_bucket))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), (errmsg("hash index has active scan during VACUUM."))));
|
||||
|
||||
/* Scan each page in bucket */
|
||||
blkno = bucket_blkno;
|
||||
while (BlockNumberIsValid(blkno)) {
|
||||
Buffer buf;
|
||||
Page page;
|
||||
HashPageOpaque opaque;
|
||||
OffsetNumber offno;
|
||||
OffsetNumber maxoffno;
|
||||
OffsetNumber deletable[MaxOffsetNumber];
|
||||
int ndeletable = 0;
|
||||
|
||||
vacuum_delay_point();
|
||||
/*
|
||||
* We need to acquire a cleanup lock on the primary bucket page to wait
|
||||
* out concurrent scans before deleting the dead tuples.
|
||||
*/
|
||||
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
|
||||
LockBufferForCleanup(buf);
|
||||
_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
|
||||
|
||||
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy);
|
||||
page = BufferGetPage(buf);
|
||||
opaque = (HashPageOpaque)PageGetSpecialPointer(page);
|
||||
Assert(opaque->hasho_bucket == cur_bucket);
|
||||
page = BufferGetPage(buf);
|
||||
bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/* Scan each tuple in page */
|
||||
maxoffno = PageGetMaxOffsetNumber(page);
|
||||
for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
|
||||
IndexTuple itup;
|
||||
ItemPointer htup;
|
||||
|
||||
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offno));
|
||||
htup = &(itup->t_tid);
|
||||
if (callback(htup, callback_state, InvalidOid, InvalidBktId)) {
|
||||
/* mark the item for deletion */
|
||||
deletable[ndeletable++] = offno;
|
||||
tuples_removed += 1;
|
||||
} else
|
||||
num_index_tuples += 1;
|
||||
}
|
||||
/*
|
||||
* If the bucket contains tuples that are moved by split, then we need
|
||||
* to delete such tuples. We can't delete such tuples if the split
|
||||
* operation on bucket is not finished as those are needed by scans.
|
||||
*/
|
||||
if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) {
|
||||
split_cleanup = true;
|
||||
|
||||
/*
|
||||
* Apply deletions and write page if needed, advance to next page.
|
||||
* This bucket might have been split since we last held a lock on
|
||||
* the metapage. If so, hashm_maxbucket, hashm_highmask and
|
||||
* hashm_lowmask might be old enough to cause us to fail to remove
|
||||
* tuples left behind by the most recent split. To prevent that,
|
||||
* now that the primary page of the target bucket has been locked
|
||||
* (and thus can't be further split), check whether we need to
|
||||
* update our cached metapage data.
|
||||
*/
|
||||
blkno = opaque->hasho_nextblkno;
|
||||
|
||||
if (ndeletable > 0) {
|
||||
PageIndexMultiDelete(page, deletable, ndeletable);
|
||||
_hash_wrtbuf(rel, buf);
|
||||
bucket_dirty = true;
|
||||
} else
|
||||
_hash_relbuf(rel, buf);
|
||||
Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
|
||||
if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) {
|
||||
cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
|
||||
Assert(cachedmetap != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/* If we deleted anything, try to compact free space */
|
||||
if (bucket_dirty)
|
||||
_hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy);
|
||||
bucket_buf = buf;
|
||||
|
||||
/* Release bucket lock */
|
||||
_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);
|
||||
hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
|
||||
cachedmetap->hashm_maxbucket,
|
||||
cachedmetap->hashm_highmask,
|
||||
cachedmetap->hashm_lowmask, &tuples_removed,
|
||||
&num_index_tuples, split_cleanup,
|
||||
callback, callback_state);
|
||||
|
||||
_hash_dropbuf(rel, bucket_buf);
|
||||
|
||||
/* Advance to next bucket */
|
||||
cur_bucket++;
|
||||
}
|
||||
|
||||
if (BufferIsInvalid(metabuf))
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
|
||||
|
||||
/* Write-lock metapage and check for split since we started */
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
|
||||
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
|
||||
metap = HashPageGetMeta(BufferGetPage(metabuf));
|
||||
|
||||
if (cur_maxbucket != metap->hashm_maxbucket) {
|
||||
/* There's been a split, so process the additional bucket(s) */
|
||||
cur_maxbucket = metap->hashm_maxbucket;
|
||||
rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage));
|
||||
securec_check(rc, "", "");
|
||||
_hash_relbuf(rel, metabuf);
|
||||
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
|
||||
cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
|
||||
Assert(cachedmetap != NULL);
|
||||
cur_maxbucket = cachedmetap->hashm_maxbucket;
|
||||
goto loop_top;
|
||||
}
|
||||
|
||||
/* Okay, we're really done. Update tuple count in metapage. */
|
||||
START_CRIT_SECTION();
|
||||
if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) {
|
||||
/*
|
||||
* No one has split or inserted anything since start of scan, so
|
||||
|
@ -616,7 +639,27 @@ loop_top:
|
|||
num_index_tuples = metap->hashm_ntuples;
|
||||
}
|
||||
|
||||
_hash_wrtbuf(rel, metabuf);
|
||||
MarkBufferDirty(metabuf);
|
||||
|
||||
/* XLOG stuff */
|
||||
if (RelationNeedsWAL(rel)) {
|
||||
xl_hash_update_meta_page xlrec;
|
||||
XLogRecPtr recptr;
|
||||
|
||||
xlrec.ntuples = metap->hashm_ntuples;
|
||||
|
||||
XLogBeginInsert();
|
||||
XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
|
||||
|
||||
XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
|
||||
|
||||
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
|
||||
PageSetLSN(BufferGetPage(metabuf), recptr);
|
||||
}
|
||||
|
||||
END_CRIT_SECTION();
|
||||
|
||||
_hash_relbuf(rel, metabuf);
|
||||
|
||||
/* return statistics */
|
||||
if (stats == NULL)
|
||||
|
@ -652,9 +695,244 @@ Datum hashvacuumcleanup(PG_FUNCTION_ARGS)
|
|||
PG_RETURN_POINTER(stats);
|
||||
}
|
||||
|
||||
void hash_redo(XLogReaderState *record)
|
||||
/*
|
||||
* Helper function to perform deletion of index entries from a bucket.
|
||||
*
|
||||
* This function expects that the caller has acquired a cleanup lock on the
|
||||
* primary bucket page, and will return with a write lock again held on the
|
||||
* primary bucket page. The lock won't necessarily be held continuously,
|
||||
* though, because we'll release it when visiting overflow pages.
|
||||
*
|
||||
* It would be very bad if this function cleaned a page while some other
|
||||
* backend was in the midst of scanning it, because hashgettuple assumes
|
||||
* that the next valid TID will be greater than or equal to the current
|
||||
* valid TID. There can't be any concurrent scans in progress when we first
|
||||
* enter this function because of the cleanup lock we hold on the primary
|
||||
* bucket page, but as soon as we release that lock, there might be. We
|
||||
* handle that by conspiring to prevent those scans from passing our cleanup
|
||||
* scan. To do that, we lock the next page in the bucket chain before
|
||||
* releasing the lock on the previous page. (This type of lock chaining is
|
||||
* not ideal, so we might want to look for a better solution at some point.)
|
||||
*
|
||||
* We need to retain a pin on the primary bucket to ensure that no concurrent
|
||||
* split can start.
|
||||
*/
|
||||
void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
|
||||
BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
|
||||
uint32 maxbucket, uint32 highmask, uint32 lowmask,
|
||||
double *tuples_removed, double *num_index_tuples,
|
||||
bool split_cleanup,
|
||||
IndexBulkDeleteCallback callback, void *callback_state)
|
||||
{
|
||||
ereport(PANIC, (errmsg("hash_redo: unimplemented")));
|
||||
BlockNumber blkno;
|
||||
Buffer buf;
|
||||
Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
|
||||
bool bucket_dirty = false;
|
||||
|
||||
blkno = bucket_blkno;
|
||||
buf = bucket_buf;
|
||||
|
||||
if (split_cleanup)
|
||||
new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
|
||||
lowmask, maxbucket);
|
||||
|
||||
/* Scan each page in bucket */
|
||||
for (;;) {
|
||||
HashPageOpaque opaque;
|
||||
OffsetNumber offno;
|
||||
OffsetNumber maxoffno;
|
||||
Buffer next_buf;
|
||||
Page page;
|
||||
OffsetNumber deletable[MaxOffsetNumber];
|
||||
int ndeletable = 0;
|
||||
bool retain_pin = false;
|
||||
bool clear_dead_marking = false;
|
||||
|
||||
vacuum_delay_point();
|
||||
|
||||
page = BufferGetPage(buf);
|
||||
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/* Scan each tuple in page */
|
||||
maxoffno = PageGetMaxOffsetNumber(page);
|
||||
for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
|
||||
ItemPointer htup;
|
||||
IndexTuple itup;
|
||||
Bucket bucket;
|
||||
bool kill_tuple = false;
|
||||
|
||||
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno));
|
||||
htup = &(itup->t_tid);
|
||||
|
||||
/*
|
||||
* To remove the dead tuples, we strictly want to rely on the results
|
||||
* of the callback function; see btvacuumpage for the detailed reason.
|
||||
*/
|
||||
if (callback && callback(htup, callback_state, InvalidOid, InvalidBktId)) {
|
||||
kill_tuple = true;
|
||||
if (tuples_removed)
|
||||
*tuples_removed += 1;
|
||||
} else if (split_cleanup) {
|
||||
/* delete the tuples that are moved by split. */
|
||||
bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
|
||||
maxbucket, highmask, lowmask);
|
||||
/* mark the item for deletion */
|
||||
if (bucket != cur_bucket) {
|
||||
/*
|
||||
* We expect tuples to belong either to the current bucket or to
|
||||
* new_bucket. This is ensured because we don't allow
|
||||
* further splits from a bucket that contains garbage. See
|
||||
* comments in _hash_expandtable.
|
||||
*/
|
||||
Assert(bucket == new_bucket);
|
||||
kill_tuple = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (kill_tuple) {
|
||||
/* mark the item for deletion */
|
||||
deletable[ndeletable++] = offno;
|
||||
} else {
|
||||
/* we're keeping it, so count it */
|
||||
if (num_index_tuples)
|
||||
*num_index_tuples += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* retain the pin on primary bucket page till end of bucket scan */
|
||||
if (blkno == bucket_blkno)
|
||||
retain_pin = true;
|
||||
else
|
||||
retain_pin = false;
|
||||
|
||||
blkno = opaque->hasho_nextblkno;
|
||||
|
||||
/*
|
||||
* Apply deletions, advance to next page and write page if needed.
|
||||
*/
|
||||
if (ndeletable > 0) {
|
||||
/* No ereport(ERROR) until changes are logged */
|
||||
START_CRIT_SECTION();
|
||||
|
||||
PageIndexMultiDelete(page, deletable, ndeletable);
|
||||
bucket_dirty = true;
|
||||
|
||||
/*
|
||||
* Let us mark the page as clean if vacuum removes the DEAD tuples
|
||||
* from an index page. We do this by clearing
|
||||
* LH_PAGE_HAS_DEAD_TUPLES flag.
|
||||
*/
|
||||
if (tuples_removed && *tuples_removed > 0 && H_HAS_DEAD_TUPLES(opaque)) {
|
||||
opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
|
||||
clear_dead_marking = true;
|
||||
}
|
||||
|
||||
MarkBufferDirty(buf);
|
||||
|
||||
/* XLOG stuff */
|
||||
if (RelationNeedsWAL(rel)) {
|
||||
xl_hash_delete xlrec;
|
||||
XLogRecPtr recptr;
|
||||
|
||||
xlrec.clear_dead_marking = clear_dead_marking;
|
||||
xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
|
||||
|
||||
XLogBeginInsert();
|
||||
XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
|
||||
|
||||
/*
|
||||
* bucket buffer needs to be registered to ensure that we can
|
||||
* acquire a cleanup lock on it during replay.
|
||||
*/
|
||||
if (!xlrec.is_primary_bucket_page) {
|
||||
XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
|
||||
}
|
||||
|
||||
XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
|
||||
XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber));
|
||||
|
||||
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
|
||||
if (!xlrec.is_primary_bucket_page) {
|
||||
PageSetLSN(BufferGetPage(bucket_buf), recptr);
|
||||
}
|
||||
PageSetLSN(BufferGetPage(buf), recptr);
|
||||
}
|
||||
|
||||
END_CRIT_SECTION();
|
||||
}
|
||||
|
||||
/* bail out if there are no more pages to scan. */
|
||||
if (!BlockNumberIsValid(blkno))
|
||||
break;
|
||||
|
||||
next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
|
||||
LH_OVERFLOW_PAGE,
|
||||
bstrategy);
|
||||
|
||||
/*
|
||||
* release the lock on previous page after acquiring the lock on next
|
||||
* page
|
||||
*/
|
||||
if (retain_pin)
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
else
|
||||
_hash_relbuf(rel, buf);
|
||||
|
||||
buf = next_buf;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lock the bucket page to clear the garbage flag and squeeze the bucket.
|
||||
* If the current buffer is the same as the bucket buffer, then we already have
|
||||
* lock on bucket page.
|
||||
*/
|
||||
if (buf != bucket_buf) {
|
||||
_hash_relbuf(rel, buf);
|
||||
LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear the garbage flag from bucket after deleting the tuples that are
|
||||
* moved by split. We purposefully clear the flag before squeezing the
|
||||
* bucket, so that after a restart, vacuum won't again try to delete the
|
||||
* moved-by-split tuples.
|
||||
*/
|
||||
if (split_cleanup) {
|
||||
HashPageOpaque bucket_opaque;
|
||||
Page page;
|
||||
|
||||
page = BufferGetPage(bucket_buf);
|
||||
bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/* No ereport(ERROR) until changes are logged */
|
||||
START_CRIT_SECTION();
|
||||
|
||||
bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
|
||||
MarkBufferDirty(bucket_buf);
|
||||
|
||||
/* XLOG stuff */
|
||||
if (RelationNeedsWAL(rel)) {
|
||||
XLogRecPtr recptr;
|
||||
|
||||
XLogBeginInsert();
|
||||
XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
|
||||
|
||||
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
|
||||
PageSetLSN(page, recptr);
|
||||
}
|
||||
|
||||
END_CRIT_SECTION();
|
||||
}
|
||||
|
||||
/*
|
||||
* If we have deleted anything, try to compact free space. For squeezing
|
||||
* the bucket, we must have a cleanup lock, else it can impact the
|
||||
* ordering of tuples for a scan that has started before it.
|
||||
*/
|
||||
if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
|
||||
_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy);
|
||||
else
|
||||
LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
|
||||
}
|
||||
|
||||
Datum hashmerge(PG_FUNCTION_ARGS)
|
||||
|
|
|
@ -0,0 +1,861 @@
|
|||
/* -------------------------------------------------------------------------
|
||||
*
|
||||
* hash_xlog.cpp
|
||||
* WAL replay logic for hash index.
|
||||
*
|
||||
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/gausskernel/storage/access/hash/hash_xlog.cpp
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "access/xlogproc.h"
|
||||
#include "access/hash.h"
|
||||
#include "access/hash_xlog.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/transam.h"
|
||||
#include "access/xlogproc.h"
|
||||
#include "storage/procarray.h"
|
||||
#include "miscadmin.h"
|
||||
|
||||
/*
|
||||
* replay a hash index meta page
|
||||
*/
|
||||
static void hash_xlog_init_meta_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo metabuf;
|
||||
ForkNumber forknum;
|
||||
|
||||
/* create the index' metapage */
|
||||
XLogInitBufferForRedo(record, 0, &metabuf);
|
||||
Assert(BufferIsValid(metabuf.buf));
|
||||
HashRedoInitMetaPageOperatorPage(&metabuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
|
||||
/*
|
||||
* Force the on-disk state of init forks to always be in sync with the
|
||||
* state in shared buffers. See XLogReadBufferForRedoExtended. We need
|
||||
* special handling for init forks as create index operations don't log a
|
||||
* full page image of the metapage.
|
||||
*/
|
||||
XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
|
||||
if (forknum == INIT_FORKNUM)
|
||||
FlushOneBuffer(metabuf.buf);
|
||||
|
||||
/* all done */
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay a hash index bitmap page
|
||||
*/
|
||||
static void hash_xlog_init_bitmap_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo bitmapbuf;
|
||||
RedoBufferInfo metabuf;
|
||||
ForkNumber forknum;
|
||||
|
||||
/*
|
||||
* Initialize bitmap page
|
||||
*/
|
||||
XLogInitBufferForRedo(record, 0, &bitmapbuf);
|
||||
HashRedoInitBitmapPageOperatorBitmapPage(&bitmapbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(bitmapbuf.buf);
|
||||
|
||||
/*
|
||||
* Force the on-disk state of init forks to always be in sync with the
|
||||
* state in shared buffers. See XLogReadBufferForRedoExtended. We need
|
||||
* special handling for init forks as create index operations don't log a
|
||||
* full page image of the metapage.
|
||||
*/
|
||||
XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
|
||||
if (forknum == INIT_FORKNUM)
|
||||
FlushOneBuffer(bitmapbuf.buf);
|
||||
UnlockReleaseBuffer(bitmapbuf.buf);
|
||||
|
||||
/* add the new bitmap page to the metapage's list of bitmaps */
|
||||
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
|
||||
/*
|
||||
* Note: in normal operation, we'd update the metapage while still
|
||||
* holding lock on the bitmap page. But during replay it's not
|
||||
* necessary to hold that lock, since nobody can see it yet; the
|
||||
* creating transaction hasn't yet committed.
|
||||
*/
|
||||
HashRedoInitBitmapPageOperatorMetaPage(&metabuf);
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
|
||||
XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL);
|
||||
if (forknum == INIT_FORKNUM)
|
||||
FlushOneBuffer(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay a hash index insert without split
|
||||
*/
|
||||
static void hash_xlog_insert(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo buffer;
|
||||
RedoBufferInfo metabuf;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) {
|
||||
Size datalen;
|
||||
char *datapos = XLogRecGetBlockData(record, 0, &datalen);
|
||||
|
||||
HashRedoInsertOperatorPage(&buffer, XLogRecGetData(record), datapos, datalen);
|
||||
MarkBufferDirty(buffer.buf);
|
||||
}
|
||||
if (BufferIsValid(buffer.buf))
|
||||
UnlockReleaseBuffer(buffer.buf);
|
||||
|
||||
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
|
||||
/*
|
||||
* Note: in normal operation, we'd update the metapage while still
|
||||
* holding lock on the page we inserted into. But during replay it's
|
||||
* not necessary to hold that lock, since no other index updates can
|
||||
* be happening concurrently.
|
||||
*/
|
||||
HashRedoInsertOperatorMetaPage(&metabuf);
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay addition of overflow page for hash index
|
||||
*/
|
||||
static void hash_xlog_add_ovfl_page(XLogReaderState* record)
|
||||
{
|
||||
RedoBufferInfo leftbuf;
|
||||
RedoBufferInfo ovflbuf;
|
||||
RedoBufferInfo metabuf;
|
||||
BlockNumber leftblk;
|
||||
BlockNumber rightblk;
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk);
|
||||
XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk);
|
||||
|
||||
XLogInitBufferForRedo(record, 0, &ovflbuf);
|
||||
Assert(BufferIsValid(ovflbuf.buf));
|
||||
|
||||
data = XLogRecGetBlockData(record, 0, &datalen);
|
||||
HashRedoAddOvflPageOperatorOvflPage(&ovflbuf, leftblk, data, datalen);
|
||||
MarkBufferDirty(ovflbuf.buf);
|
||||
|
||||
if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) {
|
||||
HashRedoAddOvflPageOperatorLeftPage(&leftbuf, rightblk);
|
||||
MarkBufferDirty(leftbuf.buf);
|
||||
}
|
||||
|
||||
if (BufferIsValid(leftbuf.buf))
|
||||
UnlockReleaseBuffer(leftbuf.buf);
|
||||
UnlockReleaseBuffer(ovflbuf.buf);
|
||||
|
||||
/*
|
||||
* Note: in normal operation, we'd update the bitmap and meta page while
|
||||
* still holding lock on the overflow pages. But during replay it's not
|
||||
* necessary to hold those locks, since no other index updates can be
|
||||
* happening concurrently.
|
||||
*/
|
||||
if (XLogRecHasBlockRef(record, 2)) {
|
||||
RedoBufferInfo mapbuffer;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) {
|
||||
data = XLogRecGetBlockData(record, 2, &datalen);
|
||||
|
||||
HashRedoAddOvflPageOperatorMapPage(&mapbuffer, data);
|
||||
MarkBufferDirty(mapbuffer.buf);
|
||||
}
|
||||
if (BufferIsValid(mapbuffer.buf))
|
||||
UnlockReleaseBuffer(mapbuffer.buf);
|
||||
}
|
||||
|
||||
if (XLogRecHasBlockRef(record, 3)) {
|
||||
RedoBufferInfo newmapbuf;
|
||||
|
||||
XLogInitBufferForRedo(record, 3, &newmapbuf);
|
||||
|
||||
HashRedoAddOvflPageOperatorNewmapPage(&newmapbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(newmapbuf.buf);
|
||||
|
||||
UnlockReleaseBuffer(newmapbuf.buf);
|
||||
}
|
||||
|
||||
if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) {
|
||||
data = XLogRecGetBlockData(record, 4, &datalen);
|
||||
|
||||
HashRedoAddOvflPageOperatorMetaPage(&metabuf, XLogRecGetData(record), data, datalen);
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay allocation of page for split operation
|
||||
*/
|
||||
static void hash_xlog_split_allocate_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo oldbuf;
|
||||
RedoBufferInfo newbuf;
|
||||
RedoBufferInfo metabuf;
|
||||
Size datalen PG_USED_FOR_ASSERTS_ONLY;
|
||||
char *data = NULL;
|
||||
XLogRedoAction action;
|
||||
|
||||
/*
|
||||
* To be consistent with normal operation, here we take cleanup locks on
|
||||
* both the old and new buckets even though there can't be any concurrent
|
||||
* inserts.
|
||||
*/
|
||||
|
||||
/* replay the record for old bucket */
|
||||
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf);
|
||||
|
||||
/*
|
||||
* Note that we still update the page even if it was restored from a full
|
||||
* page image, because the special space is not included in the image.
|
||||
*/
|
||||
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
|
||||
HashRedoSplitAllocatePageOperatorObukPage(&oldbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(oldbuf.buf);
|
||||
}
|
||||
|
||||
/* replay the record for new bucket */
|
||||
XLogInitBufferForRedo(record, 1, &newbuf);
|
||||
HashRedoSplitAllocatePageOperatorNbukPage(&newbuf, XLogRecGetData(record));
|
||||
if (!IsBufferCleanupOK(newbuf.buf))
|
||||
elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock");
|
||||
MarkBufferDirty(newbuf.buf);
|
||||
|
||||
/*
|
||||
* We can release the lock on the old bucket earlier as well, but we do it
|
||||
* here to be consistent with normal operation.
|
||||
*/
|
||||
if (BufferIsValid(oldbuf.buf))
|
||||
UnlockReleaseBuffer(oldbuf.buf);
|
||||
if (BufferIsValid(newbuf.buf))
|
||||
UnlockReleaseBuffer(newbuf.buf);
|
||||
|
||||
/*
|
||||
* Note: in normal operation, we'd update the meta page while still
|
||||
* holding lock on the old and new bucket pages. But during replay it's
|
||||
* not necessary to hold those locks, since no other bucket splits can be
|
||||
* happening concurrently.
|
||||
*/
|
||||
|
||||
/* replay the record for metapage changes */
|
||||
if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) {
|
||||
data = XLogRecGetBlockData(record, 2, &datalen);
|
||||
|
||||
HashRedoSplitAllocatePageOperatorMetaPage(&metabuf, XLogRecGetData(record), data);
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay of split operation
|
||||
*/
|
||||
static void hash_xlog_split_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo buf;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED)
|
||||
elog(ERROR, "Hash split record did not contain a full-page image");
|
||||
|
||||
if (BufferIsValid(buf.buf))
|
||||
UnlockReleaseBuffer(buf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay completion of split operation
|
||||
*/
|
||||
static void hash_xlog_split_complete(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo oldbuf;
|
||||
RedoBufferInfo newbuf;
|
||||
XLogRedoAction action;
|
||||
|
||||
/* replay the record for old bucket */
|
||||
action = XLogReadBufferForRedo(record, 0, &oldbuf);
|
||||
|
||||
/*
|
||||
* Note that we still update the page even if it was restored from a full
|
||||
* page image, because the bucket flag is not included in the image.
|
||||
*/
|
||||
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
|
||||
HashRedoSplitCompleteOperatorObukPage(&oldbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(oldbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(oldbuf.buf))
|
||||
UnlockReleaseBuffer(oldbuf.buf);
|
||||
|
||||
/* replay the record for new bucket */
|
||||
action = XLogReadBufferForRedo(record, 1, &newbuf);
|
||||
|
||||
/*
|
||||
* Note that we still update the page even if it was restored from a full
|
||||
* page image, because the bucket flag is not included in the image.
|
||||
*/
|
||||
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
|
||||
HashRedoSplitCompleteOperatorNbukPage(&newbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(newbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(newbuf.buf))
|
||||
UnlockReleaseBuffer(newbuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay move of page contents for squeeze operation of hash index
|
||||
*/
|
||||
static void hash_xlog_move_page_contents(XLogReaderState *record)
|
||||
{
|
||||
XLogRecPtr lsn = record->EndRecPtr;
|
||||
xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record);
|
||||
RedoBufferInfo bucketbuf;
|
||||
RedoBufferInfo writebuf;
|
||||
RedoBufferInfo deletebuf;
|
||||
XLogRedoAction action;
|
||||
|
||||
bucketbuf.buf = InvalidBuffer;
|
||||
writebuf.buf = InvalidBuffer;
|
||||
deletebuf.buf = InvalidBuffer;
|
||||
|
||||
/*
|
||||
* Ensure we have a cleanup lock on primary bucket page before we start
|
||||
* with the actual replay operation. This is to ensure that neither a
|
||||
* scan can start nor a scan can be already-in-progress during the replay
|
||||
* of this operation. If we allow scans during this operation, then they
|
||||
* can miss some records or show the same record multiple times.
|
||||
*/
|
||||
if (xldata->is_prim_bucket_same_wrt) {
|
||||
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
|
||||
} else {
|
||||
/*
|
||||
* we don't care for return value as the purpose of reading bucketbuf
|
||||
* is to ensure a cleanup lock on primary bucket page.
|
||||
*/
|
||||
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
|
||||
|
||||
PageSetLSN(bucketbuf.pageinfo.page, lsn);
|
||||
|
||||
action = XLogReadBufferForRedo(record, 1, &writebuf);
|
||||
}
|
||||
|
||||
/* replay the record for adding entries in overflow buffer */
|
||||
if (action == BLK_NEEDS_REDO) {
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
data = XLogRecGetBlockData(record, 1, &datalen);
|
||||
|
||||
HashXlogMoveAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen);
|
||||
|
||||
MarkBufferDirty(writebuf.buf);
|
||||
}
|
||||
|
||||
/* replay the record for deleting entries from overflow buffer */
|
||||
if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) {
|
||||
char *ptr = NULL;
|
||||
Size len;
|
||||
|
||||
ptr = XLogRecGetBlockData(record, 2, &len);
|
||||
|
||||
HashXlogMoveDeleteOvflPageOperatorPage(&deletebuf, (void *)ptr, len);
|
||||
|
||||
MarkBufferDirty(deletebuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Replay is complete, now we can release the buffers. We release locks at
|
||||
* end of replay operation to ensure that we hold lock on primary bucket
|
||||
* page till end of operation. We can optimize by releasing the lock on
|
||||
* write buffer as soon as the operation for same is complete, if it is
|
||||
* not same as primary bucket page, but that doesn't seem to be worth
|
||||
* complicating the code.
|
||||
*/
|
||||
if (BufferIsValid(deletebuf.buf))
|
||||
UnlockReleaseBuffer(deletebuf.buf);
|
||||
|
||||
if (BufferIsValid(writebuf.buf))
|
||||
UnlockReleaseBuffer(writebuf.buf);
|
||||
|
||||
if (BufferIsValid(bucketbuf.buf))
|
||||
UnlockReleaseBuffer(bucketbuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay squeeze page operation of hash index
|
||||
*/
|
||||
static void hash_xlog_squeeze_page(XLogReaderState *record)
|
||||
{
|
||||
XLogRecPtr lsn = record->EndRecPtr;
|
||||
xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record);
|
||||
RedoBufferInfo bucketbuf;
|
||||
RedoBufferInfo writebuf;
|
||||
RedoBufferInfo ovflbuf;
|
||||
RedoBufferInfo prevbuf;
|
||||
RedoBufferInfo mapbuf;
|
||||
XLogRedoAction action;
|
||||
|
||||
bucketbuf.buf = InvalidBuffer;
|
||||
prevbuf.buf = InvalidBuffer;
|
||||
|
||||
/*
|
||||
* Ensure we have a cleanup lock on primary bucket page before we start
|
||||
* with the actual replay operation. This is to ensure that neither a
|
||||
* scan can start nor a scan can be already-in-progress during the replay
|
||||
* of this operation. If we allow scans during this operation, then they
|
||||
* can miss some records or show the same record multiple times.
|
||||
*/
|
||||
if (xldata->is_prim_bucket_same_wrt) {
|
||||
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
|
||||
} else {
|
||||
/*
|
||||
* we don't care for return value as the purpose of reading bucketbuf
|
||||
* is to ensure a cleanup lock on primary bucket page.
|
||||
*/
|
||||
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
|
||||
|
||||
PageSetLSN(bucketbuf.pageinfo.page, lsn);
|
||||
|
||||
action = XLogReadBufferForRedo(record, 1, &writebuf);
|
||||
}
|
||||
|
||||
/* replay the record for adding entries in overflow buffer */
|
||||
if (action == BLK_NEEDS_REDO) {
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
data = XLogRecGetBlockData(record, 1, &datalen);
|
||||
|
||||
HashXlogSqueezeAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen);
|
||||
|
||||
MarkBufferDirty(writebuf.buf);
|
||||
}
|
||||
|
||||
/* replay the record for initializing overflow buffer */
|
||||
if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) {
|
||||
HashXlogSqueezeInitOvflbufOperatorPage(&ovflbuf, XLogRecGetData(record));
|
||||
|
||||
MarkBufferDirty(ovflbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(ovflbuf.buf))
|
||||
UnlockReleaseBuffer(ovflbuf.buf);
|
||||
|
||||
/* replay the record for page previous to the freed overflow page */
|
||||
if (!xldata->is_prev_bucket_same_wrt &&
|
||||
XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) {
|
||||
HashXlogSqueezeUpdatePrevPageOperatorPage(&prevbuf, XLogRecGetData(record));
|
||||
|
||||
MarkBufferDirty(prevbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(prevbuf.buf))
|
||||
UnlockReleaseBuffer(prevbuf.buf);
|
||||
|
||||
/* replay the record for page next to the freed overflow page */
|
||||
if (XLogRecHasBlockRef(record, 4)) {
|
||||
RedoBufferInfo nextbuf;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) {
|
||||
HashXlogSqueezeUpdateNextPageOperatorPage(&nextbuf, XLogRecGetData(record));
|
||||
|
||||
MarkBufferDirty(nextbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(nextbuf.buf))
|
||||
UnlockReleaseBuffer(nextbuf.buf);
|
||||
}
|
||||
|
||||
if (BufferIsValid(writebuf.buf))
|
||||
UnlockReleaseBuffer(writebuf.buf);
|
||||
|
||||
if (BufferIsValid(bucketbuf.buf))
|
||||
UnlockReleaseBuffer(bucketbuf.buf);
|
||||
|
||||
/*
|
||||
* Note: in normal operation, we'd update the bitmap and meta page while
|
||||
* still holding lock on the primary bucket page and overflow pages. But
|
||||
* during replay it's not necessary to hold those locks, since no other
|
||||
* index updates can be happening concurrently.
|
||||
*/
|
||||
/* replay the record for bitmap page */
|
||||
if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) {
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
data = XLogRecGetBlockData(record, 5, &datalen);
|
||||
HashXlogSqueezeUpdateBitmapOperatorPage(&mapbuf, (void *)data);
|
||||
|
||||
MarkBufferDirty(mapbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(mapbuf.buf))
|
||||
UnlockReleaseBuffer(mapbuf.buf);
|
||||
|
||||
/* replay the record for meta page */
|
||||
if (XLogRecHasBlockRef(record, 6)) {
|
||||
RedoBufferInfo metabuf;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) {
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
data = XLogRecGetBlockData(record, 6, &datalen);
|
||||
HashXlogSqueezeUpdateMateOperatorPage(&metabuf, (void *)data);
|
||||
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * replay delete operation of hash index
 */
static void hash_xlog_delete(XLogReaderState *record)
{
    XLogRecPtr lsn = record->EndRecPtr;
    xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record);
    RedoBufferInfo bucketbuf;
    RedoBufferInfo deletebuf;
    XLogRedoAction action;

    bucketbuf.buf = InvalidBuffer;

    /*
     * Ensure we have a cleanup lock on the primary bucket page before we
     * start the actual replay operation. This guarantees that no scan can
     * start, and none can already be in progress, while the operation is
     * replayed; otherwise a concurrent scan could miss some records or see
     * the same record multiple times.
     */
    if (xldata->is_primary_bucket_page) {
        action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf);
    } else {
        /*
         * We don't care about the return value: the only purpose of reading
         * bucketbuf is to take a cleanup lock on the primary bucket page.
         */
        (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

        PageSetLSN(bucketbuf.pageinfo.page, lsn);

        action = XLogReadBufferForRedo(record, 1, &deletebuf);
    }

    /* replay the record for deleting entries in the bucket page */
    if (action == BLK_NEEDS_REDO) {
        char *ptr = NULL;
        Size len;

        ptr = XLogRecGetBlockData(record, 1, &len);

        HashXlogDeleteBlockOperatorPage(&deletebuf, XLogRecGetData(record), (void *)ptr, len);

        MarkBufferDirty(deletebuf.buf);
    }
    if (BufferIsValid(deletebuf.buf))
        UnlockReleaseBuffer(deletebuf.buf);

    if (BufferIsValid(bucketbuf.buf))
        UnlockReleaseBuffer(bucketbuf.buf);
}
|
||||
|
||||
/*
 * replay split cleanup flag operation for the primary bucket page
 */
static void hash_xlog_split_cleanup(XLogReaderState *record)
{
    RedoBufferInfo buffer;

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) {
        HashXlogSplitCleanupOperatorPage(&buffer);

        MarkBufferDirty(buffer.buf);
    }
    if (BufferIsValid(buffer.buf))
        UnlockReleaseBuffer(buffer.buf);
}

/*
 * replay update of the meta page
 */
static void hash_xlog_update_meta_page(XLogReaderState *record)
{
    RedoBufferInfo metabuf;

    if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) {
        HashXlogUpdateMetaOperatorPage(&metabuf, XLogRecGetData(record));

        MarkBufferDirty(metabuf.buf);
    }
    if (BufferIsValid(metabuf.buf))
        UnlockReleaseBuffer(metabuf.buf);
}
|
||||
|
||||
/*
|
||||
* Get the latestRemovedXid from the heap pages pointed at by the index
|
||||
* tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
|
||||
* on which this function is based.
|
||||
*/
|
||||
static TransactionId hash_xlog_vacuum_get_latestRemovedXid(XLogReaderState *record)
|
||||
{
|
||||
xl_hash_vacuum_one_page *xlrec;
|
||||
OffsetNumber *unused = NULL;
|
||||
Buffer ibuffer;
|
||||
Buffer hbuffer;
|
||||
Page ipage;
|
||||
Page hpage;
|
||||
RelFileNode rnode;
|
||||
BlockNumber blkno;
|
||||
ItemId iitemid;
|
||||
ItemId hitemid;
|
||||
IndexTuple itup;
|
||||
BlockNumber hblkno;
|
||||
OffsetNumber hoffnum;
|
||||
TransactionId latestRemovedXid = InvalidTransactionId;
|
||||
int i;
|
||||
|
||||
xlrec = (xl_hash_vacuum_one_page *) XLogRecGetData(record);
|
||||
|
||||
/*
|
||||
* If there's nothing running on the standby we don't need to derive a
|
||||
* full latestRemovedXid value, so use a fast path out of here. This
|
||||
* returns InvalidTransactionId, and so will conflict with all HS
|
||||
* transactions; but since we just worked out that that's zero people,
|
||||
* it's OK.
|
||||
*
|
||||
* XXX There is a race condition here, which is that a new backend might
|
||||
* start just after we look. If so, it cannot need to conflict, but this
|
||||
* coding will result in throwing a conflict anyway.
|
||||
*/
|
||||
if (CountDBBackends(InvalidOid) == 0)
|
||||
return latestRemovedXid;
|
||||
|
||||
/*
|
||||
* Check if WAL replay has reached a consistent database state. If not, we
|
||||
* must PANIC. See the definition of
|
||||
* btree_xlog_delete_get_latestRemovedXid for more details.
|
||||
*/
|
||||
if (!t_thrd.xlog_cxt.reachedConsistency)
|
||||
elog(PANIC, "hash_xlog_vacuum_get_latestRemovedXid: cannot operate with inconsistent data");
|
||||
|
||||
/*
|
||||
* Get index page. If the DB is consistent, this should not fail, nor
|
||||
* should any of the heap page fetches below. If one does, we return
|
||||
* InvalidTransactionId to cancel all HS transactions. That's probably
|
||||
* overkill, but it's safe, and certainly better than panicking here.
|
||||
*/
|
||||
XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
|
||||
ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
|
||||
|
||||
if (!BufferIsValid(ibuffer))
|
||||
return InvalidTransactionId;
|
||||
LockBuffer(ibuffer, HASH_READ);
|
||||
ipage = (Page) BufferGetPage(ibuffer);
|
||||
|
||||
/*
|
||||
* Loop through the deleted index items to obtain the TransactionId from
|
||||
* the heap items they point to.
|
||||
*/
|
||||
unused = (OffsetNumber *) ((char *) xlrec + SizeOfHashVacuumOnePage);
|
||||
|
||||
for (i = 0; i < xlrec->ntuples; i++) {
|
||||
/*
|
||||
* Identify the index tuple about to be deleted.
|
||||
*/
|
||||
iitemid = PageGetItemId(ipage, unused[i]);
|
||||
itup = (IndexTuple) PageGetItem(ipage, iitemid);
|
||||
|
||||
/*
|
||||
* Locate the heap page that the index tuple points at
|
||||
*/
|
||||
hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
||||
hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL, NULL);
|
||||
|
||||
if (!BufferIsValid(hbuffer)) {
|
||||
UnlockReleaseBuffer(ibuffer);
|
||||
return InvalidTransactionId;
|
||||
}
|
||||
LockBuffer(hbuffer, HASH_READ);
|
||||
hpage = (Page) BufferGetPage(hbuffer);
|
||||
|
||||
/*
|
||||
* Look up the heap tuple header that the index tuple points at by
|
||||
* using the heap node supplied with the xlrec. We can't use
|
||||
* heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
|
||||
* Note that we are not looking at tuple data here, just headers.
|
||||
*/
|
||||
hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
|
||||
hitemid = PageGetItemId(hpage, hoffnum);
|
||||
|
||||
/*
|
||||
* Follow any redirections until we find something useful.
|
||||
*/
|
||||
while (ItemIdIsRedirected(hitemid)) {
|
||||
hoffnum = ItemIdGetRedirect(hitemid);
|
||||
hitemid = PageGetItemId(hpage, hoffnum);
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
}
|
||||
|
||||
/*
|
||||
* If the heap item has storage, then read the header and use that to
|
||||
* set latestRemovedXid.
|
||||
*
|
||||
* Some LP_DEAD items may not be accessible, so we ignore them.
|
||||
*/
|
||||
if (ItemIdHasStorage(hitemid)) {
|
||||
HeapTupleData tuple;
|
||||
tuple.t_data = (HeapTupleHeader) PageGetItem(hpage, hitemid);
|
||||
HeapTupleCopyBaseFromPage(&tuple, &hpage);
|
||||
HeapTupleHeaderAdvanceLatestRemovedXid(&tuple, &latestRemovedXid);
|
||||
} else if (ItemIdIsDead(hitemid)) {
|
||||
/*
|
||||
* Conjecture: if hitemid is dead then it had xids before the xids
|
||||
* marked on LP_NORMAL items. So we just ignore this item and move
|
||||
* onto the next, for the purposes of calculating
|
||||
* latestRemovedxids.
|
||||
*/
|
||||
} else
|
||||
Assert(!ItemIdIsUsed(hitemid));
|
||||
|
||||
UnlockReleaseBuffer(hbuffer);
|
||||
}
|
||||
|
||||
UnlockReleaseBuffer(ibuffer);
|
||||
|
||||
/*
|
||||
* If all heap tuples were LP_DEAD then we will be returning
|
||||
* InvalidTransactionId here, which avoids conflicts. This matches
|
||||
* existing logic which assumes that LP_DEAD tuples must already be older
|
||||
* than the latestRemovedXid on the cleanup record that set them as
|
||||
* LP_DEAD, hence must already have generated a conflict.
|
||||
*/
|
||||
return latestRemovedXid;
|
||||
}
|
||||
|
||||
/*
|
||||
* replay delete operation in hash index to remove
|
||||
* tuples marked as DEAD during index tuple insertion.
|
||||
*/
|
||||
static void hash_xlog_vacuum_one_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo buffer;
|
||||
RedoBufferInfo metabuf;
|
||||
XLogRedoAction action;
|
||||
|
||||
/*
|
||||
* If we have any conflict processing to do, it must happen before we
|
||||
* update the page.
|
||||
*
|
||||
* Hash index records that are marked as LP_DEAD and being removed during
|
||||
* hash index tuple insertion can conflict with standby queries. You might
|
||||
* think that vacuum records would conflict as well, but we've handled
|
||||
* that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
|
||||
* cleaned by the vacuum of the heap and so we can resolve any conflicts
|
||||
* just once when that arrives. After that we know that no conflicts
|
||||
* exist from individual hash index vacuum records on that index.
|
||||
*/
|
||||
if (InHotStandby) {
|
||||
TransactionId latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record);
|
||||
RelFileNode rnode;
|
||||
|
||||
XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
|
||||
ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
|
||||
}
|
||||
|
||||
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer);
|
||||
|
||||
if (action == BLK_NEEDS_REDO) {
|
||||
Size len;
|
||||
|
||||
len = XLogRecGetDataLen(record);
|
||||
HashXlogVacuumOnePageOperatorPage(&buffer, XLogRecGetData(record), len);
|
||||
|
||||
MarkBufferDirty(buffer.buf);
|
||||
}
|
||||
if (BufferIsValid(buffer.buf))
|
||||
UnlockReleaseBuffer(buffer.buf);
|
||||
|
||||
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
|
||||
HashXlogVacuumMateOperatorPage(&metabuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
void hash_redo(XLogReaderState *record)
|
||||
{
|
||||
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
|
||||
|
||||
switch (info) {
|
||||
case XLOG_HASH_INIT_META_PAGE:
|
||||
hash_xlog_init_meta_page(record);
|
||||
break;
|
||||
case XLOG_HASH_INIT_BITMAP_PAGE:
|
||||
hash_xlog_init_bitmap_page(record);
|
||||
break;
|
||||
case XLOG_HASH_INSERT:
|
||||
hash_xlog_insert(record);
|
||||
break;
|
||||
case XLOG_HASH_ADD_OVFL_PAGE:
|
||||
hash_xlog_add_ovfl_page(record);
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
|
||||
hash_xlog_split_allocate_page(record);
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_PAGE:
|
||||
hash_xlog_split_page(record);
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_COMPLETE:
|
||||
hash_xlog_split_complete(record);
|
||||
break;
|
||||
case XLOG_HASH_MOVE_PAGE_CONTENTS:
|
||||
hash_xlog_move_page_contents(record);
|
||||
break;
|
||||
case XLOG_HASH_SQUEEZE_PAGE:
|
||||
hash_xlog_squeeze_page(record);
|
||||
break;
|
||||
case XLOG_HASH_DELETE:
|
||||
hash_xlog_delete(record);
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_CLEANUP:
|
||||
hash_xlog_split_cleanup(record);
|
||||
break;
|
||||
case XLOG_HASH_UPDATE_META_PAGE:
|
||||
hash_xlog_update_meta_page(record);
|
||||
break;
|
||||
case XLOG_HASH_VACUUM_ONE_PAGE:
|
||||
hash_xlog_vacuum_one_page(record);
|
||||
break;
|
||||
default:
|
||||
elog(PANIC, "hash_redo: unknown op code %u", info);
|
||||
}
|
||||
}
|
||||
|
||||
bool IsHashVacuumPages(XLogReaderState *record)
{
    uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));

    if (XLogRecGetRmid(record) == RM_HASH_ID) {
        if (info == XLOG_HASH_DELETE) {
            return true;
        }
    }

    return false;
}
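
/*
 * Usage sketch (hypothetical caller, not part of this change): a redo
 * dispatcher that needs to special-case hash-delete records could gate on
 * this helper before routing the record, e.g.
 *
 *     if (IsHashVacuumPages(record)) {
 *         // handle XLOG_HASH_DELETE on a serialized / conflict-aware path
 *     }
 *
 * Only the resource-manager id and the XLOG_HASH_DELETE opcode are checked,
 * exactly as in the function above.
 */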
|
|
@ -3,8 +3,8 @@
|
|||
* hashinsert.cpp
|
||||
* Item insertion in hash tables for Postgres.
|
||||
*
|
||||
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
|
@ -17,21 +17,30 @@
|
|||
#include "knl/knl_variable.h"
|
||||
|
||||
#include "access/hash.h"
|
||||
#include "access/hash_xlog.h"
|
||||
#include "access/heapam.h"
|
||||
#include "access/xloginsert.h"
|
||||
#include "miscadmin.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/rel_gs.h"
|
||||
#include "storage/lock/lwlock.h"
|
||||
#include "storage/buf/buf_internals.h"
|
||||
|
||||
static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode);
|
||||
|
||||
/*
|
||||
* _hash_doinsert() -- Handle insertion of a single index tuple.
|
||||
*
|
||||
* This routine is called by the public interface routines, hashbuild
|
||||
* and hashinsert. By here, itup is completely filled in.
|
||||
* This routine is called by the public interface routines, hashbuild
|
||||
* and hashinsert. By here, itup is completely filled in.
|
||||
*/
|
||||
void _hash_doinsert(Relation rel, IndexTuple itup)
|
||||
void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
|
||||
{
|
||||
Buffer buf;
|
||||
Buffer bucket_buf;
|
||||
Buffer metabuf;
|
||||
HashMetaPage metap;
|
||||
BlockNumber blkno;
|
||||
HashMetaPage usedmetap = NULL;
|
||||
Page metapage;
|
||||
Page page;
|
||||
HashPageOpaque pageopaque;
|
||||
|
@ -39,7 +48,7 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
|
|||
bool do_expand = false;
|
||||
uint32 hashkey;
|
||||
Bucket bucket;
|
||||
|
||||
OffsetNumber itup_off;
|
||||
/*
|
||||
* Get the hash key for the item (it's stored in the index tuple itself).
|
||||
*/
|
||||
|
@ -49,16 +58,16 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
|
|||
itemsz = IndexTupleDSize(*itup);
|
||||
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
|
||||
* need to be consistent */
|
||||
/*
|
||||
* Acquire shared split lock so we can compute the target bucket safely
|
||||
* (see README).
|
||||
*/
|
||||
_hash_getlock(rel, 0, HASH_SHARE);
|
||||
|
||||
/* Read the metapage */
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
|
||||
restart_insert:
|
||||
|
||||
/*
|
||||
* Read the metapage. We don't lock it yet; HashMaxItemSize() will
|
||||
* examine pd_pagesize_version, but that can't change so we can examine it
|
||||
* without a lock.
|
||||
*/
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
|
||||
metapage = BufferGetPage(metabuf);
|
||||
metap = HashPageGetMeta(metapage);
|
||||
|
||||
/*
|
||||
* Check whether the item can fit on a hash page at all. (Eventually, we
|
||||
|
@ -73,87 +82,154 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
|
|||
(unsigned long)HashMaxItemSize(metapage)),
|
||||
errhint("Values larger than a buffer page cannot be indexed.")));
|
||||
|
||||
/*
|
||||
* Compute the target bucket number, and convert to block number.
|
||||
*/
|
||||
bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask);
|
||||
/* Lock the primary bucket page for the target bucket. */
|
||||
buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE, &usedmetap);
|
||||
Assert(usedmetap != NULL);
|
||||
|
||||
blkno = BUCKET_TO_BLKNO(metap, bucket);
|
||||
/* remember the primary bucket buffer to release the pin on it at end. */
|
||||
bucket_buf = buf;
|
||||
|
||||
/* release lock on metapage, but keep pin since we'll need it again */
|
||||
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
|
||||
|
||||
/*
|
||||
* Acquire share lock on target bucket; then we can release split lock.
|
||||
*/
|
||||
_hash_getlock(rel, blkno, HASH_SHARE);
|
||||
|
||||
_hash_droplock(rel, 0, HASH_SHARE);
|
||||
|
||||
/* Fetch the primary bucket page for the bucket */
|
||||
buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
|
||||
page = BufferGetPage(buf);
|
||||
pageopaque = (HashPageOpaque)PageGetSpecialPointer(page);
|
||||
Assert(pageopaque->hasho_bucket == bucket);
|
||||
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
bucket = pageopaque->hasho_bucket;
|
||||
|
||||
/*
|
||||
* If this bucket is in the process of being split, try to finish the
|
||||
* split before inserting, because that might create room for the
|
||||
* insertion to proceed without allocating an additional overflow page.
|
||||
* It's only interesting to finish the split if we're trying to insert
|
||||
* into the bucket from which we're removing tuples (the "old" bucket),
|
||||
* not if we're trying to insert into the bucket into which tuples are
|
||||
* being moved (the "new" bucket).
|
||||
*/
|
||||
if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf)) {
|
||||
/* release the lock on bucket buffer, before completing the split. */
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
_hash_finish_split(rel, metabuf, buf, bucket,
|
||||
usedmetap->hashm_maxbucket,
|
||||
usedmetap->hashm_highmask,
|
||||
usedmetap->hashm_lowmask);
|
||||
|
||||
/* release the pin on old and meta buffer. retry for insert. */
|
||||
_hash_dropbuf(rel, buf);
|
||||
_hash_dropbuf(rel, metabuf);
|
||||
goto restart_insert;
|
||||
}
|
||||
|
||||
/* Do the insertion */
|
||||
while (PageGetFreeSpace(page) < itemsz) {
|
||||
BlockNumber nextblkno;
|
||||
|
||||
/*
|
||||
* Check if current page has any DEAD tuples. If yes, delete these
|
||||
* tuples and see if we can get a space for the new item to be
|
||||
* inserted before moving to the next page in the bucket chain.
|
||||
*/
|
||||
if (H_HAS_DEAD_TUPLES(pageopaque)) {
|
||||
if (IsBufferCleanupOK(buf)) {
|
||||
_hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);
|
||||
|
||||
if (PageGetFreeSpace(page) >= itemsz)
|
||||
break; /* OK, now we have enough space */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* no space on this page; check for an overflow page
|
||||
*/
|
||||
BlockNumber nextblkno = pageopaque->hasho_nextblkno;
|
||||
nextblkno = pageopaque->hasho_nextblkno;
|
||||
|
||||
if (BlockNumberIsValid(nextblkno)) {
|
||||
/*
|
||||
* ovfl page exists; go get it. if it doesn't have room, we'll
|
||||
* find out next pass through the loop test above.
|
||||
* find out next pass through the loop test above. we always
|
||||
* release both the lock and pin if this is an overflow page, but
|
||||
* only the lock if this is the primary bucket page, since the pin
|
||||
* on the primary bucket must be retained throughout the scan.
|
||||
*/
|
||||
_hash_relbuf(rel, buf);
|
||||
if (buf != bucket_buf)
|
||||
_hash_relbuf(rel, buf);
|
||||
else
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
|
||||
page = BufferGetPage(buf);
|
||||
} else {
|
||||
/*
|
||||
* we're at the end of the bucket chain and we haven't found a
|
||||
* page with enough room. allocate a new overflow page.
|
||||
*
|
||||
* release our write lock without modifying buffer
|
||||
*/
|
||||
_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
|
||||
|
||||
/* release our write lock without modifying buffer */
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
/* chain to a new overflow page */
|
||||
buf = _hash_addovflpage(rel, metabuf, buf);
|
||||
buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
/* should fit now, given test above */
|
||||
Assert(PageGetFreeSpace(page) >= itemsz);
|
||||
}
|
||||
pageopaque = (HashPageOpaque)PageGetSpecialPointer(page);
|
||||
Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
|
||||
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
|
||||
Assert(pageopaque->hasho_bucket == bucket);
|
||||
}
|
||||
|
||||
/* found page with enough space, so add the item here */
|
||||
(void)_hash_pgaddtup(rel, buf, itemsz, itup);
|
||||
|
||||
/* write and release the modified page */
|
||||
_hash_wrtbuf(rel, buf);
|
||||
|
||||
/* We can drop the bucket lock now */
|
||||
_hash_droplock(rel, blkno, HASH_SHARE);
|
||||
|
||||
/*
|
||||
* Write-lock the metapage so we can increment the tuple count. After
|
||||
* incrementing it, check to see if it's time for a split.
|
||||
*/
|
||||
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
|
||||
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
|
||||
|
||||
/* Do the update. No ereport(ERROR) until changes are logged */
|
||||
START_CRIT_SECTION();
|
||||
|
||||
/* found page with enough space, so add the item here */
|
||||
itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
|
||||
MarkBufferDirty(buf);
|
||||
|
||||
    /* metapage operations */
    metap = HashPageGetMeta(metapage);
    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples > (double)metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
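
    /*
     * Worked example (illustrative numbers only): with hashm_ffactor = 10 and
     * hashm_maxbucket = 3 (i.e. 4 primary buckets), the threshold is
     * 10 * (3 + 1) = 40, so do_expand becomes true once hashm_ntuples reaches
     * 41 and _hash_expandtable() is attempted after the insert completes.
     */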
|
||||
|
||||
/* Write out the metapage and drop lock, but keep pin */
|
||||
_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
|
||||
MarkBufferDirty(metabuf);
|
||||
|
||||
/* XLOG stuff */
|
||||
if (RelationNeedsWAL(rel)) {
|
||||
xl_hash_insert xlrec;
|
||||
XLogRecPtr recptr;
|
||||
|
||||
xlrec.offnum = itup_off;
|
||||
|
||||
XLogBeginInsert();
|
||||
XLogRegisterData((char *) &xlrec, SizeOfHashInsert);
|
||||
|
||||
XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
|
||||
|
||||
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
|
||||
XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));
|
||||
|
||||
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);
|
||||
|
||||
PageSetLSN(BufferGetPage(buf), recptr);
|
||||
PageSetLSN(BufferGetPage(metabuf), recptr);
|
||||
}
|
||||
|
||||
END_CRIT_SECTION();
|
||||
|
||||
/* drop lock on metapage, but keep pin */
|
||||
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
/*
|
||||
* Release the modified page and ensure to release the pin on primary
|
||||
* page.
|
||||
*/
|
||||
_hash_relbuf(rel, buf);
|
||||
if (buf != bucket_buf)
|
||||
_hash_dropbuf(rel, bucket_buf);
|
||||
|
||||
/* Attempt to split if a split is needed */
|
||||
if (do_expand)
|
||||
|
@ -192,3 +268,130 @@ OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple
|
|||
|
||||
return itup_off;
|
||||
}
|
||||
|
||||
/*
|
||||
* _hash_pgaddmultitup() -- add a tuple vector to a particular page in the index.
|
||||
*
|
||||
* This routine has same requirements for locking and tuple ordering as
|
||||
* _hash_pgaddtup().
|
||||
*
|
||||
* Returns the offset number array at which the tuples were inserted.
|
||||
*/
|
||||
void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, OffsetNumber *itup_offsets, uint16 nitups)
|
||||
{
|
||||
OffsetNumber itup_off;
|
||||
Page page;
|
||||
uint32 hashkey;
|
||||
int i;
|
||||
|
||||
_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
for (i = 0; i < nitups; i++) {
|
||||
Size itemsize;
|
||||
|
||||
itemsize = IndexTupleDSize(*itups[i]);
|
||||
itemsize = MAXALIGN(itemsize);
|
||||
|
||||
/* Find where to insert the tuple (preserving page's hashkey ordering) */
|
||||
hashkey = _hash_get_indextuple_hashkey(itups[i]);
|
||||
itup_off = _hash_binsearch(page, hashkey);
|
||||
|
||||
itup_offsets[i] = itup_off;
|
||||
|
||||
if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) == InvalidOffsetNumber)
|
||||
elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* _hash_vacuum_one_page - vacuum just one index page.
|
||||
*
|
||||
* Try to remove LP_DEAD items from the given page. We must acquire cleanup
|
||||
* lock on the page being modified before calling this function.
|
||||
*/
|
||||
|
||||
static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode)
|
||||
{
|
||||
OffsetNumber deletable[MaxOffsetNumber];
|
||||
int ndeletable = 0;
|
||||
OffsetNumber offnum;
|
||||
OffsetNumber maxoff;
|
||||
Page page = BufferGetPage(buf);
|
||||
HashPageOpaque pageopaque;
|
||||
HashMetaPage metap;
|
||||
|
||||
/* Scan each tuple in page to see if it is marked as LP_DEAD */
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) {
|
||||
ItemId itemId = PageGetItemId(page, offnum);
|
||||
|
||||
if (ItemIdIsDead(itemId))
|
||||
deletable[ndeletable++] = offnum;
|
||||
}
|
||||
|
||||
if (ndeletable > 0) {
|
||||
/*
|
||||
* Write-lock the meta page so that we can decrement tuple count.
|
||||
*/
|
||||
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
|
||||
|
||||
/* No ereport(ERROR) until changes are logged */
|
||||
START_CRIT_SECTION();
|
||||
|
||||
PageIndexMultiDelete(page, deletable, ndeletable);
|
||||
|
||||
/*
|
||||
* Mark the page as not containing any LP_DEAD items. This is not
|
||||
* certainly true (there might be some that have recently been marked,
|
||||
* but weren't included in our target-item list), but it will almost
|
||||
* always be true and it doesn't seem worth an additional page scan to
|
||||
* check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
|
||||
* anyway.
|
||||
*/
|
||||
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
|
||||
|
||||
metap = HashPageGetMeta(BufferGetPage(metabuf));
|
||||
metap->hashm_ntuples -= ndeletable;
|
||||
|
||||
MarkBufferDirty(buf);
|
||||
MarkBufferDirty(metabuf);
|
||||
|
||||
/* XLOG stuff */
|
||||
if (RelationNeedsWAL(rel)) {
|
||||
xl_hash_vacuum_one_page xlrec;
|
||||
XLogRecPtr recptr;
|
||||
|
||||
xlrec.hnode = hnode;
|
||||
xlrec.ntuples = ndeletable;
|
||||
|
||||
XLogBeginInsert();
|
||||
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
|
||||
XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);
|
||||
|
||||
/*
|
||||
* We need the target-offsets array whether or not we store the
|
||||
* whole buffer, to allow us to find the latestRemovedXid on a
|
||||
* standby server.
|
||||
*/
|
||||
XLogRegisterData((char *) deletable,
|
||||
ndeletable * sizeof(OffsetNumber));
|
||||
|
||||
XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
|
||||
|
||||
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);
|
||||
|
||||
PageSetLSN(BufferGetPage(buf), recptr);
|
||||
PageSetLSN(BufferGetPage(metabuf), recptr);
|
||||
}
|
||||
|
||||
END_CRIT_SECTION();
|
||||
|
||||
/*
|
||||
* Releasing write lock on meta page as we have updated the tuple
|
||||
* count.
|
||||
*/
|
||||
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
|
||||
}
|
||||
}
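
/*
 * Layout sketch of the XLOG_HASH_VACUUM_ONE_PAGE record registered above
 * (illustration only): the fixed xl_hash_vacuum_one_page header is followed
 * immediately by the target-offsets array, which is why the redo side in
 * hash_xlog_vacuum_get_latestRemovedXid reads it as
 *
 *     OffsetNumber *unused = (OffsetNumber *) ((char *) xlrec + SizeOfHashVacuumOnePage);
 *
 * and then walks xlrec->ntuples entries.
 */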
|
||||
|
|
File diff suppressed because it is too large
File diff suppressed because it is too large
|
@ -1,138 +0,0 @@
|
|||
/* -------------------------------------------------------------------------
|
||||
*
|
||||
* hashscan.cpp
|
||||
* manage scans on hash tables
|
||||
*
|
||||
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/gausskernel/storage/access/hash/hashscan.cpp
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
#include "knl/knl_variable.h"
|
||||
|
||||
#include "access/hash.h"
|
||||
#include "access/relscan.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/rel_gs.h"
|
||||
#include "utils/resowner.h"
|
||||
|
||||
/*
|
||||
* We track all of a backend's active scans on hash indexes using a list
|
||||
* of HashScanListData structs, which are allocated in t_thrd.top_mem_cxt.
|
||||
* It's okay to use a long-lived context because we rely on the ResourceOwner
|
||||
* mechanism to clean up unused entries after transaction or subtransaction
|
||||
* abort. We can't safely keep the entries in the executor's per-query
|
||||
* context, because that might be already freed before we get a chance to
|
||||
* clean up the list. (XXX seems like there should be a better way to
|
||||
* manage this...)
|
||||
*/
|
||||
typedef struct HashScanListData {
|
||||
IndexScanDesc hashsl_scan;
|
||||
ResourceOwner hashsl_owner;
|
||||
struct HashScanListData *hashsl_next;
|
||||
} HashScanListData;
|
||||
|
||||
typedef HashScanListData *HashScanList;
|
||||
|
||||
/*
|
||||
* ReleaseResources_hash() --- clean up hash subsystem resources.
|
||||
*
|
||||
* This is here because it needs to touch this module's static var HashScans.
|
||||
*/
|
||||
void ReleaseResources_hash(void)
|
||||
{
|
||||
HashScanList l = NULL;
|
||||
HashScanList prev = NULL;
|
||||
HashScanList next = NULL;
|
||||
|
||||
/*
|
||||
* Release all HashScanList items belonging to the current ResourceOwner.
|
||||
* Note that we do not release the underlying IndexScanDesc; that's in
|
||||
* executor memory and will go away on its own (in fact quite possibly has
|
||||
* gone away already, so we mustn't try to touch it here).
|
||||
*
|
||||
* Note: this should be a no-op during normal query shutdown. However, in
|
||||
* an abort situation ExecutorEnd is not called and so there may be open
|
||||
* index scans to clean up.
|
||||
*/
|
||||
prev = NULL;
|
||||
|
||||
for (l = u_sess->exec_cxt.HashScans; l != NULL; l = next) {
|
||||
next = l->hashsl_next;
|
||||
if (l->hashsl_owner == t_thrd.utils_cxt.CurrentResourceOwner) {
|
||||
if (prev == NULL)
|
||||
u_sess->exec_cxt.HashScans = next;
|
||||
else
|
||||
prev->hashsl_next = next;
|
||||
|
||||
pfree(l);
|
||||
/* prev does not change */
|
||||
} else
|
||||
prev = l;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* _hash_regscan() -- register a new scan.
|
||||
*/
|
||||
void _hash_regscan(IndexScanDesc scan)
|
||||
{
|
||||
HashScanList new_el;
|
||||
|
||||
new_el = (HashScanList)MemoryContextAlloc(
|
||||
SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE), sizeof(HashScanListData));
|
||||
new_el->hashsl_scan = scan;
|
||||
new_el->hashsl_owner = t_thrd.utils_cxt.CurrentResourceOwner;
|
||||
new_el->hashsl_next = u_sess->exec_cxt.HashScans;
|
||||
u_sess->exec_cxt.HashScans = new_el;
|
||||
}
|
||||
|
||||
/*
|
||||
* _hash_dropscan() -- drop a scan from the scan list
|
||||
*/
|
||||
void _hash_dropscan(IndexScanDesc scan)
|
||||
{
|
||||
HashScanList chk = NULL;
|
||||
HashScanList last = NULL;
|
||||
|
||||
last = NULL;
|
||||
for (chk = u_sess->exec_cxt.HashScans; chk != NULL && chk->hashsl_scan != scan; chk = chk->hashsl_next)
|
||||
last = chk;
|
||||
|
||||
if (chk == NULL)
|
||||
ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("hash scan list trashed")));
|
||||
|
||||
if (last == NULL)
|
||||
u_sess->exec_cxt.HashScans = chk->hashsl_next;
|
||||
else
|
||||
last->hashsl_next = chk->hashsl_next;
|
||||
|
||||
pfree(chk);
|
||||
}
|
||||
|
||||
/*
|
||||
* Is there an active scan in this bucket?
|
||||
*/
|
||||
bool _hash_has_active_scan(Relation rel, Bucket bucket)
|
||||
{
|
||||
Oid relid = RelationGetRelid(rel);
|
||||
HashScanList l = NULL;
|
||||
|
||||
for (l = u_sess->exec_cxt.HashScans; l != NULL; l = l->hashsl_next) {
|
||||
if (relid == l->hashsl_scan->indexRelation->rd_id) {
|
||||
HashScanOpaque so = (HashScanOpaque)l->hashsl_scan->opaque;
|
||||
|
||||
if (so->hashso_bucket_valid && so->hashso_bucket == bucket)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
|
@ -3,8 +3,8 @@
|
|||
* hashsearch.cpp
|
||||
* search code for openGauss hash tables
|
||||
*
|
||||
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
|
@ -64,40 +64,131 @@ bool _hash_next(IndexScanDesc scan, ScanDirection dir)
|
|||
}
|
||||
|
||||
/*
|
||||
* Advance to next page in a bucket, if any.
|
||||
* Advance to next page in a bucket, if any. If we are scanning the bucket
|
||||
* being populated during split operation then this function advances to the
|
||||
* bucket being split after the last bucket page of bucket being populated.
|
||||
*/
|
||||
static void _hash_readnext(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
|
||||
static void _hash_readnext(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep)
|
||||
{
|
||||
BlockNumber blkno;
|
||||
Relation rel = scan->indexRelation;
|
||||
HashScanOpaque so = (HashScanOpaque)scan->opaque;
|
||||
bool block_found = false;
|
||||
|
||||
blkno = (*opaquep)->hasho_nextblkno;
|
||||
_hash_relbuf(rel, *bufp);
|
||||
|
||||
/*
|
||||
* Retain the pin on primary bucket page till the end of scan. Refer the
|
||||
* comments in _hash_first to know the reason of retaining pin.
|
||||
*/
|
||||
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
|
||||
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
|
||||
else
|
||||
_hash_relbuf(rel, *bufp);
|
||||
|
||||
*bufp = InvalidBuffer;
|
||||
/* check for interrupts while we're not holding any buffer lock */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
if (BlockNumberIsValid(blkno)) {
|
||||
*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
|
||||
block_found = true;
|
||||
} else if (so->hashso_buc_populated && !so->hashso_buc_split) {
|
||||
/*
|
||||
* end of bucket, scan bucket being split if there was a split in
|
||||
* progress at the start of scan.
|
||||
*/
|
||||
*bufp = so->hashso_split_bucket_buf;
|
||||
|
||||
/*
|
||||
* buffer for bucket being split must be valid as we acquire the pin
|
||||
* on it before the start of scan and retain it till end of scan.
|
||||
*/
|
||||
Assert(BufferIsValid(*bufp));
|
||||
|
||||
LockBuffer(*bufp, BUFFER_LOCK_SHARE);
|
||||
|
||||
/*
|
||||
* setting hashso_buc_split to true indicates that we are scanning
|
||||
* bucket being split.
|
||||
*/
|
||||
so->hashso_buc_split = true;
|
||||
|
||||
block_found = true;
|
||||
}
|
||||
|
||||
if (block_found) {
|
||||
*pagep = BufferGetPage(*bufp);
|
||||
*opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep);
|
||||
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Advance to previous page in a bucket, if any.
|
||||
* Advance to previous page in a bucket, if any. If the current scan has
|
||||
* started during split operation then this function advances to bucket
|
||||
* being populated after the first bucket page of bucket being split.
|
||||
*/
|
||||
static void _hash_readprev(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
|
||||
static void _hash_readprev(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep)
|
||||
{
|
||||
BlockNumber blkno;
|
||||
|
||||
Relation rel = scan->indexRelation;
|
||||
HashScanOpaque so = (HashScanOpaque) scan->opaque;
|
||||
bool haveprevblk;
|
||||
|
||||
blkno = (*opaquep)->hasho_prevblkno;
|
||||
_hash_relbuf(rel, *bufp);
|
||||
/*
|
||||
* Retain the pin on primary bucket page till the end of scan. Refer the
|
||||
* comments in _hash_first to know the reason of retaining pin.
|
||||
*/
|
||||
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) {
|
||||
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
|
||||
haveprevblk = false;
|
||||
} else {
|
||||
_hash_relbuf(rel, *bufp);
|
||||
haveprevblk = true;
|
||||
}
|
||||
*bufp = InvalidBuffer;
|
||||
/* check for interrupts while we're not holding any buffer lock */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
if (BlockNumberIsValid(blkno)) {
|
||||
*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
|
||||
if (haveprevblk) {
|
||||
Assert(BlockNumberIsValid(blkno));
|
||||
*bufp = _hash_getbuf(rel, blkno, HASH_READ,
|
||||
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
|
||||
*pagep = BufferGetPage(*bufp);
|
||||
*opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep);
|
||||
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
|
||||
|
||||
/*
|
||||
* We always maintain the pin on bucket page for whole scan operation,
|
||||
* so releasing the additional pin we have acquired here.
|
||||
*/
|
||||
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
|
||||
_hash_dropbuf(rel, *bufp);
|
||||
} else if (so->hashso_buc_populated && so->hashso_buc_split) {
|
||||
/*
|
||||
* end of bucket, scan bucket being populated if there was a split in
|
||||
* progress at the start of scan.
|
||||
*/
|
||||
*bufp = so->hashso_bucket_buf;
|
||||
|
||||
/*
|
||||
* buffer for bucket being populated must be valid as we acquire the
|
||||
* pin on it before the start of scan and retain it till end of scan.
|
||||
*/
|
||||
Assert(BufferIsValid(*bufp));
|
||||
|
||||
LockBuffer(*bufp, BUFFER_LOCK_SHARE);
|
||||
*pagep = BufferGetPage(*bufp);
|
||||
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
|
||||
|
||||
/* move to the end of bucket chain */
|
||||
while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
|
||||
_hash_readnext(scan, bufp, pagep, opaquep);
|
||||
|
||||
/*
|
||||
* setting hashso_buc_split to false indicates that we are scanning
|
||||
* bucket being populated.
|
||||
*/
|
||||
so->hashso_buc_split = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -117,12 +208,9 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
|
|||
ScanKey cur;
|
||||
uint32 hashkey;
|
||||
Bucket bucket;
|
||||
BlockNumber blkno;
|
||||
Buffer buf;
|
||||
Buffer metabuf;
|
||||
Page page;
|
||||
HashPageOpaque opaque;
|
||||
HashMetaPage metap;
|
||||
IndexTuple itup;
|
||||
ItemPointer current;
|
||||
OffsetNumber offnum;
|
||||
|
@ -174,48 +262,71 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
|
|||
|
||||
so->hashso_sk_hash = hashkey;
|
||||
|
||||
/*
|
||||
* Acquire shared split lock so we can compute the target bucket safely
|
||||
* (see README).
|
||||
*/
|
||||
_hash_getlock(rel, 0, HASH_SHARE);
|
||||
|
||||
/* Read the metapage */
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
|
||||
metap = HashPageGetMeta(BufferGetPage(metabuf));
|
||||
|
||||
/*
|
||||
* Compute the target bucket number, and convert to block number.
|
||||
*/
|
||||
bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask);
|
||||
|
||||
blkno = BUCKET_TO_BLKNO(metap, bucket);
|
||||
|
||||
/* done with the metapage */
|
||||
_hash_relbuf(rel, metabuf);
|
||||
|
||||
/*
|
||||
* Acquire share lock on target bucket; then we can release split lock.
|
||||
*/
|
||||
_hash_getlock(rel, blkno, HASH_SHARE);
|
||||
|
||||
_hash_droplock(rel, 0, HASH_SHARE);
|
||||
|
||||
/* Update scan opaque state to show we have lock on the bucket */
|
||||
so->hashso_bucket = bucket;
|
||||
so->hashso_bucket_valid = true;
|
||||
so->hashso_bucket_blkno = blkno;
|
||||
|
||||
/* Fetch the primary bucket page for the bucket */
|
||||
buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
|
||||
buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
|
||||
page = BufferGetPage(buf);
|
||||
opaque = (HashPageOpaque)PageGetSpecialPointer(page);
|
||||
Assert(opaque->hasho_bucket == bucket);
|
||||
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
bucket = opaque->hasho_bucket;
|
||||
|
||||
so->hashso_bucket_buf = buf;
|
||||
/*
|
||||
* If a bucket split is in progress, then while scanning the bucket being
|
||||
* populated, we need to skip tuples that were copied from bucket being
|
||||
* split. We also need to maintain a pin on the bucket being split to
|
||||
* ensure that split-cleanup work done by vacuum doesn't remove tuples
|
||||
* from it till this scan is done. We need to maintain a pin on the
|
||||
* bucket being populated to ensure that vacuum doesn't squeeze that
|
||||
* bucket till this scan is complete; otherwise, the ordering of tuples
|
||||
* can't be maintained during forward and backward scans. Here, we have
|
||||
* to be cautious about locking order: first, acquire the lock on bucket
|
||||
* being split; then, release the lock on it but not the pin; then,
|
||||
* acquire a lock on bucket being populated and again re-verify whether
|
||||
* the bucket split is still in progress. Acquiring the lock on bucket
|
||||
* being split first ensures that the vacuum waits for this scan to
|
||||
* finish.
|
||||
*/
|
||||
if (H_BUCKET_BEING_POPULATED(opaque)) {
|
||||
BlockNumber old_blkno;
|
||||
Buffer old_buf;
|
||||
|
||||
old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);
|
||||
|
||||
/*
|
||||
* release the lock on new bucket and re-acquire it after acquiring
|
||||
* the lock on old bucket.
|
||||
*/
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);
|
||||
|
||||
/*
|
||||
* remember the split bucket buffer so as to use it later for
|
||||
* scanning.
|
||||
*/
|
||||
so->hashso_split_bucket_buf = old_buf;
|
||||
LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
LockBuffer(buf, BUFFER_LOCK_SHARE);
|
||||
page = BufferGetPage(buf);
|
||||
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
Assert(opaque->hasho_bucket == bucket);
|
||||
|
||||
if (H_BUCKET_BEING_POPULATED(opaque)) {
|
||||
so->hashso_buc_populated = true;
|
||||
} else {
|
||||
_hash_dropbuf(rel, so->hashso_split_bucket_buf);
|
||||
so->hashso_split_bucket_buf = InvalidBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
/* If a backwards scan is requested, move to the end of the chain */
|
||||
if (ScanDirectionIsBackward(dir)) {
|
||||
while (BlockNumberIsValid(opaque->hasho_nextblkno))
|
||||
_hash_readnext(rel, &buf, &page, &opaque);
|
||||
/*
|
||||
* Backward scans that start during split needs to start from end of
|
||||
* bucket being split.
|
||||
*/
|
||||
while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
|
||||
(so->hashso_buc_populated && !so->hashso_buc_split))
|
||||
_hash_readnext(scan, &buf, &page, &opaque);
|
||||
}
|
||||
|
||||
/* Now find the first tuple satisfying the qualification */
|
||||
|
@ -239,6 +350,12 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
|
|||
* false. Else, return true and set the hashso_curpos for the
|
||||
* scan to the right thing.
|
||||
*
|
||||
* Here we need to ensure that if the scan has started during split, then
|
||||
* skip the tuples that are moved by split while scanning bucket being
|
||||
* populated and then scan the bucket being split to cover all such
|
||||
* tuples. This is done to ensure that we don't miss tuples in the scans
|
||||
* that are started during split.
|
||||
*
|
||||
* 'bufP' points to the current buffer, which is pinned and read-locked.
|
||||
* On success exit, we have pin and read-lock on whichever page
|
||||
* contains the right item; on failure, we have released all buffers.
|
||||
|
@ -283,9 +400,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
|||
do {
|
||||
switch (dir) {
|
||||
case ForwardScanDirection:
|
||||
if (offnum != InvalidOffsetNumber)
|
||||
if (offnum != InvalidOffsetNumber) {
|
||||
offnum = OffsetNumberNext(offnum); /* move forward */
|
||||
else {
|
||||
} else {
|
||||
/* new page, locate starting position by binary search */
|
||||
offnum = _hash_binsearch(page, so->hashso_sk_hash);
|
||||
}
|
||||
|
@ -298,14 +415,27 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
|||
if (offnum <= maxoff) {
|
||||
Assert(offnum >= FirstOffsetNumber);
|
||||
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum));
|
||||
/*
|
||||
* skip the tuples that are moved by split operation
|
||||
* for the scan that has started when split was in
|
||||
* progress
|
||||
*/
|
||||
if (so->hashso_buc_populated && !so->hashso_buc_split &&
|
||||
(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) {
|
||||
offnum = OffsetNumberNext(offnum); /* move forward */
|
||||
continue;
|
||||
}
|
||||
|
||||
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
|
||||
break; /* yes, so exit for-loop */
|
||||
}
|
||||
|
||||
/* Before leaving current page, deal with any killed items */
|
||||
if (so->numKilled > 0)
|
||||
_hash_kill_items(scan);
|
||||
/*
|
||||
* ran off the end of this page, try the next
|
||||
*/
|
||||
_hash_readnext(rel, &buf, &page, &opaque);
|
||||
_hash_readnext(scan, &buf, &page, &opaque);
|
||||
if (BufferIsValid(buf)) {
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
offnum = _hash_binsearch(page, so->hashso_sk_hash);
|
||||
|
@ -318,9 +448,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
|||
break;
|
||||
|
||||
case BackwardScanDirection:
|
||||
if (offnum != InvalidOffsetNumber)
|
||||
if (offnum != InvalidOffsetNumber) {
|
||||
offnum = OffsetNumberPrev(offnum); /* move back */
|
||||
else {
|
||||
} else {
|
||||
/* new page, locate starting position by binary search */
|
||||
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
|
||||
}
|
||||
|
@ -333,14 +463,26 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
|||
if (offnum >= FirstOffsetNumber) {
|
||||
Assert(offnum <= maxoff);
|
||||
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum));
|
||||
/*
|
||||
* skip the tuples that are moved by split operation
|
||||
* for the scan that has started when split was in
|
||||
* progress
|
||||
*/
|
||||
if (so->hashso_buc_populated && !so->hashso_buc_split &&
|
||||
(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) {
|
||||
offnum = OffsetNumberPrev(offnum); /* move back */
|
||||
continue;
|
||||
}
|
||||
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
|
||||
break; /* yes, so exit for-loop */
|
||||
}
|
||||
|
||||
/* Before leaving current page, deal with any killed items */
|
||||
if (so->numKilled > 0)
|
||||
_hash_kill_items(scan);
|
||||
/*
|
||||
* ran off the end of this page, try the next
|
||||
*/
|
||||
_hash_readprev(rel, &buf, &page, &opaque);
|
||||
_hash_readprev(scan, &buf, &page, &opaque);
|
||||
if (BufferIsValid(buf)) {
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
|
||||
|
@ -360,9 +502,16 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
|
|||
}
|
||||
|
||||
if (itup == NULL) {
|
||||
/* we ran off the end of the bucket without finding a match */
|
||||
/*
|
||||
* We ran off the end of the bucket without finding a match.
|
||||
* Release the pin on bucket buffers. Normally, such pins are
|
||||
* released at end of scan, however scrolling cursors can
|
||||
* reacquire the bucket lock and pin in the same scan multiple
|
||||
* times.
|
||||
*/
|
||||
*bufP = so->hashso_curbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(current);
|
||||
_hash_dropscanbuf(rel, so);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -14,8 +14,8 @@
|
|||
* plenty of locality of access.
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
|
@ -37,15 +37,23 @@
|
|||
struct HSpool {
|
||||
Tuplesortstate *sortstate; /* state data for tuplesort.c */
|
||||
Relation index;
|
||||
/*
|
||||
* We sort the hash keys based on the buckets they belong to. Below masks
|
||||
* are used in _hash_hashkey2bucket to determine the bucket of given hash
|
||||
* key.
|
||||
*/
|
||||
uint32 high_mask;
|
||||
uint32 low_mask;
|
||||
uint32 max_buckets;
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* create and initialize a spool structure
|
||||
*/
|
||||
HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo)
|
||||
HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void *meminfo)
|
||||
{
|
||||
HSpool *hspool = (HSpool *)palloc0(sizeof(HSpool));
|
||||
uint32 hash_mask;
|
||||
UtilityDesc *desc = (UtilityDesc *)meminfo;
|
||||
int work_mem = (desc->query_mem[0] > 0) ? desc->query_mem[0] : u_sess->attr.attr_memory.maintenance_work_mem;
|
||||
int max_mem = (desc->query_mem[1] > 0) ? desc->query_mem[1] : 0;
|
||||
|
@ -57,18 +65,26 @@ HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo)
|
|||
* num_buckets buckets in the index, the appropriate mask can be computed
|
||||
* as follows.
|
||||
*
|
||||
* Note: at present, the passed-in num_buckets is always a power of 2, so
|
||||
* we could just compute num_buckets - 1. We prefer not to assume that
|
||||
* here, though.
|
||||
* NOTE : This hash mask calculation should be in sync with similar
|
||||
* calculation in _hash_init_metabuffer.
|
||||
*/
|
||||
    hash_mask = (((uint32)1) << _hash_log2(num_buckets)) - 1;
    hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1;
    hspool->low_mask = (hspool->high_mask >> 1);
    hspool->max_buckets = num_buckets - 1;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation. This should be OK since a single backend can't
     * run multiple index creations in parallel.
     */
    hspool->sortstate = tuplesort_begin_index_hash(index, hash_mask, work_mem, false, max_mem);
    hspool->sortstate = tuplesort_begin_index_hash(heap,
                                                   index,
                                                   hspool->high_mask,
                                                   hspool->low_mask,
                                                   hspool->max_buckets,
                                                   work_mem,
                                                   false,
                                                   max_mem);

    return hspool;
}
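
/*
 * Worked example (illustrative, assuming _hash_log2 returns the ceiling of
 * log2 as in the existing helper): for num_buckets = 1000,
 * _hash_log2(1001) = 10, so high_mask = 1023, low_mask = 511 and
 * max_buckets = 999. If _hash_hashkey2bucket follows the usual
 * openGauss/PostgreSQL definition, a sort key is mapped to
 * (hashkey & high_mask), falling back to (hashkey & low_mask) whenever the
 * first result exceeds max_buckets, which keeps every spooled tuple inside
 * the buckets that actually exist at build time.
 */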
|
||||
|
@ -94,7 +110,7 @@ void _h_spool(HSpool *hspool, ItemPointer self, Datum *values, const bool *isnul
|
|||
* given a spool loaded by successive calls to _h_spool,
|
||||
* create an entire index.
|
||||
*/
|
||||
void _h_indexbuild(HSpool *hspool)
|
||||
void _h_indexbuild(HSpool *hspool, Relation heapRel)
|
||||
{
|
||||
IndexTuple itup;
|
||||
bool should_free = false;
|
||||
|
@ -102,7 +118,7 @@ void _h_indexbuild(HSpool *hspool)
|
|||
tuplesort_performsort(hspool->sortstate);
|
||||
|
||||
while ((itup = tuplesort_getindextuple(hspool->sortstate, true, &should_free)) != NULL) {
|
||||
_hash_doinsert(hspool->index, itup);
|
||||
_hash_doinsert(hspool->index, itup, heapRel);
|
||||
if (should_free)
|
||||
pfree(itup);
|
||||
}
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
* hashutil.cpp
|
||||
* Utility code for openGauss hash implementation.
|
||||
*
|
||||
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
|
@ -22,7 +22,9 @@
|
|||
#include "utils/lsyscache.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/rel_gs.h"
|
||||
#include "storage/buf/buf_internals.h"
|
||||
|
||||
#define CALC_NEW_BUCKET(old_bucket, lowmask) ((old_bucket) | ((lowmask) + 1))
|
||||
/*
|
||||
* _hash_checkqual -- does the index tuple satisfy the scan conditions?
|
||||
*/
|
||||
|
@ -133,6 +135,70 @@ uint32 _hash_log2(uint32 num)
|
|||
return i;
|
||||
}
|
||||
|
||||
/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the bucket
 */
uint32 _hash_spareindex(uint32 num_bucket)
{
    uint32 splitpoint_group;
    uint32 splitpoint_phases;

    splitpoint_group = _hash_log2(num_bucket);

    if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
        return splitpoint_group;

    /* account for single-phase groups */
    splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

    /* account for multi-phase groups before splitpoint_group */
    splitpoint_phases +=
        ((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
         HASH_SPLITPOINT_PHASE_BITS);

    /* account for phases within current group */
    splitpoint_phases +=
        (((num_bucket - 1) >>
          (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
         HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */

    return splitpoint_phases;
}
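
/*
 * Worked example (assuming the upstream PostgreSQL constants
 * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10, HASH_SPLITPOINT_PHASE_BITS = 2,
 * HASH_SPLITPOINT_PHASE_MASK = 3, and _hash_log2 as ceiling log2):
 *
 *     _hash_spareindex(600)  == 10   // group 10, phase 0: (599 >> 7) & 3 == 0
 *     _hash_spareindex(1024) == 13   // group 10, phase 3: (1023 >> 7) & 3 == 3
 *     _hash_spareindex(2048) == 17   // group 11, phase 3: 10 + (1 << 2) + 3
 */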
|
||||
|
||||
/*
 * _hash_get_totalbuckets -- returns total number of buckets allocated till
 *                           the given splitpoint phase.
 */
uint32 _hash_get_totalbuckets(uint32 splitpoint_phase)
{
    uint32 splitpoint_group;
    uint32 total_buckets;
    uint32 phases_within_splitpoint_group;

    if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
        return (1 << splitpoint_phase);

    /* get splitpoint's group */
    splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
    splitpoint_group +=
        ((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
         HASH_SPLITPOINT_PHASE_BITS);

    /* account for buckets before splitpoint_group */
    total_buckets = (1 << (splitpoint_group - 1));

    /* account for buckets within splitpoint_group */
    phases_within_splitpoint_group =
        (((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
          HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
    total_buckets +=
        (((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
         phases_within_splitpoint_group);

    return total_buckets;
}
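
/*
 * Round-trip check of the previous example (same assumed constants):
 *
 *     _hash_get_totalbuckets(10) == 640    // 512 + (512 >> 2) * 1
 *     _hash_get_totalbuckets(13) == 1024   // 512 + (512 >> 2) * 4
 *     _hash_get_totalbuckets(17) == 2048   // 1024 + (1024 >> 2) * 4
 *
 * i.e. each splitpoint phase past group 9 allocates a quarter of the bucket
 * count present at the start of its group, so _hash_spareindex(n) always
 * maps n buckets to a phase whose total allocation is >= n.
 */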
|
||||
|
||||
|
||||
/*
|
||||
* _hash_checkpage -- sanity checks on the format of all hash pages
|
||||
*
|
||||
|
@ -216,25 +282,36 @@ uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
|
|||
}
|
||||
|
||||
/*
|
||||
* _hash_form_tuple - form an index tuple containing hash code only
|
||||
* _hash_convert_tuple - convert raw index data to hash key
|
||||
*
|
||||
* Inputs: values and isnull arrays for the user data column(s)
|
||||
* Outputs: values and isnull arrays for the index tuple, suitable for
|
||||
* passing to index_form_tuple().
|
||||
*
|
||||
* Returns true if successful, false if not (because there are null values).
|
||||
* On a false result, the given data need not be indexed.
|
||||
*
|
||||
* Note: callers know that the index-column arrays are always of length 1.
|
||||
* In principle, there could be more than one input column, though we do not
|
||||
* currently support that.
|
||||
*/
|
||||
IndexTuple _hash_form_tuple(Relation index, Datum *values, const bool *isnull)
|
||||
bool _hash_convert_tuple(Relation index,
|
||||
Datum *user_values, const bool *user_isnull,
|
||||
Datum *index_values, bool *index_isnull)
|
||||
{
|
||||
IndexTuple itup;
|
||||
uint32 hashkey;
|
||||
Datum hashkeydatum;
|
||||
TupleDesc hashdesc;
|
||||
|
||||
if (isnull[0]) {
|
||||
hashkeydatum = (Datum)0;
|
||||
} else {
|
||||
hashkey = _hash_datum2hashkey(index, values[0]);
|
||||
hashkeydatum = UInt32GetDatum(hashkey);
|
||||
}
|
||||
hashdesc = RelationGetDescr(index);
|
||||
Assert(hashdesc->natts == 1);
|
||||
itup = index_form_tuple(hashdesc, &hashkeydatum, isnull);
|
||||
return itup;
|
||||
/*
|
||||
* We do not insert null values into hash indexes. This is okay because
|
||||
* the only supported search operator is '=', and we assume it is strict.
|
||||
*/
|
||||
if (user_isnull[0])
|
||||
return false;
|
||||
|
||||
hashkey = _hash_datum2hashkey(index, user_values[0]);
|
||||
index_values[0] = UInt32GetDatum(hashkey);
|
||||
index_isnull[0] = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -312,3 +389,154 @@ OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value)
|
|||
|
||||
return lower;
|
||||
}
|
||||
|
||||
/*
|
||||
* _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
|
||||
* from which current (new) bucket is being split.
|
||||
*/
|
||||
BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
|
||||
{
|
||||
Bucket old_bucket;
|
||||
uint32 mask;
|
||||
Buffer metabuf;
|
||||
HashMetaPage metap;
|
||||
BlockNumber blkno;
|
||||
|
||||
/*
|
||||
* To get the old bucket from the current bucket, we need a mask to modulo
|
||||
* into lower half of table. This mask is stored in meta page as
|
||||
* hashm_lowmask, but here we can't rely on the same, because we need a
|
||||
* value of lowmask that was prevalent at the time when bucket split was
|
||||
* started. Masking the most significant bit of new bucket would give us
|
||||
* old bucket.
|
||||
*/
|
||||
mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
|
||||
old_bucket = new_bucket & mask;
|
||||
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
|
||||
metap = HashPageGetMeta(BufferGetPage(metabuf));
|
||||
|
||||
blkno = BUCKET_TO_BLKNO(metap, old_bucket);
|
||||
|
||||
_hash_relbuf(rel, metabuf);
|
||||
|
||||
return blkno;
|
||||
}
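/*
 * Editor's illustration (hypothetical numbers, not from this tree): for
 * new_bucket = 13 (binary 1101), fls(13) = 4, so mask = (1 << 3) - 1 = 0x7
 * and old_bucket = 13 & 0x7 = 5, i.e. the new bucket number with its most
 * significant bit cleared, exactly as the comment above describes.
 */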
|
||||
|
||||
/*
|
||||
* _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
|
||||
* that will be generated after split from old bucket.
|
||||
*
|
||||
* This is used to find the new bucket from old bucket based on current table
|
||||
* half. It is mainly required to finish the incomplete splits where we are
|
||||
* sure that not more than one bucket could have split in progress from old
|
||||
* bucket.
|
||||
*/
|
||||
BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
|
||||
{
|
||||
Bucket new_bucket;
|
||||
Buffer metabuf;
|
||||
HashMetaPage metap;
|
||||
BlockNumber blkno;
|
||||
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
|
||||
metap = HashPageGetMeta(BufferGetPage(metabuf));
|
||||
|
||||
new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
|
||||
metap->hashm_lowmask,
|
||||
metap->hashm_maxbucket);
|
||||
blkno = BUCKET_TO_BLKNO(metap, new_bucket);
|
||||
|
||||
_hash_relbuf(rel, metabuf);
|
||||
|
||||
return blkno;
|
||||
}
|
||||
|
||||
/*
|
||||
* _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
|
||||
* generated after split from current (old) bucket.
|
||||
*
|
||||
* This is used to find the new bucket from old bucket. New bucket can be
|
||||
* obtained by OR'ing old bucket with most significant bit of current table
|
||||
* half (lowmask passed in this function can be used to identify msb of
|
||||
* current table half). There could be multiple buckets that could have
|
||||
* been split from current bucket. We need the first such bucket that exists.
|
||||
* Caller must ensure that no more than one split has happened from old
|
||||
* bucket.
|
||||
*/
|
||||
Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
|
||||
uint32 lowmask, uint32 maxbucket)
|
||||
{
|
||||
Bucket new_bucket;
|
||||
|
||||
new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
|
||||
if (new_bucket > maxbucket) {
|
||||
lowmask = lowmask >> 1;
|
||||
new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
|
||||
}
|
||||
|
||||
return new_bucket;
|
||||
}
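/*
 * Editor's illustration: CALC_NEW_BUCKET is assumed (as in upstream
 * PostgreSQL) to be old_bucket | (lowmask + 1); its definition is not part
 * of this hunk. With old_bucket = 5 (0101) and lowmask = 7, the new bucket
 * is 5 | 8 = 13 (1101), the mirror image of the old-bucket derivation in
 * _hash_get_oldblock_from_newbucket() above.
 */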
|
||||
|
||||
/*
|
||||
* _hash_kill_items - set LP_DEAD state for items an indexscan caller has
|
||||
* told us were killed.
|
||||
*
|
||||
* scan->opaque, referenced locally through so, contains information about the
|
||||
* current page and killed tuples thereon (generally, this should only be
|
||||
* called if so->numKilled > 0).
|
||||
*
|
||||
* We match items by heap TID before assuming they are the right ones to
|
||||
* delete.
|
||||
*/
|
||||
void _hash_kill_items(IndexScanDesc scan)
|
||||
{
|
||||
HashScanOpaque so = (HashScanOpaque) scan->opaque;
|
||||
Page page;
|
||||
HashPageOpaque opaque;
|
||||
OffsetNumber offnum;
|
||||
OffsetNumber maxoff;
|
||||
int numKilled = so->numKilled;
|
||||
int i;
|
||||
bool killedsomething = false;
|
||||
|
||||
Assert(so->numKilled > 0);
|
||||
Assert(so->killedItems != NULL);
|
||||
|
||||
/*
|
||||
* Always reset the scan state, so we don't look for same items on other
|
||||
* pages.
|
||||
*/
|
||||
so->numKilled = 0;
|
||||
|
||||
page = BufferGetPage(so->hashso_curbuf);
|
||||
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
|
||||
for (i = 0; i < numKilled; i++) {
|
||||
offnum = so->killedItems[i].indexOffset;
|
||||
|
||||
while (offnum <= maxoff) {
|
||||
ItemId iid = PageGetItemId(page, offnum);
|
||||
IndexTuple ituple = (IndexTuple)PageGetItem(page, iid);
|
||||
|
||||
if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid)) {
|
||||
/* found the item */
|
||||
ItemIdMarkDead(iid);
|
||||
killedsomething = true;
|
||||
break; /* out of inner search loop */
|
||||
}
|
||||
offnum = OffsetNumberNext(offnum);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Since this can be redone later if needed, mark as dirty hint. Whenever
|
||||
* we mark anything LP_DEAD, we also set the page's
|
||||
* LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
|
||||
*/
|
||||
if (killedsomething) {
|
||||
opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
|
||||
MarkBufferDirtyHint(so->hashso_curbuf, true);
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
|
@ -196,6 +196,9 @@ bool XLogBlockRefreshRedoBufferInfo(XLogBlockHead *blockhead, RedoBufferInfo *bu
|
|||
if (bufferinfo->blockinfo.rnode.relNode != XLogBlockHeadGetRelNode(blockhead)) {
|
||||
return false;
|
||||
}
|
||||
if (bufferinfo->blockinfo.rnode.opt != XLogBlockHeadGetCompressOpt(blockhead)) {
|
||||
return false;
|
||||
}
|
||||
if (bufferinfo->blockinfo.forknum != XLogBlockHeadGetForkNum(blockhead)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -219,6 +222,7 @@ void XLogBlockInitRedoBlockInfo(XLogBlockHead *blockhead, RedoBufferTag *blockin
|
|||
blockinfo->rnode.dbNode = XLogBlockHeadGetDbNode(blockhead);
|
||||
blockinfo->rnode.relNode = XLogBlockHeadGetRelNode(blockhead);
|
||||
blockinfo->rnode.bucketNode = XLogBlockHeadGetBucketId(blockhead);
|
||||
blockinfo->rnode.opt = XLogBlockHeadGetCompressOpt(blockhead);
|
||||
blockinfo->forknum = XLogBlockHeadGetForkNum(blockhead);
|
||||
blockinfo->blkno = XLogBlockHeadGetBlockNum(blockhead);
|
||||
blockinfo->pblk = XLogBlockHeadGetPhysicalBlock(blockhead);
|
||||
|
@ -305,7 +309,7 @@ void XLogRecSetBlockCommonState(XLogReaderState *record, XLogBlockParseEnum bloc
|
|||
blockparse->blockhead.spcNode = filenode.rnode.node.spcNode;
|
||||
blockparse->blockhead.dbNode = filenode.rnode.node.dbNode;
|
||||
blockparse->blockhead.bucketNode = filenode.rnode.node.bucketNode;
|
||||
|
||||
blockparse->blockhead.opt = filenode.rnode.node.opt;
|
||||
blockparse->blockhead.blkno = filenode.segno;
|
||||
blockparse->blockhead.forknum = filenode.forknumber;
|
||||
|
||||
|
@ -1288,6 +1292,8 @@ void XLogBlockDataCommonRedo(XLogBlockHead *blockhead, void *blockrecbody, RedoB
|
|||
break;
|
||||
case RM_UBTREE2_ID:
|
||||
UBTree2RedoDataBlock(blockhead, blockdatarec, bufferinfo);
break;
case RM_HASH_ID:
|
||||
HashRedoDataBlock(blockhead, blockdatarec, bufferinfo);
|
||||
break;
|
||||
case RM_XLOG_ID:
|
||||
xlog_redo_data_block(blockhead, blockdatarec, bufferinfo);
|
||||
|
@ -1417,7 +1423,7 @@ void XLogBlockDdlCommonRedo(XLogBlockHead *blockhead, void *blockrecbody, RedoBu
|
|||
rnode.dbNode = blockhead->dbNode;
|
||||
rnode.relNode = blockhead->relNode;
|
||||
rnode.bucketNode = blockhead->bucketNode;
|
||||
|
||||
rnode.opt = blockhead->opt;
|
||||
switch (blockddlrec->blockddltype) {
|
||||
case BLOCK_DDL_CREATE_RELNODE:
|
||||
smgr_redo_create(rnode, blockhead->forknum, blockddlrec->mainData);
|
||||
|
@ -1486,7 +1492,7 @@ void XLogBlockSegDdlDoRealAction(XLogBlockHead* blockhead, void* blockrecbody, R
|
|||
rnode.dbNode = blockhead->dbNode;
|
||||
rnode.relNode = blockhead->relNode;
|
||||
rnode.bucketNode = blockhead->bucketNode;
|
||||
|
||||
rnode.opt = blockhead->opt;
|
||||
switch (segddlrec->blockddlrec.blockddltype) {
|
||||
case BLOCK_DDL_TRUNCATE_RELNODE:
|
||||
xlog_block_segpage_redo_truncate(rnode, blockhead, segddlrec);
|
||||
|
@ -1511,7 +1517,7 @@ void XLogBlockDdlDoSmgrAction(XLogBlockHead *blockhead, void *blockrecbody, Redo
|
|||
rnode.dbNode = blockhead->dbNode;
|
||||
rnode.relNode = blockhead->relNode;
|
||||
rnode.bucketNode = blockhead->bucketNode;
|
||||
|
||||
rnode.opt = blockhead->opt;
|
||||
switch (blockddlrec->blockddltype) {
|
||||
case BLOCK_DDL_CREATE_RELNODE:
|
||||
smgr_redo_create(rnode, blockhead->forknum, blockddlrec->mainData);
|
||||
|
|
|
@ -16,7 +16,8 @@
|
|||
#include "postgres.h"
|
||||
#include "knl/knl_variable.h"
|
||||
|
||||
#include "access/hash.h"
|
||||
#include "access/rmgr.h"
|
||||
#include "access/hash_xlog.h"
|
||||
|
||||
const char* hash_type_name(uint8 subtype)
|
||||
{
|
||||
|
@ -25,5 +26,150 @@ const char* hash_type_name(uint8 subtype)
|
|||
|
||||
void hash_desc(StringInfo buf, XLogReaderState *record)
|
||||
{
|
||||
/* nothing to do */
|
||||
char *rec = XLogRecGetData(record);
|
||||
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
|
||||
|
||||
switch (info) {
|
||||
case XLOG_HASH_INIT_META_PAGE:
|
||||
{
|
||||
xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec;
|
||||
|
||||
appendStringInfo(buf, "num_tuples %g, fillfactor %d",
|
||||
xlrec->num_tuples, xlrec->ffactor);
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_INIT_BITMAP_PAGE:
|
||||
{
|
||||
xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec;
|
||||
|
||||
appendStringInfo(buf, "bmsize %d", xlrec->bmsize);
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_INSERT:
|
||||
{
|
||||
xl_hash_insert *xlrec = (xl_hash_insert *) rec;
|
||||
|
||||
appendStringInfo(buf, "off %u", xlrec->offnum);
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_ADD_OVFL_PAGE:
|
||||
{
|
||||
xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) rec;
|
||||
|
||||
appendStringInfo(buf, "bmsize %d, bmpage_found %c",
|
||||
xlrec->bmsize, (xlrec->bmpage_found) ? 'T' : 'F');
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
|
||||
{
|
||||
xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec;
|
||||
|
||||
appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c",
|
||||
xlrec->new_bucket,
|
||||
(xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F',
|
||||
(xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F');
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_SPLIT_COMPLETE:
|
||||
{
|
||||
xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec;
|
||||
|
||||
appendStringInfo(buf, "old_bucket_flag %u, new_bucket_flag %u",
|
||||
xlrec->old_bucket_flag, xlrec->new_bucket_flag);
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_MOVE_PAGE_CONTENTS:
|
||||
{
|
||||
xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec;
|
||||
|
||||
appendStringInfo(buf, "ntups %d, is_primary %c",
|
||||
xlrec->ntups,
|
||||
xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_SQUEEZE_PAGE:
|
||||
{
|
||||
xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec;
|
||||
|
||||
appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c",
|
||||
xlrec->prevblkno,
|
||||
xlrec->nextblkno,
|
||||
xlrec->ntups,
|
||||
xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_DELETE:
|
||||
{
|
||||
xl_hash_delete *xlrec = (xl_hash_delete *) rec;
|
||||
|
||||
appendStringInfo(buf, "clear_dead_marking %c, is_primary %c",
|
||||
xlrec->clear_dead_marking ? 'T' : 'F',
|
||||
xlrec->is_primary_bucket_page ? 'T' : 'F');
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_UPDATE_META_PAGE:
|
||||
{
|
||||
xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec;
|
||||
|
||||
appendStringInfo(buf, "ntuples %g",
|
||||
xlrec->ntuples);
|
||||
break;
|
||||
}
|
||||
case XLOG_HASH_VACUUM_ONE_PAGE:
|
||||
{
|
||||
xl_hash_vacuum_one_page *xlrec = (xl_hash_vacuum_one_page *) rec;
|
||||
|
||||
appendStringInfo(buf, "ntuples %d",
|
||||
xlrec->ntuples);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const char *hash_identify(uint8 info)
|
||||
{
|
||||
const char *id = NULL;
|
||||
|
||||
switch (info & ~XLR_INFO_MASK) {
|
||||
case XLOG_HASH_INIT_META_PAGE:
|
||||
id = "INIT_META_PAGE";
|
||||
break;
|
||||
case XLOG_HASH_INIT_BITMAP_PAGE:
|
||||
id = "INIT_BITMAP_PAGE";
|
||||
break;
|
||||
case XLOG_HASH_INSERT:
|
||||
id = "INSERT";
|
||||
break;
|
||||
case XLOG_HASH_ADD_OVFL_PAGE:
|
||||
id = "ADD_OVFL_PAGE";
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
|
||||
id = "SPLIT_ALLOCATE_PAGE";
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_PAGE:
|
||||
id = "SPLIT_PAGE";
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_COMPLETE:
|
||||
id = "SPLIT_COMPLETE";
|
||||
break;
|
||||
case XLOG_HASH_MOVE_PAGE_CONTENTS:
|
||||
id = "MOVE_PAGE_CONTENTS";
|
||||
break;
|
||||
case XLOG_HASH_SQUEEZE_PAGE:
|
||||
id = "SQUEEZE_PAGE";
|
||||
break;
|
||||
case XLOG_HASH_DELETE:
|
||||
id = "DELETE";
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_CLEANUP:
|
||||
id = "SPLIT_CLEANUP";
|
||||
break;
|
||||
case XLOG_HASH_UPDATE_META_PAGE:
|
||||
id = "UPDATE_META_PAGE";
|
||||
break;
|
||||
case XLOG_HASH_VACUUM_ONE_PAGE:
|
||||
id = "VACUUM_ONE_PAGE";
|
||||
}
|
||||
|
||||
return id;
|
||||
}
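/*
 * Editor's sketch of how these callbacks are typically wired together by a
 * WAL description consumer (simplified; the actual rmgr registration is not
 * shown in this hunk):
 *
 *   uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
 *   const char *id = hash_identify(info);
 *   appendStringInfo(buf, "Hash/%s: ", (id != NULL) ? id : "UNKNOWN");
 *   hash_desc(buf, record);
 */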
|
|
@ -325,7 +325,11 @@ static void dw_prepare_page(dw_batch_t *batch, uint16 page_num, uint16 page_id,
|
|||
if (t_thrd.proc->workingVersionNum < DW_SUPPORT_SINGLE_FLUSH_VERSION) {
|
||||
page_num = page_num | IS_HASH_BKT_SEGPAGE_MASK;
|
||||
}
|
||||
batch->buftag_ver = HASHBUCKET_TAG;
|
||||
if (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION) {
|
||||
batch->buftag_ver = HASHBUCKET_TAG;
|
||||
} else {
|
||||
batch->buftag_ver = PAGE_COMPRESS_TAG;
|
||||
}
|
||||
} else {
|
||||
batch->buftag_ver = ORIGIN_TAG;
|
||||
}
|
||||
|
@ -349,7 +353,7 @@ void dw_prepare_file_head(char *file_head, uint16 start, uint16 dwn, int32 dw_ve
|
|||
curr_head->head.page_id = 0;
|
||||
curr_head->head.dwn = dwn;
|
||||
curr_head->start = start;
|
||||
curr_head->buftag_version = HASHBUCKET_TAG;
|
||||
curr_head->buftag_version = PAGE_COMPRESS_TAG;
|
||||
curr_head->tail.dwn = dwn;
|
||||
curr_head->dw_version = dw_version;
|
||||
dw_calc_file_head_checksum(curr_head);
|
||||
|
@ -477,15 +481,21 @@ static void dw_recover_pages(T1 *batch, T2 *buf_tag, PageHeader data_page, BufTa
|
|||
|
||||
for (i = 0; i < GET_REL_PGAENUM(batch->page_num); i++) {
|
||||
buf_tag = &batch->buf_tag[i];
|
||||
relnode.dbNode = buf_tag->rnode.dbNode;
|
||||
relnode.spcNode = buf_tag->rnode.spcNode;
|
||||
relnode.relNode = buf_tag->rnode.relNode;
|
||||
if (tag_ver == HASHBUCKET_TAG) {
|
||||
relnode.dbNode = buf_tag->rnode.dbNode;
|
||||
relnode.spcNode = buf_tag->rnode.spcNode;
|
||||
relnode.relNode = buf_tag->rnode.relNode;
|
||||
relnode.opt = 0;
|
||||
// 2 bytes are used for bucketNode.
|
||||
relnode.bucketNode = (int2)((BufferTagSecondVer *)buf_tag)->rnode.bucketNode;
|
||||
} else if (tag_ver == PAGE_COMPRESS_TAG) {
|
||||
relnode.opt = ((BufferTag *)buf_tag)->rnode.opt;
|
||||
relnode.bucketNode = ((BufferTag *)buf_tag)->rnode.bucketNode;
|
||||
} else {
|
||||
relnode.dbNode = buf_tag->rnode.dbNode;
|
||||
relnode.spcNode = buf_tag->rnode.spcNode;
|
||||
relnode.relNode = buf_tag->rnode.relNode;
|
||||
relnode.opt = 0;
|
||||
relnode.bucketNode = InvalidBktId;
|
||||
}
|
||||
dw_page = (PageHeader)((char *)batch + (i + 1) * BLCKSZ);
|
||||
|
@ -891,7 +901,10 @@ static void dw_recover_partial_write_batch(dw_batch_file_context *cxt)
|
|||
|
||||
if (t_thrd.proc->workingVersionNum < DW_SUPPORT_SINGLE_FLUSH_VERSION) {
|
||||
bool is_hashbucket = ((curr_head->page_num & IS_HASH_BKT_SEGPAGE_MASK) != 0);
|
||||
curr_head->buftag_ver = is_hashbucket ? HASHBUCKET_TAG : ORIGIN_TAG;
|
||||
curr_head->buftag_ver = is_hashbucket ?
|
||||
(t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION ? HASHBUCKET_TAG
|
||||
: PAGE_COMPRESS_TAG)
|
||||
: ORIGIN_TAG;
|
||||
}
|
||||
|
||||
remain_pages = read_asst.buf_end - read_asst.buf_start;
|
||||
|
@ -2216,9 +2229,9 @@ int buftag_compare(const void *pa, const void *pb)
|
|||
void dw_log_recovery_page(int elevel, const char *state, BufferTag buf_tag)
|
||||
{
|
||||
ereport(elevel, (errmodule(MOD_DW),
|
||||
errmsg("[single flush] recovery, %s: buf_tag[rel %u/%u/%u blk %u fork %d]",
|
||||
errmsg("[single flush] recovery, %s: buf_tag[rel %u/%u/%u blk %u fork %d], compress: %u",
|
||||
state, buf_tag.rnode.spcNode, buf_tag.rnode.dbNode, buf_tag.rnode.relNode, buf_tag.blockNum,
|
||||
buf_tag.forkNum)));
|
||||
buf_tag.forkNum, buf_tag.rnode.opt)));
|
||||
}
|
||||
|
||||
bool dw_read_data_page(BufferTag buf_tag, SMgrRelation reln, char* data_block)
|
||||
|
|
|
@ -53,6 +53,7 @@ static inline void PRXLogRecGetBlockTag(XLogRecParseState *recordBlockState, Rel
|
|||
rnode->relNode = blockparse->blockhead.relNode;
|
||||
rnode->spcNode = blockparse->blockhead.spcNode;
|
||||
rnode->bucketNode = blockparse->blockhead.bucketNode;
|
||||
rnode->opt = blockparse->blockhead.opt;
|
||||
}
|
||||
if (blknum != NULL) {
|
||||
*blknum = blockparse->blockhead.blkno;
|
||||
|
@ -245,6 +246,7 @@ void PRTrackRelStorageDrop(XLogRecParseState *recordBlockState, HTAB *redoItemHa
|
|||
rNode.dbNode = blockparse->blockhead.dbNode;
|
||||
rNode.relNode = blockparse->blockhead.relNode;
|
||||
rNode.bucketNode = blockparse->blockhead.bucketNode;
|
||||
rNode.opt = blockparse->blockhead.opt;
|
||||
#ifdef USE_ASSERT_CHECKING
|
||||
ereport(LOG, (errmsg("PRTrackRelTruncate:(%X/%X)clear relation %u/%u/%u forknum %u record",
|
||||
(uint32)(blockparse->blockhead.end_ptr >> 32), (uint32)(blockparse->blockhead.end_ptr), rNode.spcNode,
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
#include "access/xlog_internal.h"
|
||||
#include "access/nbtree.h"
|
||||
#include "access/ubtree.h"
|
||||
#include "access/hash_xlog.h"
|
||||
#include "access/xlogreader.h"
|
||||
#include "access/gist_private.h"
|
||||
#include "access/multixact.h"
|
||||
|
@ -190,7 +191,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = {
|
|||
{ DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE },
|
||||
{ DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE },
|
||||
{ DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE },
|
||||
{ DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 },
|
||||
{ DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE },
|
||||
{ DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE },
|
||||
/* XLOG_GIST_PAGE_DELETE is not used and info isn't continuous */
|
||||
{ DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 },
|
||||
|
@ -1152,8 +1153,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time
|
|||
/* Run from the dispatcher thread. */
|
||||
static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
|
||||
{
|
||||
DispatchTxnRecord(record, expectedTLIs);
|
||||
return true;
|
||||
bool isNeedFullSync = false;
|
||||
|
||||
/* hash indexes do not support MVCC, so sync with the trxn thread when the record is a vacuum record */
if (IsHashVacuumPages(record) && g_supportHotStandby) {
GetSlotIds(record);
/*
 * Only the page workers need to replay this record; they wait for the
 * trxn thread to sync, and the trxn thread itself does not replay it.
 */
|
||||
DispatchToSpecPageWorker(record, expectedTLIs);
|
||||
} else {
|
||||
DispatchRecordWithPages(record, expectedTLIs);
|
||||
}
|
||||
|
||||
return isNeedFullSync;
|
||||
}
|
||||
|
||||
static bool DispatchBtreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
|
||||
|
|
|
@ -1369,6 +1369,7 @@ void RedoPageWorkerRedoBcmBlock(XLogRecParseState *procState)
|
|||
node.dbNode = procState->blockparse.blockhead.dbNode;
|
||||
node.relNode = procState->blockparse.blockhead.relNode;
|
||||
node.bucketNode = procState->blockparse.blockhead.bucketNode;
|
||||
node.opt = procState->blockparse.blockhead.opt;
|
||||
XLogBlockNewCuParse *newCuParse = &(procState->blockparse.extra_rec.blocknewcu);
|
||||
uint8 info = XLogBlockHeadGetInfo(&procState->blockparse.blockhead) & ~XLR_INFO_MASK;
|
||||
switch (info & XLOG_HEAP_OPMASK) {
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
#include "access/xlog_internal.h"
|
||||
#include "access/nbtree.h"
|
||||
#include "access/ubtree.h"
|
||||
#include "access/hash_xlog.h"
|
||||
#include "access/xlogreader.h"
|
||||
#include "access/gist_private.h"
|
||||
#include "access/multixact.h"
|
||||
|
@ -181,7 +182,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = {
|
|||
{ DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE },
|
||||
{ DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE },
|
||||
{ DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE},
|
||||
{ DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 },
|
||||
{ DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE },
|
||||
{ DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE },
|
||||
/* XLOG_GIST_PAGE_DELETE is not used and info isn't continuous */
|
||||
{ DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 },
|
||||
|
@ -1073,8 +1074,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time
|
|||
/* Run from the dispatcher thread. */
|
||||
static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
|
||||
{
|
||||
DispatchTxnRecord(record, expectedTLIs, recordXTime, false);
|
||||
return true;
|
||||
bool isNeedFullSync = false;
|
||||
|
||||
/* hash indexes do not support MVCC, so sync with the trxn thread when the record is a vacuum record */
if (IsHashVacuumPages(record) && g_supportHotStandby) {
GetWorkerIds(record, ANY_WORKER, true);
/*
 * Only the page workers need to replay this record; they wait for the
 * trxn thread to sync, and the trxn thread itself does not replay it.
 */
|
||||
DispatchToSpecPageWorker(record, expectedTLIs, true);
|
||||
} else {
|
||||
DispatchRecordWithPages(record, expectedTLIs, true);
|
||||
}
|
||||
|
||||
return isNeedFullSync;
|
||||
}
|
||||
|
||||
static bool DispatchBtreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "access/gin.h"
|
||||
#include "access/gist_private.h"
|
||||
#include "access/hash.h"
|
||||
#include "access/hash_xlog.h"
|
||||
#include "access/heapam.h"
|
||||
#include "access/ustore/knl_uredo.h"
|
||||
#include "access/multixact.h"
|
||||
|
|
|
@ -511,7 +511,8 @@ XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, int bucket_id, bool istoast)
|
|||
* The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are
|
||||
* reserved for use by me.
|
||||
*/
|
||||
if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE | XLR_BTREE_UPGRADE_FLAG | XLR_IS_TOAST)) != 0) {
|
||||
if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE |
|
||||
XLR_BTREE_UPGRADE_FLAG | XLR_REL_COMPRESS | XLR_IS_TOAST)) != 0) {
|
||||
ereport(PANIC, (errmsg("invalid xlog info mask %hhx", info)));
|
||||
}
|
||||
|
||||
|
@ -717,6 +718,12 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogFPWInfo fpw_
|
|||
bool samerel = false;
|
||||
bool tde = false;
|
||||
|
||||
// during an upgrade from a pre-compression version the table must still be uncompressed
|
||||
bool isCompressedTable = regbuf->rnode.opt != 0;
|
||||
if (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION) {
|
||||
Assert(!isCompressedTable);
|
||||
}
|
||||
|
||||
if (!regbuf->in_use)
|
||||
continue;
|
||||
|
||||
|
@ -864,7 +871,7 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogFPWInfo fpw_
|
|||
samerel = false;
|
||||
prev_regbuf = regbuf;
|
||||
|
||||
if (!samerel && IsSegmentFileNode(regbuf->rnode)) {
|
||||
if (!samerel && (IsSegmentFileNode(regbuf->rnode) || isCompressedTable)) {
|
||||
Assert(bkpb.id <= XLR_MAX_BLOCK_ID);
|
||||
bkpb.id += BKID_HAS_BUCKET_OR_SEGPAGE;
|
||||
}
|
||||
|
@ -880,9 +887,21 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogFPWInfo fpw_
|
|||
}
|
||||
|
||||
if (!samerel) {
|
||||
if (IsSegmentFileNode(regbuf->rnode)) {
|
||||
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNode), ®buf->rnode, remained_size);
|
||||
hashbucket_flag = true;
|
||||
if (IsSegmentFileNode(regbuf->rnode) || isCompressedTable) {
|
||||
if (IsSegmentFileNode(regbuf->rnode)) {
|
||||
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNode), ®buf->rnode, remained_size);
|
||||
hashbucket_flag = true;
|
||||
} else if (isCompressedTable) {
|
||||
if (t_thrd.proc->workingVersionNum < PAGE_COMPRESSION_VERSION) {
|
||||
Assert(!isCompressedTable);
|
||||
RelFileNodeV2 relFileNodeV2;
|
||||
RelFileNodeV2Copy(relFileNodeV2, regbuf->rnode);
|
||||
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNodeV2), ®buf->rnode, remained_size);
|
||||
} else {
|
||||
info |= XLR_REL_COMPRESS;
|
||||
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNode), ®buf->rnode, remained_size);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
XLOG_ASSEMBLE_ONE_ITEM(scratch, sizeof(RelFileNodeOld), ®buf->rnode, remained_size);
|
||||
no_hashbucket_flag = true;
|
||||
|
|
|
@ -949,6 +949,18 @@ void ResetDecoder(XLogReaderState *state)
|
|||
remaining -= sizeof(type); \
|
||||
} while (0)
|
||||
|
||||
/**
 * During an upgrade a record may still carry the old RelFileNodeV2 layout;
 * normalize it into the current RelFileNode (little-endian systems only).
 * @param relfileNode the decoded relfilenode, fixed up in place
 */
|
||||
static void CompressTableRecord(RelFileNode* relfileNode)
|
||||
{
|
||||
if (relfileNode->bucketNode <= -1 && relfileNode->opt == 0xFFFF) {
|
||||
relfileNode->opt = 0;
|
||||
}
|
||||
}
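/*
 * Editor's note on the check above (layout assumed, field widths not shown
 * in this hunk): the pre-compression RelFileNodeV2 ends in a wider
 * bucketNode, while the current RelFileNode splits that space into
 * int2 bucketNode + uint2 opt. On a little-endian machine an old record
 * written with bucketNode = -1 therefore decodes as bucketNode = -1 and
 * opt = 0xFFFF, which is what gets normalized back to opt = 0 here.
 */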
|
||||
|
||||
/*
|
||||
* Decode the previously read record.
|
||||
*
|
||||
|
@ -1067,8 +1079,11 @@ bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errorms
|
|||
if (remaining < filenodelen)
|
||||
goto shortdata_err;
|
||||
blk->rnode.bucketNode = InvalidBktId;
|
||||
blk->rnode.opt = 0;
|
||||
errno_t rc = memcpy_s(&blk->rnode, filenodelen, ptr, filenodelen);
|
||||
securec_check(rc, "\0", "\0");
|
||||
/* support decode old version of relfileNode */
|
||||
CompressTableRecord(&blk->rnode);
|
||||
ptr += filenodelen;
|
||||
remaining -= filenodelen;
|
||||
|
||||
|
|
|
@ -1331,6 +1331,7 @@ void XLogForgetDDLRedo(XLogRecParseState *redoblockstate)
|
|||
relNode.dbNode = redoblockstate->blockparse.blockhead.dbNode;
|
||||
relNode.relNode = redoblockstate->blockparse.blockhead.relNode;
|
||||
relNode.bucketNode = redoblockstate->blockparse.blockhead.bucketNode;
|
||||
relNode.opt = redoblockstate->blockparse.blockhead.opt;
|
||||
XLogTruncateRelation(relNode, redoblockstate->blockparse.blockhead.forknum,
|
||||
redoblockstate->blockparse.blockhead.blkno);
|
||||
}
|
||||
|
@ -1342,7 +1343,8 @@ void XLogDropSpaceShrink(XLogRecParseState *redoblockstate)
|
|||
.spcNode = redoblockstate->blockparse.blockhead.spcNode,
|
||||
.dbNode = redoblockstate->blockparse.blockhead.dbNode,
|
||||
.relNode = redoblockstate->blockparse.blockhead.relNode,
|
||||
.bucketNode = redoblockstate->blockparse.blockhead.bucketNode
|
||||
.bucketNode = redoblockstate->blockparse.blockhead.bucketNode,
|
||||
.opt = redoblockstate->blockparse.blockhead.opt
|
||||
};
|
||||
ForkNumber forknum = redoblockstate->blockparse.blockhead.forknum;
|
||||
BlockNumber target_size = redoblockstate->blockparse.blockhead.blkno;
|
||||
|
|
|
@ -1400,6 +1400,7 @@ static void UHeapXlogUpdateBlock(XLogBlockHead *blockhead, XLogBlockDataParse *b
|
|||
rnode.dbNode = blockhead->dbNode;
|
||||
rnode.relNode = blockhead->relNode;
|
||||
rnode.bucketNode = blockhead->bucketNode;
|
||||
rnode.opt = blockhead->opt;
|
||||
XLogRecordPageWithFreeSpace(rnode, bufferinfo->blockinfo.blkno, freespace);
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -5757,6 +5757,51 @@ bool ConditionalLockBufferForCleanup(Buffer buffer)
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* IsBufferCleanupOK - as above, but we already have the lock
|
||||
*
|
||||
* Check whether it's OK to perform cleanup on a buffer we've already
|
||||
* locked. If we observe that the pin count is 1, our exclusive lock
|
||||
* happens to be a cleanup lock, and we can proceed with anything that
|
||||
* would have been allowable had we sought a cleanup lock originally.
|
||||
*/
|
||||
bool IsBufferCleanupOK(Buffer buffer)
|
||||
{
|
||||
BufferDesc *bufHdr;
|
||||
uint32 buf_state;
|
||||
|
||||
Assert(BufferIsValid(buffer));
|
||||
|
||||
if (BufferIsLocal(buffer)) {
|
||||
/* There should be exactly one pin */
|
||||
if (u_sess->storage_cxt.LocalRefCount[-buffer - 1] != 1)
|
||||
return false;
|
||||
/* Nobody else to wait for */
|
||||
return true;
|
||||
}
|
||||
|
||||
/* There should be exactly one local pin */
|
||||
if (GetPrivateRefCount(buffer) != 1)
|
||||
return false;
|
||||
|
||||
bufHdr = GetBufferDescriptor(buffer - 1);
|
||||
|
||||
/* caller must hold exclusive lock on buffer */
|
||||
Assert(LWLockHeldByMeInMode(bufHdr->content_lock, LW_EXCLUSIVE));
|
||||
|
||||
buf_state = LockBufHdr(bufHdr);
|
||||
|
||||
Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
|
||||
if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) {
|
||||
/* pincount is OK. */
|
||||
UnlockBufHdr(bufHdr, buf_state);
|
||||
return true;
|
||||
}
|
||||
|
||||
UnlockBufHdr(bufHdr, buf_state);
|
||||
return false;
|
||||
}
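/*
 * Editor's sketch of the intended call pattern (hedged; the hash vacuum
 * caller itself is not part of this hunk): a redo or vacuum path that
 * already holds the exclusive content lock can upgrade it in place:
 *
 *   LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *   if (IsBufferCleanupOK(buf)) {
 *       // only one pin (ours) exists, so dead tuples may be removed
 *   }
 *   LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 */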
|
||||
|
||||
/*
|
||||
* Functions for buffer I/O handling
|
||||
*
|
||||
|
|
|
@ -191,6 +191,16 @@ static pthread_mutex_t VFDLockArray[NUM_VFD_PARTITIONS];
|
|||
#define VFDMappingPartitionLock(hashcode) \
|
||||
(&VFDLockArray[VFDTableHashPartition(hashcode)])
|
||||
|
||||
/*
 * Drop the reference to the page compress map attached to a vfd, if any.
 */
|
||||
#define SAFE_MUNMAP(vfdP) \
|
||||
do { \
|
||||
if ((vfdP)->with_pcmap && (vfdP)->pcmap != NULL) { \
|
||||
UnReferenceAddrFile((vfdP)); \
|
||||
(vfdP)->pcmap = NULL; \
|
||||
} \
|
||||
} while (0)
|
||||
/* --------------------
|
||||
*
|
||||
* Private Routines
|
||||
|
@ -344,11 +354,13 @@ RelFileNodeForkNum RelFileNodeForkNumFill(RelFileNode* rnode,
|
|||
filenode.rnode.node.spcNode = rnode->spcNode;
|
||||
filenode.rnode.node.dbNode = rnode->dbNode;
|
||||
filenode.rnode.node.bucketNode = rnode->bucketNode;
|
||||
filenode.rnode.node.opt = rnode->opt;
|
||||
} else {
|
||||
filenode.rnode.node.relNode = InvalidOid;
|
||||
filenode.rnode.node.spcNode = InvalidOid;
|
||||
filenode.rnode.node.dbNode = InvalidOid;
|
||||
filenode.rnode.node.bucketNode = InvalidBktId;
|
||||
filenode.rnode.node.opt = 0;
|
||||
}
|
||||
|
||||
filenode.rnode.backend = backend;
|
||||
|
@ -915,6 +927,7 @@ static void LruDelete(File file)
|
|||
|
||||
vfdP = &vfdcache[file];
|
||||
|
||||
SAFE_MUNMAP(vfdP);
|
||||
/* delete the vfd record from the LRU ring */
|
||||
Delete(file);
|
||||
|
||||
|
@ -1704,6 +1717,8 @@ void FileCloseWithThief(File file)
|
|||
{
|
||||
Vfd* vfdP = &GetVfdCache()[file];
|
||||
if (!FileIsNotOpen(file)) {
|
||||
SAFE_MUNMAP(vfdP);
|
||||
|
||||
/* remove the file from the lru ring */
|
||||
Delete(file);
|
||||
/* the thief has close the real fd */
|
||||
|
@ -1843,6 +1858,8 @@ void FileClose(File file)
|
|||
vfdP = &vfdcache[file];
|
||||
|
||||
if (!FileIsNotOpen(file)) {
|
||||
SAFE_MUNMAP(vfdP);
|
||||
|
||||
/* remove the file from the lru ring */
|
||||
Delete(file);
|
||||
|
||||
|
@ -3994,3 +4011,48 @@ static void UnlinkIfExistsFname(const char *fname, bool isdir, int elevel)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* initialize page compress memory map.
|
||||
*
|
||||
*/
|
||||
void SetupPageCompressMemoryMap(File file, RelFileNode node, const RelFileNodeForkNum& relFileNodeForkNum)
|
||||
{
|
||||
Vfd *vfdP = &GetVfdCache()[file];
|
||||
auto chunk_size = CHUNK_SIZE_LIST[GET_COMPRESS_CHUNK_SIZE(node.opt)];
|
||||
int returnCode = FileAccess(file);
|
||||
if (returnCode < 0) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to open file %s: %m", vfdP->fileName)));
|
||||
}
|
||||
RelFileNodeForkNum newOne(relFileNodeForkNum);
|
||||
newOne.forknumber = PCA_FORKNUM;
|
||||
PageCompressHeader *map = GetPageCompressHeader(vfdP, chunk_size, newOne);
|
||||
vfdP->with_pcmap = true;
|
||||
vfdP->pcmap = map;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the page compress memory map.
|
||||
*
|
||||
*/
|
||||
PageCompressHeader *GetPageCompressMemoryMap(File file, uint32 chunk_size)
|
||||
{
|
||||
int returnCode;
|
||||
Vfd *vfdP = &GetVfdCache()[file];
|
||||
PageCompressHeader *map = NULL;
|
||||
|
||||
Assert(FileIsValid(file));
|
||||
|
||||
returnCode = FileAccess(file);
|
||||
if (returnCode < 0) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("Failed to open file %s: %m", vfdP->fileName)));
|
||||
}
|
||||
|
||||
Assert(vfdP->with_pcmap);
|
||||
if (vfdP->pcmap == NULL) {
|
||||
map = GetPageCompressHeader(vfdP, chunk_size, vfdP->fileNode);
|
||||
vfdP->with_pcmap = true;
|
||||
vfdP->pcmap = map;
|
||||
}
|
||||
|
||||
return vfdP->pcmap;
|
||||
}
|
|
@ -436,3 +436,28 @@ void PageSetChecksumInplace(Page page, BlockNumber blkno)
|
|||
|
||||
((PageHeader)page)->pd_checksum = pg_checksum_page((char*)page, blkno);
|
||||
}
|
||||
|
||||
/*
|
||||
* PageGetFreeSpaceForMultipleTuples
|
||||
* Returns the size of the free (allocatable) space on a page,
|
||||
* reduced by the space needed for multiple new line pointers.
|
||||
*
|
||||
* Note: this should usually only be used on index pages. Use
|
||||
* PageGetHeapFreeSpace on heap pages.
|
||||
*/
|
||||
Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups)
|
||||
{
|
||||
int space;
|
||||
|
||||
/*
|
||||
* Use signed arithmetic here so that we behave sensibly if pd_lower >
|
||||
* pd_upper.
|
||||
*/
|
||||
space = (int)((PageHeader)page)->pd_upper - (int)((PageHeader)page)->pd_lower;
|
||||
|
||||
if (space < (int)(ntups * sizeof(ItemIdData)))
|
||||
return 0;
|
||||
space -= ntups * sizeof(ItemIdData);
|
||||
|
||||
return (Size) space;
|
||||
}
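/*
 * Editor's example: with pd_upper - pd_lower = 100 and ntups = 3, the line
 * pointers need 3 * sizeof(ItemIdData) = 12 bytes, leaving 88 bytes of the
 * page for the tuples themselves; if fewer than 12 bytes were free the
 * function would return 0.
 */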
|
||||
|
|
|
@ -105,6 +105,10 @@ static void formatBitmap(const unsigned char *start, int len, char bit1, char bi
|
|||
void PrepForRead(char *path, int64 blocknum, char *relation_type, char *outputFilename, RelFileNode *relnode,
|
||||
bool parse_page)
|
||||
{
|
||||
if (CalculateCompressMainForkSize(path, true) != 0) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("compressed table file is not allowed here."))));
|
||||
}
|
||||
char *pathFirstpart = (char *)palloc(MAXFNAMELEN * sizeof(char));
|
||||
errno_t rc = memset_s(pathFirstpart, MAXFNAMELEN, 0, MAXFNAMELEN);
|
||||
securec_check(rc, "\0", "\0");
|
||||
|
@ -133,7 +137,7 @@ void PrepForRead(char *path, int64 blocknum, char *relation_type, char *outputFi
|
|||
(errmsg("The tablespace oid is 0. Please check the first parameter path. "
|
||||
"If you are not sure about the table path, please check pg_relation_filepath."))));
|
||||
RelFileNodeRelCopy(*relnode, relfilenode.rnode.node);
|
||||
|
||||
relnode->opt = 0;
|
||||
char *pagesuffix = "page";
|
||||
char *xlogsuffix = "xlog";
|
||||
rc = snprintf_s(outputFilename + (int)strlen(outputFilename), MAXFILENAME, MAXFILENAME - 1, "%s/%u_%u_%u_%d.%s",
|
||||
|
@ -496,6 +500,7 @@ static void CheckSegment(RelFileNode *relnode, ForkNumber forkNum)
|
|||
relnodeHead->dbNode = relnode->dbNode;
|
||||
relnodeHead->relNode = 1;
|
||||
relnodeHead->bucketNode = relnode->bucketNode;
|
||||
relnodeHead->opt = relnode->opt;
|
||||
Buffer buffer_temp = ReadBufferFast(spc, *relnodeHead, forkNum, relnode->relNode, RBM_NORMAL);
|
||||
if (!BufferIsValid(buffer_temp))
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("Segment Head is invalid %u/%u/%u %d %u",
|
||||
|
|
|
@ -123,6 +123,7 @@ Datum gs_read_block_from_remote(PG_FUNCTION_ARGS)
|
|||
key.relfilenode.dbNode = dbNode;
|
||||
key.relfilenode.relNode = relNode;
|
||||
key.relfilenode.bucketNode = bucketNode;
|
||||
key.relfilenode.opt = 0;
|
||||
key.forknum = forkNum;
|
||||
key.blocknum = blockNum;
|
||||
|
||||
|
@ -141,6 +142,48 @@ Datum gs_read_block_from_remote(PG_FUNCTION_ARGS)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read block from buffer from primary, returning it as bytea
|
||||
*/
|
||||
Datum gs_read_block_from_remote_compress(PG_FUNCTION_ARGS)
|
||||
{
|
||||
RepairBlockKey key;
|
||||
uint32 blockSize;
|
||||
uint64 lsn;
|
||||
int timeout = 0;
|
||||
bool isForCU = false;
|
||||
bytea* result = NULL;
|
||||
|
||||
if (GetUserId() != BOOTSTRAP_SUPERUSERID) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be initial account to read files"))));
|
||||
}
|
||||
/* handle optional arguments */
|
||||
key.relfilenode.spcNode = PG_GETARG_UINT32(0);
|
||||
key.relfilenode.dbNode = PG_GETARG_UINT32(1);
|
||||
key.relfilenode.relNode = PG_GETARG_UINT32(2);
|
||||
key.relfilenode.bucketNode = PG_GETARG_INT16(3);
|
||||
key.relfilenode.opt = PG_GETARG_UINT16(4);
|
||||
key.forknum = PG_GETARG_INT32(5);
|
||||
key.blocknum = (uint64)PG_GETARG_TRANSACTIONID(6);
|
||||
blockSize = PG_GETARG_UINT32(7);
|
||||
lsn = (uint64)PG_GETARG_TRANSACTIONID(8);
|
||||
isForCU = PG_GETARG_BOOL(9);
|
||||
timeout = PG_GETARG_INT32(10);
|
||||
/* get block from local buffer */
|
||||
if (isForCU) {
|
||||
/* if request to read CU block, we use forkNum column to replace colid. */
|
||||
(void)StandbyReadCUforPrimary(key, key.blocknum, blockSize, lsn, timeout, &result);
|
||||
} else {
|
||||
(void)StandbyReadPageforPrimary(key, blockSize, lsn, &result, timeout, NULL);
|
||||
}
|
||||
|
||||
if (NULL != result) {
|
||||
PG_RETURN_BYTEA_P(result);
|
||||
} else {
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* @Description: read cu for primary
|
||||
* @IN spcnode: tablespace id
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
#include "access/xlog.h"
|
||||
#include "storage/smgr/fd.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/page_compression.h"
|
||||
#include "storage/pmsignal.h"
|
||||
#include "storage/checksum.h"
|
||||
#ifdef ENABLE_MOT
|
||||
|
@ -116,6 +117,9 @@ static void send_xlog_header(const char *linkpath);
|
|||
static void save_xlogloc(const char *xloglocation);
|
||||
static XLogRecPtr GetMinArchiveSlotLSN(void);
|
||||
|
||||
/* compressed Function */
|
||||
static void SendCompressedFile(char* readFileName, int basePathLen, struct stat& statbuf, bool missingOk, int64* size);
|
||||
|
||||
/*
|
||||
* save xlog location
|
||||
*/
|
||||
|
@ -1259,6 +1263,35 @@ static bool IsDCFPath(const char *pathname)
|
|||
return false;
|
||||
}
|
||||
|
||||
#define SEND_DIR_ADD_SIZE(size, statbuf) ((size) = (size) + (((statbuf).st_size + 511) & ~511) + BUILD_PATH_LEN)
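/*
 * Editor's note: the rounding in SEND_DIR_ADD_SIZE is the usual tar
 * 512-byte block alignment, e.g.
 *   st_size = 1    -> (1   + 511) & ~511 = 512
 *   st_size = 512  -> (512 + 511) & ~511 = 512
 *   st_size = 513  -> (513 + 511) & ~511 = 1024
 * plus BUILD_PATH_LEN to account for the tar member header.
 */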
|
||||
|
||||
/**
 * Send a plain file or a compressed file.
 * @param sizeOnly if true, only account for the size; nothing is sent
 * @param pathbuf file path
 * @param pathBufLen length of pathbuf
 * @param basepathlen length of the base-path prefix stripped from the tar member name
 * @param statbuf stat of the file
 */
|
||||
static void SendRealFile(bool sizeOnly, char* pathbuf, size_t pathBufLen, int basepathlen, struct stat* statbuf)
|
||||
{
|
||||
int64 size = 0;
|
||||
// we must ensure the page integrity when in IncrementalCheckpoint
|
||||
if (!sizeOnly && g_instance.attr.attr_storage.enableIncrementalCheckpoint &&
|
||||
IsCompressedFile(pathbuf, strlen(pathbuf)) != COMPRESSED_TYPE_UNKNOWN) {
|
||||
SendCompressedFile(pathbuf, basepathlen, (*statbuf), true, &size);
|
||||
} else {
|
||||
bool sent = false;
|
||||
if (!sizeOnly) {
|
||||
sent = sendFile(pathbuf, pathbuf + basepathlen + 1, statbuf, true);
|
||||
}
|
||||
if (sent || sizeOnly) {
|
||||
/* Add size, rounded up to 512byte block */
|
||||
SEND_DIR_ADD_SIZE(size, (*statbuf));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Include all files from the given directory in the output tar stream. If
|
||||
* 'sizeonly' is true, we just calculate a total length and return it, without
|
||||
|
@ -1557,15 +1590,7 @@ static int64 sendDir(const char *path, int basepathlen, bool sizeonly, List *tab
|
|||
if (!skip_this_dir)
|
||||
size += sendDir(pathbuf, basepathlen, sizeonly, tablespaces, sendtblspclinks);
|
||||
} else if (S_ISREG(statbuf.st_mode)) {
|
||||
bool sent = false;
|
||||
|
||||
if (!sizeonly)
|
||||
sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf, true);
|
||||
|
||||
if (sent || sizeonly) {
|
||||
/* Add size, rounded up to 512byte block */
|
||||
size = size + ((statbuf.st_size + 511) & ~511) + BUILD_PATH_LEN;
|
||||
}
|
||||
SendRealFile(sizeonly, pathbuf, strlen(pathbuf), basepathlen, &statbuf);
|
||||
} else
|
||||
ereport(WARNING, (errmsg("skipping special file \"%s\"", pathbuf)));
|
||||
}
|
||||
|
@ -1692,6 +1717,15 @@ bool is_row_data_file(const char *path, int *segNo, UndoFileType *undoFileType)
|
|||
int nmatch;
|
||||
char *fname = NULL;
|
||||
|
||||
/* Skip compressed page files */
|
||||
size_t pathLen = strlen(path);
|
||||
if (pathLen >= 4) {
|
||||
const char* suffix = path + pathLen - 4;
|
||||
if (strncmp(suffix, "_pca", 4) == 0 || strncmp(suffix, "_pcd", 4) == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if ((fname = strstr((char *)path, "pg_tblspc/")) != NULL) {
|
||||
nmatch = sscanf_s(fname, "pg_tblspc/%u/%*[^/]/%u/%s", &spcNode, &dbNode, buf, sizeof(buf));
|
||||
if (nmatch == 3) {
|
||||
|
@ -1809,6 +1843,245 @@ static void SendTableSpaceForBackup(basebackup_options* opt, List* tablespaces,
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* initialize buf_block if it has not been allocated yet; enlarge PqSendBuffer if necessary
|
||||
*/
|
||||
static void SendFilePreInit(void)
|
||||
{
|
||||
if (t_thrd.basebackup_cxt.buf_block == NULL) {
|
||||
MemoryContext oldcxt = MemoryContextSwitchTo(THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE));
|
||||
t_thrd.basebackup_cxt.buf_block = (char *)palloc0(TAR_SEND_SIZE);
|
||||
MemoryContextSwitchTo(oldcxt);
|
||||
}
|
||||
|
||||
/*
|
||||
* repalloc to `MaxBuildAllocSize' once, to avoid many small incremental repallocs
* inside `pq_putmessage_noblock' and the resulting poor performance.
|
||||
*/
|
||||
if (INT2SIZET(t_thrd.libpq_cxt.PqSendBufferSize) < MaxBuildAllocSize) {
|
||||
t_thrd.libpq_cxt.PqSendBuffer = (char *)repalloc(t_thrd.libpq_cxt.PqSendBuffer, MaxBuildAllocSize);
|
||||
t_thrd.libpq_cxt.PqSendBufferSize = MaxBuildAllocSize;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Check the file size and open it.
 * @param readFileName file to open
 * @param statbuf stat of the file
 * @param missingOk if true, suppress the error when the file is not found
 * @return NULL if the file exceeds MAX_TAR_MEMBER_FILELEN or cannot be found
 */
|
||||
static FILE *SizeCheckAndAllocate(char *readFileName, const struct stat &statbuf, bool missingOk)
|
||||
{
|
||||
/*
|
||||
* Some compilers will throw a warning knowing this test can never be true
|
||||
* because pgoff_t can't exceed the compared maximum on their platform.
|
||||
*/
|
||||
if (statbuf.st_size > MAX_TAR_MEMBER_FILELEN) {
|
||||
ereport(WARNING, (errcode(ERRCODE_NAME_TOO_LONG),
|
||||
errmsg("archive member \"%s\" too large for tar format", readFileName)));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
FILE *fp = AllocateFile(readFileName, "rb");
|
||||
if (fp == NULL) {
|
||||
if (errno == ENOENT && missingOk)
|
||||
return NULL;
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", readFileName)));
|
||||
}
|
||||
return fp;
|
||||
|
||||
}
|
||||
|
||||
static void TransferPcaFile(const char *readFileName, int basePathLen, const struct stat &statbuf,
|
||||
PageCompressHeader *transfer,
|
||||
size_t len)
|
||||
{
|
||||
const char *tarfilename = readFileName + basePathLen + 1;
|
||||
_tarWriteHeader(tarfilename, NULL, (struct stat*)(&statbuf));
|
||||
char *data = (char *) transfer;
|
||||
size_t lenBuffer = len;
|
||||
while (lenBuffer > 0) {
|
||||
size_t transferLen = Min(TAR_SEND_SIZE, lenBuffer);
|
||||
if (pq_putmessage_noblock('d', data, transferLen)) {
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("base backup could not send data, aborting backup")));
|
||||
}
|
||||
data = data + transferLen;
|
||||
lenBuffer -= transferLen;
|
||||
}
|
||||
size_t pad = ((len + 511) & ~511) - len;
|
||||
if (pad > 0) {
|
||||
securec_check(memset_s(t_thrd.basebackup_cxt.buf_block, pad, 0, pad), "", "");
|
||||
(void) pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, pad);
|
||||
}
|
||||
}
|
||||
|
||||
static void FileStat(char* path, struct stat* fileStat)
|
||||
{
|
||||
if (stat(path, fileStat) != 0) {
|
||||
if (errno != ENOENT) {
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file or directory \"%s\": %m", path)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void SendCompressedFile(char* readFileName, int basePathLen, struct stat& statbuf, bool missingOk, int64* size)
|
||||
{
|
||||
char* tarfilename = readFileName + basePathLen + 1;
|
||||
SendFilePreInit();
|
||||
FILE* fp = SizeCheckAndAllocate(readFileName, statbuf, missingOk);
|
||||
if (fp == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t readFileNameLen = strlen(readFileName);
|
||||
/* only handle _pcd files here; the matching _pca file is rebuilt and sent via TransferPcaFile */
|
||||
if (readFileNameLen < 4 || strncmp(readFileName + readFileNameLen - 4, "_pca", 4) == 0 ||
|
||||
strncmp(readFileName + readFileNameLen - 4, "_pcd", 4) != 0) {
|
||||
FreeFile(fp);
|
||||
return;
|
||||
}
|
||||
|
||||
char tablePath[MAXPGPATH] = {0};
|
||||
securec_check_c(memcpy_s(tablePath, MAXPGPATH, readFileName, readFileNameLen - 4), "", "");
|
||||
int segmentNo = 0;
|
||||
UndoFileType undoFileType = UNDO_INVALID;
|
||||
if (!is_row_data_file(tablePath, &segmentNo, &undoFileType)) {
|
||||
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("%s is not a relation file.", tablePath)));
|
||||
}
|
||||
|
||||
char pcaFilePath[MAXPGPATH];
|
||||
securec_check_c(memcpy_s(pcaFilePath, MAXPGPATH, readFileName, readFileNameLen), "", "");
|
||||
pcaFilePath[readFileNameLen - 1] = 'a';
|
||||
|
||||
FILE* pcaFile = AllocateFile(pcaFilePath, "rb");
|
||||
if (pcaFile == NULL) {
|
||||
if (errno == ENOENT && missingOk) {
|
||||
FreeFile(fp);
|
||||
return;
|
||||
}
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", pcaFilePath)));
|
||||
}
|
||||
|
||||
uint16 chunkSize = ReadChunkSize(pcaFile, pcaFilePath, MAXPGPATH);
|
||||
|
||||
struct stat pcaStruct;
|
||||
FileStat((char*)pcaFilePath, &pcaStruct);
|
||||
|
||||
size_t pcaFileLen = SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunkSize);
|
||||
PageCompressHeader* map = pc_mmap_real_size(fileno(pcaFile), pcaFileLen, true);
|
||||
if (map == MAP_FAILED) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
|
||||
errmsg("Failed to mmap page compression address file %s: %m", pcaFilePath)));
|
||||
}
|
||||
|
||||
PageCompressHeader* transfer = (PageCompressHeader*)palloc0(pcaFileLen);
|
||||
/* decompressed page buffer, avoid frequent allocation */
|
||||
BlockNumber blockNum = 0;
|
||||
size_t chunkIndex = 1;
|
||||
off_t totalLen = 0;
|
||||
off_t sendLen = 0;
|
||||
/* send the pkg header containing msg like file size */
|
||||
BlockNumber totalBlockNum = (BlockNumber)pg_atomic_read_u32(&map->nblocks);
|
||||
|
||||
/* Some chunks may have been allocated but not yet used; reserve extra
 * chunks (currently 0) to avoid errors when a compressed block grows later. */
|
||||
auto reservedChunks = 0;
|
||||
securec_check(memcpy_s(transfer, pcaFileLen, map, pcaFileLen), "", "");
|
||||
decltype(statbuf.st_size) realSize = (map->allocated_chunks + reservedChunks) * chunkSize;
|
||||
statbuf.st_size = statbuf.st_size >= realSize ? statbuf.st_size : realSize;
|
||||
_tarWriteHeader(tarfilename, NULL, (struct stat*)(&statbuf));
|
||||
bool* onlyExtend = (bool*)palloc0(totalBlockNum * sizeof(bool));
|
||||
|
||||
/* allocated in advance to prevent repeated allocated */
|
||||
char pageBuffer[BLCKSZ];
|
||||
ReadBlockChunksStruct rbStruct{map, pageBuffer, BLCKSZ, fp, segmentNo, readFileName};
|
||||
for (blockNum = 0; blockNum < totalBlockNum; blockNum++) {
|
||||
PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(transfer, chunkSize, blockNum);
|
||||
/* skip blocks that were only extended; they have no chunks yet, so their size is 0 */
|
||||
if (addr->nchunks == 0) {
|
||||
onlyExtend[blockNum] = true;
|
||||
continue;
|
||||
}
|
||||
/* read block to t_thrd.basebackup_cxt.buf_block */
|
||||
size_t bufferSize = TAR_SEND_SIZE - sendLen;
|
||||
size_t len = ReadAllChunkOfBlock(t_thrd.basebackup_cxt.buf_block + sendLen, bufferSize, blockNum, rbStruct);
|
||||
/* merge Blocks */
|
||||
sendLen += len;
|
||||
if (totalLen + (off_t)len > statbuf.st_size) {
|
||||
ReleaseMap(map, readFileName);
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("some blocks in %s had been changed. Retry backup please. PostBlocks:%u, currentReadBlocks "
|
||||
":%u, transferSize: %lu. totalLen: %lu, len: %lu",
|
||||
readFileName,
|
||||
totalBlockNum,
|
||||
blockNum,
|
||||
statbuf.st_size,
|
||||
totalLen,
|
||||
len)));
|
||||
}
|
||||
if (sendLen > TAR_SEND_SIZE - BLCKSZ) {
|
||||
if (pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, sendLen)) {
|
||||
ReleaseMap(map, readFileName);
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("base backup could not send data, aborting backup")));
|
||||
}
|
||||
sendLen = 0;
|
||||
}
|
||||
uint8 nchunks = len / chunkSize;
|
||||
addr->nchunks = addr->allocated_chunks = nchunks;
|
||||
for (size_t i = 0; i < nchunks; i++) {
|
||||
addr->chunknos[i] = chunkIndex++;
|
||||
}
|
||||
addr->checksum = AddrChecksum32(blockNum, addr, chunkSize);
|
||||
totalLen += len;
|
||||
}
|
||||
ReleaseMap(map, readFileName);
|
||||
|
||||
if (sendLen != 0) {
|
||||
if (pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, sendLen)) {
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("base backup could not send data, aborting backup")));
|
||||
}
|
||||
}
|
||||
|
||||
/* If the file was truncated while we were sending it, pad it with zeros */
|
||||
if (totalLen < statbuf.st_size) {
|
||||
securec_check(memset_s(t_thrd.basebackup_cxt.buf_block, TAR_SEND_SIZE, 0, TAR_SEND_SIZE), "", "");
|
||||
while (totalLen < statbuf.st_size) {
|
||||
size_t cnt = Min(TAR_SEND_SIZE, statbuf.st_size - totalLen);
|
||||
(void)pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, cnt);
|
||||
totalLen += cnt;
|
||||
}
|
||||
}
|
||||
|
||||
size_t pad = ((totalLen + 511) & ~511) - totalLen;
|
||||
if (pad > 0) {
|
||||
securec_check(memset_s(t_thrd.basebackup_cxt.buf_block, pad, 0, pad), "", "");
|
||||
(void)pq_putmessage_noblock('d', t_thrd.basebackup_cxt.buf_block, pad);
|
||||
}
|
||||
SEND_DIR_ADD_SIZE(*size, statbuf);
|
||||
|
||||
// assign chunk numbers for the pages that were only extended (no data written yet)
|
||||
for (size_t blockNum = 0; blockNum < totalBlockNum; ++blockNum) {
|
||||
if (onlyExtend[blockNum]) {
|
||||
PageCompressAddr* addr = GET_PAGE_COMPRESS_ADDR(transfer, chunkSize, blockNum);
|
||||
for (size_t i = 0; i < addr->allocated_chunks; i++) {
|
||||
addr->chunknos[i] = chunkIndex++;
|
||||
}
|
||||
}
|
||||
}
|
||||
transfer->nblocks = transfer->last_synced_nblocks = blockNum;
|
||||
transfer->last_synced_allocated_chunks = transfer->allocated_chunks = chunkIndex;
|
||||
TransferPcaFile(pcaFilePath, basePathLen, pcaStruct, transfer, pcaFileLen);
|
||||
|
||||
SEND_DIR_ADD_SIZE(*size, pcaStruct);
|
||||
FreeFile(pcaFile);
|
||||
FreeFile(fp);
|
||||
pfree(transfer);
|
||||
pfree(onlyExtend);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given the member, write the TAR header & send the file.
|
||||
*
|
||||
|
@ -1832,39 +2105,11 @@ static bool sendFile(char *readfilename, char *tarfilename, struct stat *statbuf
|
|||
const int MAX_RETRY_LIMIT = 60;
|
||||
int retryCnt = 0;
|
||||
UndoFileType undoFileType = UNDO_INVALID;
|
||||
|
||||
if (t_thrd.basebackup_cxt.buf_block == NULL) {
|
||||
MemoryContext oldcxt = NULL;
|
||||
|
||||
oldcxt = MemoryContextSwitchTo(THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE));
|
||||
t_thrd.basebackup_cxt.buf_block = (char *)palloc0(TAR_SEND_SIZE);
|
||||
MemoryContextSwitchTo(oldcxt);
|
||||
}
|
||||
|
||||
/*
|
||||
* repalloc to `MaxBuildAllocSize' in one time, to avoid many small step repalloc in `pq_putmessage_noblock'
|
||||
* and low performance.
|
||||
*/
|
||||
if (INT2SIZET(t_thrd.libpq_cxt.PqSendBufferSize) < MaxBuildAllocSize) {
|
||||
t_thrd.libpq_cxt.PqSendBuffer = (char *)repalloc(t_thrd.libpq_cxt.PqSendBuffer, MaxBuildAllocSize);
|
||||
t_thrd.libpq_cxt.PqSendBufferSize = MaxBuildAllocSize;
|
||||
}
|
||||
|
||||
/*
|
||||
* Some compilers will throw a warning knowing this test can never be true
|
||||
* because pgoff_t can't exceed the compared maximum on their platform.
|
||||
*/
|
||||
if (statbuf->st_size > MAX_FILE_SIZE_LIMIT) {
|
||||
ereport(WARNING, (errcode(ERRCODE_NAME_TOO_LONG),
|
||||
errmsg("archive member \"%s\" too large for tar format", tarfilename)));
|
||||
return false;
|
||||
}
|
||||
|
||||
fp = AllocateFile(readfilename, "rb");
|
||||
|
||||
SendFilePreInit();
|
||||
fp = SizeCheckAndAllocate(readfilename, *statbuf, missing_ok);
|
||||
if (fp == NULL) {
|
||||
if (errno == ENOENT && missing_ok)
|
||||
return false;
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", readfilename)));
|
||||
return false;
|
||||
}
|
||||
|
||||
isNeedCheck = is_row_data_file(readfilename, &segNo, &undoFileType);
|
||||
|
|
|
@ -13,6 +13,7 @@ set(TGT_smgr_INC
|
|||
${EVENT_INCLUDE_PATH}
|
||||
${PROTOBUF_INCLUDE_PATH}
|
||||
${ZLIB_INCLUDE_PATH}
|
||||
${ZSTD_INCLUDE_PATH}
|
||||
)
|
||||
|
||||
set(smgr_DEF_OPTIONS ${MACRO_OPTIONS})
|
||||
|
|
|
@ -9,7 +9,7 @@ ifneq "$(MAKECMDGOALS)" "clean"
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
OBJS = md.o smgr.o smgrtype.o knl_uundofile.o segstore.o
|
||||
OBJS = md.o smgr.o smgrtype.o knl_uundofile.o segstore.o page_compression.o mmap_shared.o
|
||||
|
||||
SUBDIRS = segment
|
||||
|
||||
|
|
File diff suppressed because it is too large
|
@ -0,0 +1,149 @@
|
|||
/*
|
||||
* Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
*
|
||||
* openGauss is licensed under Mulan PSL v2.
|
||||
* You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
* You may obtain a copy of Mulan PSL v2 at:
|
||||
*
|
||||
* http://license.coscl.org.cn/MulanPSL2
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
||||
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
||||
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
||||
* See the Mulan PSL v2 for more details.
|
||||
* ---------------------------------------------------------------------------------------
|
||||
*
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/gausskernel/storage/smgr/mmap_shared.cpp
|
||||
*
|
||||
* ---------------------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
#include "miscadmin.h"
|
||||
#include "catalog/pg_type.h"
|
||||
#include "utils/datum.h"
|
||||
#include "utils/relcache.h"
|
||||
|
||||
#include "utils/memutils.h"
|
||||
#include "utils/memprot.h"
|
||||
|
||||
#include "storage/page_compression.h"
|
||||
#include "executor/executor.h"
|
||||
#include "storage/vfd.h"
|
||||
|
||||
struct MmapEntry {
|
||||
RelFileNodeForkNum relFileNodeForkNum;
|
||||
/*
 * the following fields are set at runtime
 */
|
||||
size_t reference = 0;
|
||||
PageCompressHeader *pcmap = NULL;
|
||||
};
|
||||
|
||||
constexpr size_t LOCK_ARRAY_SIZE = 1024;
|
||||
static pthread_mutex_t mmapLockArray[LOCK_ARRAY_SIZE];
|
||||
|
||||
static inline uint32 MmapTableHashCode(const RelFileNodeForkNum &relFileNodeForkNum)
|
||||
{
|
||||
return tag_hash((void *)&relFileNodeForkNum, sizeof(RelFileNodeForkNum));
|
||||
}
|
||||
|
||||
static inline pthread_mutex_t *MmapPartitionLock(size_t hashCode)
|
||||
{
|
||||
return &mmapLockArray[hashCode % LOCK_ARRAY_SIZE];
|
||||
}
|
||||
|
||||
static inline PageCompressHeader *MmapSharedMapFile(Vfd *vfdP, uint16 chunkSize, uint2 opt, bool readonly)
|
||||
{
|
||||
auto map = pc_mmap_real_size(vfdP->fd, SIZE_OF_PAGE_COMPRESS_ADDR_FILE(chunkSize), false);
|
||||
if (map->chunk_size == 0 || map->algorithm == 0) {
|
||||
map->chunk_size = chunkSize;
|
||||
map->algorithm = GET_COMPRESS_ALGORITHM(opt);
|
||||
if (pc_msync(map) != 0) {
|
||||
ereport(data_sync_elevel(ERROR),
|
||||
(errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", vfdP->fileName)));
|
||||
}
|
||||
}
|
||||
if (RecoveryInProgress() && !map->sync) {
|
||||
CheckAndRepairCompressAddress(map, chunkSize, map->algorithm, vfdP->fileName);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
void RealInitialMMapLockArray()
|
||||
{
|
||||
for (size_t i = 0; i < LOCK_ARRAY_SIZE; ++i) {
|
||||
pthread_mutex_init(&mmapLockArray[i], NULL);
|
||||
}
|
||||
|
||||
HASHCTL ctl;
|
||||
/* hash accessed by database file id */
|
||||
errno_t rc = memset_s(&ctl, sizeof(ctl), 0, sizeof(ctl));
|
||||
securec_check(rc, "", "");
|
||||
|
||||
ctl.keysize = sizeof(RelFileNodeForkNum);
|
||||
ctl.entrysize = sizeof(MmapEntry);
|
||||
ctl.hash = tag_hash;
|
||||
ctl.num_partitions = LOCK_ARRAY_SIZE;
|
||||
const size_t initLen = 256;
|
||||
g_instance.mmapCache = HeapMemInitHash(
|
||||
"mmap hash", initLen,
|
||||
(Max(g_instance.attr.attr_common.max_files_per_process, t_thrd.storage_cxt.max_userdatafiles)) / 2, &ctl,
|
||||
HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
|
||||
}
|
||||
|
||||
PageCompressHeader *GetPageCompressHeader(void *vfd, uint16 chunkSize, const RelFileNodeForkNum &relFileNodeForkNum)
|
||||
{
|
||||
Vfd *currentVfd = (Vfd *)vfd;
|
||||
uint32 hashCode = MmapTableHashCode(relFileNodeForkNum);
|
||||
AutoMutexLock mmapLock(MmapPartitionLock(hashCode));
|
||||
|
||||
mmapLock.lock();
|
||||
bool find = false;
|
||||
MmapEntry *mmapEntry = (MmapEntry *)hash_search_with_hash_value(g_instance.mmapCache, (void *)&relFileNodeForkNum,
|
||||
hashCode, HASH_ENTER, &find);
|
||||
if (!find) {
|
||||
mmapEntry->pcmap = NULL;
|
||||
mmapEntry->reference = 0;
|
||||
}
|
||||
if (mmapEntry->pcmap == NULL) {
|
||||
mmapEntry->pcmap = MmapSharedMapFile(currentVfd, chunkSize, relFileNodeForkNum.rnode.node.opt, false);
|
||||
}
|
||||
++mmapEntry->reference;
|
||||
mmapLock.unLock();
|
||||
return mmapEntry->pcmap;
|
||||
}
|
||||
|
||||
void UnReferenceAddrFile(void *vfd)
|
||||
{
|
||||
Vfd *currentVfd = (Vfd *)vfd;
|
||||
RelFileNodeForkNum relFileNodeForkNum = currentVfd->fileNode;
|
||||
uint32 hashCode = MmapTableHashCode(relFileNodeForkNum);
|
||||
AutoMutexLock mmapLock(MmapPartitionLock(hashCode));
|
||||
mmapLock.lock();
|
||||
|
||||
MmapEntry *mmapEntry = (MmapEntry *)hash_search_with_hash_value(g_instance.mmapCache, (void *)&relFileNodeForkNum,
|
||||
hashCode, HASH_FIND, NULL);
|
||||
if (mmapEntry == NULL) {
|
||||
ereport(ERROR, (errcode_for_file_access(),
|
||||
errmsg("UnReferenceAddrFile failed! mmap not found, filePath: %s", currentVfd->fileName)));
|
||||
}
|
||||
--mmapEntry->reference;
|
||||
if (mmapEntry->reference == 0) {
|
||||
if (pc_munmap(mmapEntry->pcmap) != 0) {
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(), errmsg("could not munmap file \"%s\": %m", currentVfd->fileName)));
|
||||
}
|
||||
if (hash_search_with_hash_value(g_instance.mmapCache, (void *)&relFileNodeForkNum, hashCode, HASH_REMOVE,
|
||||
NULL) == NULL) {
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("UnReferenceAddrFile failed! remove hash key failed, filePath: %s", currentVfd->fileName)));
|
||||
}
|
||||
} else if (mmapEntry->reference < 0) {
|
||||
ereport(FATAL, (errcode_for_file_access(), errmsg("could not munmap file \"%s\": %m", currentVfd->fileName)));
|
||||
}
|
||||
mmapLock.unLock();
|
||||
}
|
|
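GetPageCompressHeader and UnReferenceAddrFile above share one mmap of the compressed address file per relation fork, guarded by a 1024-way partitioned lock array and a reference count. The sketch below reproduces that caching pattern with only standard-library containers; the string key and the doMap/doUnmap callbacks are stand-ins for the Vfd-based openGauss calls:

#include <cstddef>
#include <mutex>
#include <string>
#include <unordered_map>

struct MapEntry {
    void  *mapping = nullptr;   /* stands in for PageCompressHeader* */
    size_t refcount = 0;
};

constexpr size_t kPartitions = 1024;
static std::mutex gLocks[kPartitions];
static std::unordered_map<std::string, MapEntry> gCache[kPartitions];

static size_t Partition(const std::string &key)
{
    return std::hash<std::string>{}(key) % kPartitions;
}

/* Map the file on first use, then hand out the shared mapping and bump the refcount. */
void *AcquireMapping(const std::string &key, void *(*doMap)(const std::string &))
{
    size_t p = Partition(key);
    std::lock_guard<std::mutex> guard(gLocks[p]);
    MapEntry &entry = gCache[p][key];
    if (entry.mapping == nullptr)
        entry.mapping = doMap(key);
    ++entry.refcount;
    return entry.mapping;
}

/* Drop one reference; unmap and forget the entry when the last user is gone. */
void ReleaseMapping(const std::string &key, void (*doUnmap)(void *))
{
    size_t p = Partition(key);
    std::lock_guard<std::mutex> guard(gLocks[p]);
    auto it = gCache[p].find(key);
    if (it == gCache[p].end())
        return;                                       /* nothing to release */
    if (--it->second.refcount == 0) {
        doUnmap(it->second.mapping);
        gCache[p].erase(it);
    }
}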
@ -0,0 +1,472 @@
|
|||
/*
|
||||
* Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
|
||||
* Copyright (c) 2020, PostgreSQL Global Development Group
|
||||
*
|
||||
* openGauss is licensed under Mulan PSL v2.
|
||||
* You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
* You may obtain a copy of Mulan PSL v2 at:
|
||||
*
|
||||
* http://license.coscl.org.cn/MulanPSL2
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
||||
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
||||
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
||||
* See the Mulan PSL v2 for more details.
|
||||
* -------------------------------------------------------------------------
|
||||
*
|
||||
* page_compression.cpp
|
||||
* Routines for page compression
|
||||
*
|
||||
* There are two implementations at the moment: zstd, and the Postgres
|
||||
* pg_lzcompress(). zstd support requires that the server was compiled
|
||||
* with --with-zstd.
|
||||
* IDENTIFICATION
|
||||
* ./src/gausskernel/storage/smgr/page_compression.cpp
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
#include "miscadmin.h"
|
||||
#include "catalog/pg_type.h"
|
||||
#include "utils/datum.h"
|
||||
#include "utils/relcache.h"
|
||||
|
||||
#include "utils/timestamp.h"
|
||||
#include "storage/checksum.h"
|
||||
#include "storage/page_compression.h"
|
||||
#include "storage/page_compression_impl.h"
|
||||
|
||||
static void CheckHeaderOfCompressAddr(PageCompressHeader* pcMap, uint16 chunk_size, uint8 algorithm, const char* path)
|
||||
{
|
||||
if (pcMap->chunk_size != chunk_size || pcMap->algorithm != algorithm) {
|
||||
if (u_sess->attr.attr_security.zero_damaged_pages) {
|
||||
ereport(WARNING,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("invalid chunk_size %u or algorithm %u in head of compress relation address file \"%s\", "
|
||||
"and reinitialized it.",
|
||||
pcMap->chunk_size,
|
||||
pcMap->algorithm,
|
||||
path)));
|
||||
|
||||
pcMap->algorithm = algorithm;
|
||||
pg_atomic_write_u32(&pcMap->nblocks, RELSEG_SIZE);
|
||||
pg_atomic_write_u32(&pcMap->allocated_chunks, 0);
|
||||
pg_atomic_write_u32(&pcMap->last_synced_allocated_chunks, 0);
|
||||
pcMap->chunk_size = chunk_size;
|
||||
} else {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("invalid chunk_size %u or algorithm %u in head of compress relation address file \"%s\"",
|
||||
pcMap->chunk_size,
|
||||
pcMap->algorithm,
|
||||
path)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CheckAndRepairCompressAddress(PageCompressHeader *pcMap, uint16 chunk_size, uint8 algorithm, const char *path)
|
||||
{
|
||||
TimestampTz lastRecoveryTime = pcMap->last_recovery_start_time;
|
||||
TimestampTz pgStartTime = t_thrd.time_cxt.pg_start_time;
|
||||
errno_t rc;
|
||||
/* if the relation had been checked in this startup, skip */
|
||||
if (lastRecoveryTime == pgStartTime) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* check head of compress address file */
|
||||
CheckHeaderOfCompressAddr(pcMap, chunk_size, algorithm, path);
|
||||
|
||||
uint32 nblocks = pg_atomic_read_u32(&pcMap->nblocks);
|
||||
uint32 allocated_chunks = pg_atomic_read_u32(&pcMap->allocated_chunks);
|
||||
BlockNumber *global_chunknos = (BlockNumber *)palloc0(MAX_CHUNK_NUMBER(chunk_size) * sizeof(BlockNumber));
|
||||
|
||||
BlockNumber max_blocknum = (BlockNumber)-1;
|
||||
BlockNumber max_nonzero_blocknum = (BlockNumber)-1;
|
||||
BlockNumber max_allocated_chunkno = (pc_chunk_number_t)0;
|
||||
|
||||
/* check the compress address of every page */
|
||||
for (BlockNumber blocknum = 0; blocknum < (BlockNumber)RELSEG_SIZE; ++blocknum) {
|
||||
PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum);
|
||||
if (pcAddr->checksum != AddrChecksum32(blocknum, pcAddr, chunk_size)) {
|
||||
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid checksum %u of block %u in file \"%s\"",
|
||||
pcAddr->checksum, blocknum, path)));
|
||||
pcAddr->allocated_chunks = pcAddr->nchunks = 0;
|
||||
for (int i = 0; i < BLCKSZ / chunk_size; ++i) {
|
||||
pcAddr->chunknos[i] = 0;
|
||||
}
|
||||
pcAddr->checksum = 0;
|
||||
}
|
||||
/*
|
||||
* skip when found first zero filled block after nblocks
|
||||
* if(blocknum >= (BlockNumber)nblocks && pcAddr->allocated_chunks == 0)
|
||||
* break;
|
||||
*/
|
||||
|
||||
/* check allocated_chunks for one page */
|
||||
if (pcAddr->allocated_chunks > BLCKSZ / chunk_size) {
|
||||
if (u_sess->attr.attr_security.zero_damaged_pages) {
|
||||
rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0,
|
||||
SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
|
||||
securec_check_c(rc, "\0", "\0");
|
||||
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("invalid allocated_chunks %u of block %u in file \"%s\", and zero this block",
|
||||
pcAddr->allocated_chunks, blocknum, path)));
|
||||
continue;
|
||||
} else {
|
||||
pfree(global_chunknos);
|
||||
ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("invalid allocated_chunks %u of block %u in file \"%s\"",
|
||||
pcAddr->allocated_chunks, blocknum, path)));
|
||||
}
|
||||
}
|
||||
|
||||
/* check chunknos for one page */
|
||||
for (int i = 0; i < pcAddr->allocated_chunks; ++i) {
|
||||
/* check for invalid chunkno */
|
||||
if (pcAddr->chunknos[i] == 0 || pcAddr->chunknos[i] > MAX_CHUNK_NUMBER(chunk_size)) {
|
||||
if (u_sess->attr.attr_security.zero_damaged_pages) {
|
||||
rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0,
|
||||
SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
|
||||
securec_check_c(rc, "\0", "\0");
|
||||
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("invalid chunk number %u of block %u in file \"%s\", and zero this block",
|
||||
pcAddr->chunknos[i], blocknum, path)));
|
||||
continue;
|
||||
} else {
|
||||
pfree(global_chunknos);
|
||||
ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("invalid chunk number %u of block %u in file \"%s\"", pcAddr->chunknos[i],
|
||||
blocknum, path)));
|
||||
}
|
||||
}
|
||||
|
||||
/* check for duplicate chunkno */
|
||||
if (global_chunknos[pcAddr->chunknos[i] - 1] != 0) {
|
||||
if (u_sess->attr.attr_security.zero_damaged_pages) {
|
||||
rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0,
|
||||
SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
|
||||
securec_check_c(rc, "\0", "\0");
|
||||
ereport(
|
||||
WARNING,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg(
|
||||
"chunk number %u of block %u duplicate with block %u in file \"%s\", and zero this block",
|
||||
pcAddr->chunknos[i], blocknum, global_chunknos[pcAddr->chunknos[i] - 1], path)));
|
||||
continue;
|
||||
} else {
|
||||
pfree(global_chunknos);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("chunk number %u of block %u duplicate with block %u in file \"%s\"",
|
||||
pcAddr->chunknos[i], blocknum, global_chunknos[pcAddr->chunknos[i] - 1], path)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* clean chunknos beyond allocated_chunks for one page */
|
||||
for (int i = pcAddr->allocated_chunks; i < BLCKSZ / chunk_size; ++i) {
|
||||
if (pcAddr->chunknos[i] != 0) {
|
||||
pcAddr->chunknos[i] = 0;
|
||||
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("clear chunk number %u beyond allocated_chunks %u of block %u in file \"%s\"",
|
||||
pcAddr->chunknos[i], pcAddr->allocated_chunks, blocknum, path)));
|
||||
}
|
||||
}
|
||||
|
||||
/* check nchunks for one page */
|
||||
if (pcAddr->nchunks > pcAddr->allocated_chunks) {
|
||||
if (u_sess->attr.attr_security.zero_damaged_pages) {
|
||||
rc = memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0,
|
||||
SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
|
||||
securec_check_c(rc, "\0", "\0");
|
||||
ereport(
|
||||
WARNING,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("nchunks %u exceeds allocated_chunks %u of block %u in file \"%s\", and zero this block",
|
||||
pcAddr->nchunks, pcAddr->allocated_chunks, blocknum, path)));
|
||||
continue;
|
||||
} else {
|
||||
pfree(global_chunknos);
|
||||
ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("nchunks %u exceeds allocated_chunks %u of block %u in file \"%s\"",
|
||||
pcAddr->nchunks, pcAddr->allocated_chunks, blocknum, path)));
|
||||
}
|
||||
}
|
||||
|
||||
max_blocknum = blocknum;
|
||||
if (pcAddr->nchunks > 0) {
|
||||
max_nonzero_blocknum = blocknum;
|
||||
}
|
||||
|
||||
for (int i = 0; i < pcAddr->allocated_chunks; ++i) {
|
||||
global_chunknos[pcAddr->chunknos[i] - 1] = blocknum + 1;
|
||||
if (pcAddr->chunknos[i] > max_allocated_chunkno) {
|
||||
max_allocated_chunkno = pcAddr->chunknos[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int unused_chunks = 0;
|
||||
/* check for holes in allocated chunks */
|
||||
for (BlockNumber i = 0; i < max_allocated_chunkno; i++) {
|
||||
if (global_chunknos[i] == 0) {
|
||||
unused_chunks++;
|
||||
}
|
||||
}
|
||||
|
||||
if (unused_chunks > 0) {
|
||||
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("there are %u chunks of total allocated chunks %u can not be use in file \"%s\"",
|
||||
unused_chunks, max_allocated_chunkno, path),
|
||||
errhint("You may need to run VACUMM FULL to optimize space allocation.")));
|
||||
}
|
||||
|
||||
/* update nblocks in head of compressed file */
|
||||
if (nblocks < max_nonzero_blocknum + 1) {
|
||||
pg_atomic_write_u32(&pcMap->nblocks, max_nonzero_blocknum + 1);
|
||||
pg_atomic_write_u32(&pcMap->last_synced_nblocks, max_nonzero_blocknum + 1);
|
||||
|
||||
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("update nblocks head of compressed file \"%s\". old: %u, new: %u", path, nblocks,
|
||||
max_nonzero_blocknum + 1)));
|
||||
}
|
||||
|
||||
/* update allocated_chunks in head of compress file */
|
||||
if (allocated_chunks != max_allocated_chunkno) {
|
||||
pg_atomic_write_u32(&pcMap->allocated_chunks, max_allocated_chunkno);
|
||||
pg_atomic_write_u32(&pcMap->last_synced_allocated_chunks, max_allocated_chunkno);
|
||||
|
||||
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("update allocated_chunks in head of compressed file \"%s\". old: %u, new: %u", path,
|
||||
allocated_chunks, max_allocated_chunkno)));
|
||||
}
|
||||
|
||||
/* clean compress address after max_blocknum + 1 */
|
||||
for (BlockNumber blocknum = max_blocknum + 1; blocknum < (BlockNumber)RELSEG_SIZE; blocknum++) {
|
||||
char buf[128];
|
||||
char *p = NULL;
|
||||
PageCompressAddr *pcAddr = GET_PAGE_COMPRESS_ADDR(pcMap, chunk_size, blocknum);
|
||||
|
||||
/* skip zero block */
|
||||
if (pcAddr->allocated_chunks == 0 && pcAddr->nchunks == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* clean compress address and output content of the address */
|
||||
rc = memset_s(buf, sizeof(buf), 0, sizeof(buf));
|
||||
securec_check_c(rc, "\0", "\0");
|
||||
p = buf;
|
||||
|
||||
for (int i = 0; i < pcAddr->allocated_chunks; i++) {
|
||||
if (pcAddr->chunknos[i]) {
|
||||
const char *formatStr = i == 0 ? "%u" : ",%u";
|
||||
errno_t rc =
|
||||
snprintf_s(p, sizeof(buf) - (p - buf), sizeof(buf) - (p - buf) - 1, formatStr, pcAddr->chunknos[i]);
|
||||
securec_check_ss(rc, "\0", "\0");
|
||||
p += strlen(p);
|
||||
}
|
||||
}
|
||||
|
||||
rc =
|
||||
memset_s((void *)pcAddr, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size), 0, SIZE_OF_PAGE_COMPRESS_ADDR(chunk_size));
|
||||
securec_check_c(rc, "\0", "\0");
|
||||
ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("clean unused compress address of block %u in file \"%s\", old "
|
||||
"allocated_chunks/nchunks/chunknos: %u/%u/{%s}",
|
||||
blocknum, path, pcAddr->allocated_chunks, pcAddr->nchunks, buf)));
|
||||
}
|
||||
|
||||
pfree(global_chunknos);
|
||||
|
||||
if (pc_msync(pcMap) != 0) {
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not msync file \"%s\": %m", path)));
|
||||
}
|
||||
|
||||
pcMap->last_recovery_start_time = pgStartTime;
|
||||
}
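The loop above detects cross-block corruption by recording an owner for every chunk number and flagging any chunk that is claimed twice. A stripped-down version of that ownership check, which only reports conflicts instead of zeroing the damaged address entry:

#include <cstdint>
#include <cstdio>
#include <vector>

/* Each block lists the chunk numbers (1-based) it owns. */
using BlockChunks = std::vector<uint32_t>;

/* Returns true when every chunk number is valid and claimed by at most one block. */
bool ChunksAreUnique(const std::vector<BlockChunks> &blocks, uint32_t maxChunkNo)
{
    std::vector<uint32_t> owner(maxChunkNo + 1, 0);   /* 0 means unclaimed */
    bool ok = true;
    for (uint32_t blk = 0; blk < blocks.size(); ++blk) {
        for (uint32_t chunk : blocks[blk]) {
            if (chunk == 0 || chunk > maxChunkNo) {
                std::printf("block %u: invalid chunk %u\n", blk, chunk);
                ok = false;
            } else if (owner[chunk] != 0) {
                std::printf("block %u: chunk %u already owned by block %u\n",
                            blk, chunk, owner[chunk] - 1);
                ok = false;
            } else {
                owner[chunk] = blk + 1;               /* store the owner 1-based */
            }
        }
    }
    return ok;
}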
|
||||
|
||||
int64 CalculateMainForkSize(char* pathName, RelFileNode* rnode, ForkNumber forkNumber)
|
||||
{
|
||||
Assert(IS_COMPRESSED_RNODE((*rnode), forkNumber));
|
||||
Assert(rnode->bucketNode == -1);
|
||||
return CalculateCompressMainForkSize(pathName);
|
||||
}
|
||||
|
||||
void CopyCompressedPath(char dst[MAXPGPATH], const char* pathName, CompressedFileType compressFileType)
|
||||
{
|
||||
int rc;
|
||||
if (compressFileType == COMPRESSED_TABLE_PCA_FILE) {
|
||||
rc = snprintf_s(dst, MAXPGPATH, MAXPGPATH - 1, PCA_SUFFIX, pathName);
|
||||
} else {
|
||||
rc = snprintf_s(dst, MAXPGPATH, MAXPGPATH - 1, PCD_SUFFIX, pathName);
|
||||
}
|
||||
securec_check_ss(rc, "\0", "\0");
|
||||
}
|
||||
|
||||
int64 CalculateCompressMainForkSize(char* pathName, bool suppressedENOENT)
|
||||
{
|
||||
int64 totalsize = 0;
|
||||
|
||||
char pcFilePath[MAXPGPATH];
|
||||
CopyCompressedPath(pcFilePath, pathName, COMPRESSED_TABLE_PCA_FILE);
|
||||
totalsize += CalculateFileSize(pcFilePath, MAXPGPATH, suppressedENOENT);
|
||||
|
||||
CopyCompressedPath(pcFilePath, pathName, COMPRESSED_TABLE_PCD_FILE);
|
||||
totalsize += CalculateFileSize(pcFilePath, MAXPGPATH, suppressedENOENT);
|
||||
|
||||
return totalsize;
|
||||
}
|
||||
|
||||
uint16 ReadChunkSize(FILE* pcaFile, char* pcaFilePath, size_t len)
|
||||
{
|
||||
uint16 chunkSize;
|
||||
if (fseeko(pcaFile, (off_t)offsetof(PageCompressHeader, chunk_size), SEEK_SET) != 0) {
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(), errmsg("could not seek in file \"%s\": \"%lu\": %m", pcaFilePath, len)));
|
||||
}
|
||||
|
||||
if (fread(&chunkSize, sizeof(chunkSize), 1, pcaFile) <= 0) {
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(), errmsg("could not open file \"%s\": \"%lu\": %m", pcaFilePath, len)));
|
||||
}
|
||||
return chunkSize;
|
||||
}
|
||||
|
||||
int64 CalculateFileSize(char* pathName, size_t size, bool suppressedENOENT)
|
||||
{
|
||||
struct stat structstat;
|
||||
if (stat(pathName, &structstat)) {
|
||||
if (errno == ENOENT) {
|
||||
if (suppressedENOENT) {
|
||||
return 0;
|
||||
}
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not FIND file \"%s\": %m", pathName)));
|
||||
} else {
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", pathName)));
|
||||
}
|
||||
}
|
||||
return structstat.st_size;
|
||||
}
|
||||
|
||||
uint1 ConvertChunkSize(uint32 compressedChunkSize, bool *success)
|
||||
{
|
||||
uint1 chunkSize = INDEX_OF_HALF_BLCKSZ;
|
||||
switch (compressedChunkSize) {
|
||||
case BLCKSZ / 2:
|
||||
chunkSize = INDEX_OF_HALF_BLCKSZ;
|
||||
break;
|
||||
case BLCKSZ / 4:
|
||||
chunkSize = INDEX_OF_QUARTER_BLCKSZ;
|
||||
break;
|
||||
case BLCKSZ / 8:
|
||||
chunkSize = INDEX_OF_EIGHTH_BRICK_BLCKSZ;
|
||||
break;
|
||||
case BLCKSZ / 16:
|
||||
chunkSize = INDEX_OF_SIXTEENTHS_BLCKSZ;
|
||||
break;
|
||||
default:
|
||||
*success = false;
|
||||
return chunkSize;
|
||||
}
|
||||
*success = true;
|
||||
return chunkSize;
|
||||
}
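ConvertChunkSize stores the chunk size as a small index covering one half, quarter, eighth or sixteenth of BLCKSZ. Assuming the INDEX_OF_* constants count up from the half-block entry (their values are not visible in this diff, so that is an assumption), the mapping and its inverse reduce to a shift:

#include <cstdint>

constexpr uint32_t kBlockSize = 8192;        /* BLCKSZ on a default build */

/* index 0 -> BLCKSZ/2, 1 -> BLCKSZ/4, 2 -> BLCKSZ/8, 3 -> BLCKSZ/16 */
constexpr uint32_t IndexToChunkSize(uint8_t idx)
{
    return kBlockSize >> (idx + 1);
}

/* Inverse lookup; mirrors the switch in ConvertChunkSize above. */
inline bool ChunkSizeToIndex(uint32_t chunkSize, uint8_t *idx)
{
    for (uint8_t i = 0; i < 4; ++i) {
        if (IndexToChunkSize(i) == chunkSize) {
            *idx = i;
            return true;
        }
    }
    return false;                            /* unsupported chunk size */
}

static_assert(IndexToChunkSize(0) == 4096, "half of an 8K block");
static_assert(IndexToChunkSize(3) == 512, "a sixteenth of an 8K block");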
|
||||
|
||||
constexpr int MAX_RETRY_LIMIT = 60;
|
||||
constexpr long RETRY_SLEEP_TIME = 1000000L;
|
||||
|
||||
size_t ReadAllChunkOfBlock(char *dst, size_t destLen, BlockNumber blockNumber, ReadBlockChunksStruct& rbStruct)
|
||||
{
|
||||
PageCompressHeader* header = rbStruct.header;
|
||||
if (blockNumber >= header->nblocks) {
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("blocknum \"%u\" exceeds max block number", blockNumber)));
|
||||
}
|
||||
char* pageBuffer = rbStruct.pageBuffer;
|
||||
const char* fileName = rbStruct.fileName;
|
||||
decltype(PageCompressHeader::chunk_size) chunkSize = header->chunk_size;
|
||||
decltype(ReadBlockChunksStruct::segmentNo) segmentNo = rbStruct.segmentNo;
|
||||
PageCompressAddr* currentAddr = GET_PAGE_COMPRESS_ADDR(header, chunkSize, blockNumber);
|
||||
|
||||
size_t tryCount = 0;
|
||||
/* for empty chunks write */
|
||||
uint8 allocatedChunks;
|
||||
uint8 nchunks;
|
||||
do {
|
||||
allocatedChunks = currentAddr->allocated_chunks;
|
||||
nchunks = currentAddr->nchunks;
|
||||
for (uint8 i = 0; i < nchunks; ++i) {
|
||||
off_t seekPos = (off_t)OFFSET_OF_PAGE_COMPRESS_CHUNK(chunkSize, currentAddr->chunknos[i]);
|
||||
uint8 start = i;
|
||||
while (i < nchunks - 1 && currentAddr->chunknos[i + 1] == currentAddr->chunknos[i] + 1) {
|
||||
i++;
|
||||
}
|
||||
if (fseeko(rbStruct.fp, seekPos, SEEK_SET) != 0) {
|
||||
ReleaseMap(header, fileName);
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in file \"%s\": %m", fileName)));
|
||||
}
|
||||
size_t readAmount = chunkSize * (i - start + 1);
|
||||
if (fread(dst + start * chunkSize, 1, readAmount, rbStruct.fp) != readAmount && ferror(rbStruct.fp)) {
|
||||
ReleaseMap(header, fileName);
|
||||
ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fileName)));
|
||||
}
|
||||
}
|
||||
if (nchunks == 0) {
|
||||
break;
|
||||
}
|
||||
if (DecompressPage(dst, pageBuffer, header->algorithm) == BLCKSZ) {
|
||||
PageHeader phdr = PageHeader(pageBuffer);
|
||||
BlockNumber blkNo = blockNumber + segmentNo * ((BlockNumber)RELSEG_SIZE);
|
||||
if (PageIsNew(phdr) || pg_checksum_page(pageBuffer, blkNo) == phdr->pd_checksum) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (tryCount < MAX_RETRY_LIMIT) {
|
||||
++tryCount;
|
||||
pg_usleep(RETRY_SLEEP_TIME);
|
||||
} else {
|
||||
ReleaseMap(header, fileName);
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("base backup cheksum or Decompressed blockno %u failed in file \"%s\", aborting backup. "
|
||||
"nchunks: %u, allocatedChunks: %u, segno: %d.",
|
||||
blockNumber,
|
||||
fileName,
|
||||
nchunks,
|
||||
allocatedChunks,
|
||||
segmentNo)));
|
||||
}
|
||||
} while (true);
|
||||
if (allocatedChunks > nchunks) {
|
||||
auto currentWriteSize = nchunks * chunkSize;
|
||||
securec_check(
|
||||
memset_s(dst + currentWriteSize, destLen - currentWriteSize, 0, (allocatedChunks - nchunks) * chunkSize),
|
||||
"",
|
||||
"");
|
||||
}
|
||||
return allocatedChunks * chunkSize;
|
||||
}
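Inside the read loop, ReadAllChunkOfBlock merges physically adjacent chunk numbers so that a single fseeko/fread covers the whole run. That coalescing step can be pulled out on its own, as in this sketch:

#include <cstdint>
#include <utility>
#include <vector>

/* Collapse consecutive chunk numbers into (firstChunk, count) runs,
 * e.g. {7, 8, 9, 12} becomes {(7, 3), (12, 1)}. */
std::vector<std::pair<uint32_t, uint32_t>> CoalesceChunks(const std::vector<uint32_t> &chunks)
{
    std::vector<std::pair<uint32_t, uint32_t>> runs;
    for (size_t i = 0; i < chunks.size(); ++i) {
        size_t start = i;
        while (i + 1 < chunks.size() && chunks[i + 1] == chunks[i] + 1)
            ++i;
        runs.emplace_back(chunks[start], static_cast<uint32_t>(i - start + 1));
    }
    return runs;
}

Each run then costs one seek and one read of count * chunkSize bytes instead of one per chunk.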
|
||||
|
||||
CompressedFileType IsCompressedFile(char *fileName, size_t fileNameLen)
|
||||
{
|
||||
size_t suffixLen = 4;
|
||||
if (fileNameLen >= suffixLen) {
|
||||
const char *suffix = fileName + fileNameLen - suffixLen;
|
||||
if (strncmp(suffix, "_pca", suffixLen) == 0) {
|
||||
return COMPRESSED_TABLE_PCA_FILE;
|
||||
} else if (strncmp(suffix, "_pcd", suffixLen) == 0) {
|
||||
return COMPRESSED_TABLE_PCD_FILE;
|
||||
}
|
||||
}
|
||||
return COMPRESSED_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
void ReleaseMap(PageCompressHeader* map, const char* fileName)
|
||||
{
|
||||
if (map != NULL && pc_munmap(map) != 0) {
|
||||
ereport(WARNING, (errcode_for_file_access(), errmsg("could not munmap file \"%s\": %m", fileName)));
|
||||
}
|
||||
}
|
|
@ -31,7 +31,8 @@
|
|||
|
||||
typedef enum BufTagVer {
|
||||
ORIGIN_TAG = 0,
|
||||
HASHBUCKET_TAG
|
||||
HASHBUCKET_TAG,
|
||||
PAGE_COMPRESS_TAG
|
||||
} BufTagVer;
|
||||
|
||||
typedef struct st_dw_batch {
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
* header file for openGauss hash access method implementation
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* src/include/access/hash.h
|
||||
|
@ -33,36 +33,59 @@
|
|||
*/
|
||||
typedef uint32 Bucket;
|
||||
|
||||
#define INVALID_BUCKET_NUM (0xFFFFFFFF)
|
||||
#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B) + 1) - 1] : 0)) + 1)
|
||||
#define InvalidBucket ((Bucket) 0xFFFFFFFF)
|
||||
#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B) + 1) - 1] : 0)) + 1)
|
||||
|
||||
/*
|
||||
* Special space for hash index pages.
|
||||
*
|
||||
* hasho_flag tells us which type of page we're looking at. For
|
||||
* example, knowing overflow pages from bucket pages is necessary
|
||||
* information when you're deleting tuples from a page. If all the
|
||||
* tuples are deleted from an overflow page, the overflow is made
|
||||
* available to other buckets by calling _hash_freeovflpage(). If all
|
||||
* the tuples are deleted from a bucket page, no additional action is
|
||||
* necessary.
|
||||
* hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
|
||||
* Additional bits in the flag word are used for more transient purposes.
|
||||
*
|
||||
* To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
|
||||
* However, we ensure that each used page type has a distinct bit so that
|
||||
* we can OR together page types for uses such as the allowable-page-types
|
||||
* argument of _hash_checkpage().
|
||||
*/
|
||||
#define LH_UNUSED_PAGE (0)
|
||||
#define LH_OVERFLOW_PAGE (1 << 0)
|
||||
#define LH_BUCKET_PAGE (1 << 1)
|
||||
#define LH_BITMAP_PAGE (1 << 2)
|
||||
#define LH_META_PAGE (1 << 3)
|
||||
#define LH_BUCKET_BEING_POPULATED (1 << 4)
|
||||
#define LH_BUCKET_BEING_SPLIT (1 << 5)
|
||||
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6)
|
||||
#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
|
||||
|
||||
#define LH_PAGE_TYPE \
|
||||
(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
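As the comment above explains, the type bits now share hasho_flag with transient state bits, so a page-type test has to mask with LH_PAGE_TYPE before comparing. A compact illustration using the same flag values:

#include <cstdint>

constexpr uint16_t kOverflowPage     = 1 << 0;   /* LH_OVERFLOW_PAGE */
constexpr uint16_t kBucketPage       = 1 << 1;   /* LH_BUCKET_PAGE */
constexpr uint16_t kBitmapPage       = 1 << 2;   /* LH_BITMAP_PAGE */
constexpr uint16_t kMetaPage         = 1 << 3;   /* LH_META_PAGE */
constexpr uint16_t kBucketBeingSplit = 1 << 5;   /* LH_BUCKET_BEING_SPLIT */
constexpr uint16_t kPageTypeMask = kOverflowPage | kBucketPage | kBitmapPage | kMetaPage;

constexpr bool IsBucketPage(uint16_t hashoFlag)
{
    /* mask first: a bucket page that is being split still compares equal */
    return (hashoFlag & kPageTypeMask) == kBucketPage;
}

static_assert(IsBucketPage(kBucketPage | kBucketBeingSplit), "split in progress is still a bucket page");
static_assert(!IsBucketPage(kOverflowPage), "an overflow page is not a bucket page");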
|
||||
|
||||
/*
|
||||
* In an overflow page, hasho_prevblkno stores the block number of the previous
|
||||
* page in the bucket chain; in a bucket page, hasho_prevblkno stores the
|
||||
* hashm_maxbucket value as of the last time the bucket was last split, or
|
||||
* else as of the time the bucket was created. The latter convention is used
|
||||
* to determine whether a cached copy of the metapage is too stale to be used
|
||||
* without needing to lock or pin the metapage.
|
||||
*
|
||||
* hasho_nextblkno is always the block number of the next page in the
|
||||
* bucket chain, or InvalidBlockNumber if there are no more such pages.
|
||||
*/
|
||||
typedef struct HashPageOpaqueData {
|
||||
BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
|
||||
BlockNumber hasho_nextblkno; /* next ovfl blkno */
|
||||
Bucket hasho_bucket; /* bucket number this pg belongs to */
|
||||
uint16 hasho_flag; /* page type code, see above */
|
||||
uint16 hasho_page_id; /* for identification of hash indexes */
|
||||
BlockNumber hasho_prevblkno; /* see above */
|
||||
BlockNumber hasho_nextblkno; /* see above */
|
||||
Bucket hasho_bucket; /* bucket number this pg belongs to */
|
||||
uint16 hasho_flag; /* page type code + flag bits, see above */
|
||||
uint16 hasho_page_id; /* for identification of hash indexes */
|
||||
} HashPageOpaqueData;
|
||||
|
||||
typedef HashPageOpaqueData* HashPageOpaque;
|
||||
|
||||
#define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
|
||||
#define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
|
||||
#define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
|
||||
#define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
|
||||
|
||||
/*
|
||||
* The page ID is for the convenience of pg_filedump and similar utilities,
|
||||
* which otherwise would have a hard time telling pages of different index
|
||||
|
@ -71,26 +94,19 @@ typedef HashPageOpaqueData* HashPageOpaque;
|
|||
*/
|
||||
#define HASHO_PAGE_ID 0xFF80
|
||||
|
||||
typedef struct HashScanPosItem {
|
||||
ItemPointerData heapTid; /* TID of referenced heap item */
|
||||
OffsetNumber indexOffset; /* index item's location within page */
|
||||
} HashScanPosItem;
|
||||
|
||||
|
||||
/*
|
||||
* HashScanOpaqueData is private state for a hash index scan.
|
||||
* HashScanOpaqueData is private state for a hash index scan.
|
||||
*/
|
||||
typedef struct HashScanOpaqueData {
|
||||
/* Hash value of the scan key, ie, the hash key we seek */
|
||||
uint32 hashso_sk_hash;
|
||||
|
||||
/*
|
||||
* By definition, a hash scan should be examining only one bucket. We
|
||||
* record the bucket number here as soon as it is known.
|
||||
*/
|
||||
Bucket hashso_bucket;
|
||||
bool hashso_bucket_valid;
|
||||
|
||||
/*
|
||||
* If we have a share lock on the bucket, we record it here. When
|
||||
* hashso_bucket_blkno is zero, we have no such lock.
|
||||
*/
|
||||
BlockNumber hashso_bucket_blkno;
|
||||
|
||||
/*
|
||||
* We also want to remember which buffer we're currently examining in the
|
||||
* scan. We keep the buffer pinned (but not locked) across hashgettuple
|
||||
|
@ -99,11 +115,33 @@ typedef struct HashScanOpaqueData {
|
|||
*/
|
||||
Buffer hashso_curbuf;
|
||||
|
||||
/* remember the buffer associated with primary bucket */
|
||||
Buffer hashso_bucket_buf;
|
||||
|
||||
/*
|
||||
* remember the buffer associated with primary bucket page of bucket being
|
||||
* split. it is required during the scan of the bucket which is being
|
||||
* populated during split operation.
|
||||
*/
|
||||
Buffer hashso_split_bucket_buf;
|
||||
|
||||
/* Current position of the scan, as an index TID */
|
||||
ItemPointerData hashso_curpos;
|
||||
|
||||
/* Current position of the scan, as a heap TID */
|
||||
ItemPointerData hashso_heappos;
|
||||
|
||||
/* Whether scan starts on bucket being populated due to split */
|
||||
bool hashso_buc_populated;
|
||||
|
||||
/*
|
||||
* Whether scanning bucket being split? The value of this parameter is
|
||||
* referred only when hashso_buc_populated is true.
|
||||
*/
|
||||
bool hashso_buc_split;
|
||||
/* info about killed items if any (killedItems is NULL if never used) */
|
||||
HashScanPosItem *killedItems; /* tids and offset numbers of killed items */
|
||||
int numKilled; /* number of currently stored items */
|
||||
} HashScanOpaqueData;
|
||||
|
||||
typedef HashScanOpaqueData* HashScanOpaque;
|
||||
|
@ -115,7 +153,7 @@ typedef HashScanOpaqueData* HashScanOpaque;
|
|||
#define HASH_METAPAGE 0 /* metapage is always block 0 */
|
||||
|
||||
#define HASH_MAGIC 0x6440640
|
||||
#define HASH_VERSION 2 /* 2 signifies only hash key value is stored */
|
||||
#define HASH_VERSION 4
|
||||
|
||||
/*
|
||||
* Spares[] holds the number of overflow pages currently allocated at or
|
||||
|
@ -128,17 +166,32 @@ typedef HashScanOpaqueData* HashScanOpaque;
|
|||
*
|
||||
* ovflpages that have been recycled for reuse can be found by looking at
|
||||
* bitmaps that are stored within ovflpages dedicated for the purpose.
|
||||
* The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the
|
||||
* The blknos of these bitmap pages are kept in mapp[]; nmaps is the
|
||||
* number of currently existing bitmaps.
|
||||
*
|
||||
* The limitation on the size of spares[] comes from the fact that there's
|
||||
* no point in having more than 2^32 buckets with only uint32 hashcodes.
|
||||
* (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
|
||||
* adjusted in such a way to accommodate multi phased allocation of buckets
|
||||
* after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
|
||||
*
|
||||
* There is no particular upper limit on the size of mapp[], other than
|
||||
* needing to fit into the metapage. (With 8K block size, 128 bitmaps
|
||||
* limit us to 64 Gb of overflow space...)
|
||||
* needing to fit into the metapage. (With 8K block size, 1024 bitmaps
|
||||
* limit us to 256 GB of overflow space...)
|
||||
*/
|
||||
#define HASH_MAX_SPLITPOINTS 32
|
||||
#define HASH_MAX_BITMAPS 128
|
||||
#define HASH_MAX_BITMAPS 1024
|
||||
|
||||
#define HASH_SPLITPOINT_PHASE_BITS 2
|
||||
#define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS)
|
||||
#define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1)
|
||||
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10
|
||||
|
||||
/* defines the max number of splitpoint phases a hash index can have */
|
||||
#define HASH_MAX_SPLITPOINT_GROUP 32
|
||||
#define HASH_MAX_SPLITPOINTS \
|
||||
(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
|
||||
HASH_SPLITPOINT_PHASES_PER_GRP) + \
|
||||
HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
|
||||
|
||||
typedef struct HashMetaPageData {
|
||||
uint32 hashm_magic; /* magic no. for hash tables */
|
||||
|
@ -280,37 +333,40 @@ extern Datum hash_new_uint32(uint32 k);
|
|||
/* private routines */
|
||||
|
||||
/* hashinsert.c */
|
||||
extern void _hash_doinsert(Relation rel, IndexTuple itup);
|
||||
extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
|
||||
extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup);
|
||||
extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
|
||||
OffsetNumber *itup_offsets, uint16 nitups);
|
||||
|
||||
/* hashovfl.c */
|
||||
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
|
||||
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy);
|
||||
extern void _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, ForkNumber forkNum);
|
||||
extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy);
|
||||
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
|
||||
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
|
||||
Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
|
||||
Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
|
||||
extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
|
||||
extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy);
|
||||
|
||||
/* hashpage.c */
|
||||
extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
|
||||
extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
|
||||
extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
|
||||
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags);
|
||||
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
|
||||
BlockNumber blkno, int flags);
|
||||
extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh);
|
||||
extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
|
||||
int access, HashMetaPage *cachedmetap);
|
||||
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
|
||||
extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, bool initpage);
|
||||
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum);
|
||||
extern Buffer _hash_getbuf_with_strategy(
|
||||
Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy);
|
||||
extern void _hash_relbuf(Relation rel, Buffer buf);
|
||||
extern void _hash_dropbuf(Relation rel, Buffer buf);
|
||||
extern void _hash_wrtbuf(Relation rel, Buffer buf);
|
||||
extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access);
|
||||
extern uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum);
|
||||
extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
|
||||
extern uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum);
|
||||
extern void _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, uint16 ffactor, bool initpage);
|
||||
extern void _hash_pageinit(Page page, Size size);
|
||||
extern void _hash_expandtable(Relation rel, Buffer metabuf);
|
||||
|
||||
/* hashscan.c */
|
||||
extern void _hash_regscan(IndexScanDesc scan);
|
||||
extern void _hash_dropscan(IndexScanDesc scan);
|
||||
extern bool _hash_has_active_scan(Relation rel, Bucket bucket);
|
||||
extern void ReleaseResources_hash(void);
|
||||
extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
|
||||
uint32 maxbucket, uint32 highmask, uint32 lowmask);
|
||||
|
||||
/* hashsearch.c */
|
||||
extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
|
||||
|
@ -320,10 +376,10 @@ extern bool _hash_step(IndexScanDesc scan, Buffer* bufP, ScanDirection dir);
|
|||
/* hashsort.c */
|
||||
typedef struct HSpool HSpool; /* opaque struct in hashsort.c */
|
||||
|
||||
extern HSpool* _h_spoolinit(Relation index, uint32 num_buckets, void* meminfo);
|
||||
extern HSpool* _h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void* meminfo);
|
||||
extern void _h_spooldestroy(HSpool* hspool);
|
||||
extern void _h_spool(HSpool* hspool, ItemPointer self, Datum* values, const bool* isnull);
|
||||
extern void _h_indexbuild(HSpool* hspool);
|
||||
extern void _h_indexbuild(HSpool* hspool, Relation heapRel);
|
||||
|
||||
/* hashutil.c */
|
||||
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
|
||||
|
@ -331,16 +387,31 @@ extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
|
|||
extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
|
||||
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask);
|
||||
extern uint32 _hash_log2(uint32 num);
|
||||
extern uint32 _hash_spareindex(uint32 num_bucket);
|
||||
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
|
||||
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
|
||||
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
|
||||
extern IndexTuple _hash_form_tuple(Relation index, Datum* values, const bool* isnull);
|
||||
extern bool _hash_convert_tuple(Relation index, Datum *user_values, const bool *user_isnull,
|
||||
Datum *index_values, bool *index_isnull);
|
||||
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
|
||||
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
|
||||
extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
|
||||
extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
|
||||
extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
|
||||
uint32 lowmask, uint32 maxbucket);
|
||||
extern void _hash_kill_items(IndexScanDesc scan);
|
||||
|
||||
/* hash.c */
|
||||
extern void hash_redo(XLogReaderState* record);
|
||||
extern void hash_desc(StringInfo buf, XLogReaderState* record);
|
||||
extern const char* hash_type_name(uint8 subtype);
|
||||
extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
|
||||
Buffer bucket_buf, BlockNumber bucket_blkno,
|
||||
BufferAccessStrategy bstrategy,
|
||||
uint32 maxbucket, uint32 highmask, uint32 lowmask,
|
||||
double *tuples_removed, double *num_index_tuples,
|
||||
bool bucket_has_garbage,
|
||||
IndexBulkDeleteCallback callback, void *callback_state);
|
||||
|
||||
#ifdef PGXC
|
||||
extern Datum compute_hash(Oid type, Datum value, char locator);
|
||||
@ -0,0 +1,352 @@
|
|||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* hash_xlog.h
|
||||
* header file for Postgres hash AM implementation
|
||||
*
|
||||
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* src/include/access/hash_xlog.h
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef HASH_XLOG_H
|
||||
#define HASH_XLOG_H
|
||||
|
||||
#include "access/xlogreader.h"
|
||||
#include "lib/stringinfo.h"
|
||||
#include "storage/off.h"
|
||||
|
||||
/* Number of buffers required for XLOG_HASH_SQUEEZE_PAGE operation */
|
||||
#define HASH_XLOG_FREE_OVFL_BUFS 6
|
||||
|
||||
/*
|
||||
* XLOG records for hash operations
|
||||
*/
|
||||
#define XLOG_HASH_INIT_META_PAGE 0x00 /* initialize the meta page */
|
||||
#define XLOG_HASH_INIT_BITMAP_PAGE 0x10 /* initialize the bitmap page */
|
||||
#define XLOG_HASH_INSERT 0x20 /* add index tuple without split */
|
||||
#define XLOG_HASH_ADD_OVFL_PAGE 0x30 /* add overflow page */
|
||||
#define XLOG_HASH_SPLIT_ALLOCATE_PAGE 0x40 /* allocate new page for split */
|
||||
#define XLOG_HASH_SPLIT_PAGE 0x50 /* split page */
|
||||
#define XLOG_HASH_SPLIT_COMPLETE 0x60 /* completion of split operation */
|
||||
#define XLOG_HASH_MOVE_PAGE_CONTENTS 0x70 /* remove tuples from one page
|
||||
* and add to another page */
|
||||
#define XLOG_HASH_SQUEEZE_PAGE 0x80 /* add tuples to one of the previous
|
||||
* pages in chain and free the ovfl
|
||||
* page */
|
||||
#define XLOG_HASH_DELETE 0x90 /* delete index tuples from a page */
|
||||
#define XLOG_HASH_SPLIT_CLEANUP 0xA0 /* clear split-cleanup flag in primary
|
||||
* bucket page after deleting tuples
|
||||
* that are moved due to split */
|
||||
#define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after vacuum */
|
||||
#define XLOG_HASH_VACUUM_ONE_PAGE 0xC0 /* remove dead tuples from index page */
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_INIT_META_PAGE_NUM = 0,
|
||||
}XLogHashInitMetaPageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_INIT_BITMAP_PAGE_BITMAP_NUM = 0,
|
||||
XLOG_HASH_INIT_BITMAP_PAGE_META_NUM,
|
||||
}XLogHashInitBitmapPageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_INSERT_PAGE_NUM = 0,
|
||||
XLOG_HASH_INSERT_META_NUM,
|
||||
}XLogHashInsertEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_ADD_OVFL_PAGE_OVFL_NUM = 0,
|
||||
XLOG_HASH_ADD_OVFL_PAGE_LEFT_NUM,
|
||||
XLOG_HASH_ADD_OVFL_PAGE_MAP_NUM,
|
||||
XLOG_HASH_ADD_OVFL_PAGE_NEWMAP_NUM,
|
||||
XLOG_HASH_ADD_OVFL_PAGE_META_NUM,
|
||||
}XLogHashAddOvflPageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_SPLIT_ALLOCATE_PAGE_OBUK_NUM = 0,
|
||||
XLOG_HASH_SPLIT_ALLOCATE_PAGE_NBUK_NUM,
|
||||
XLOG_HASH_SPLIT_ALLOCATE_PAGE_META_NUM,
|
||||
}XLogHashSplitAllocatePageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_SPLIT_PAGE_NUM = 0,
|
||||
}XLogHashSplitPageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_SPLIT_COMPLETE_OBUK_NUM = 0,
|
||||
XLOG_HASH_SPLIT_COMPLETE_NBUK_NUM,
|
||||
}XLogHashSplitCompleteEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_MOVE_BUK_BLOCK_NUM = 0,
|
||||
HASH_MOVE_ADD_BLOCK_NUM,
|
||||
HASH_MOVE_DELETE_OVFL_BLOCK_NUM,
|
||||
}XLogHashMovePageEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_SQUEEZE_BUK_BLOCK_NUM = 0,
|
||||
HASH_SQUEEZE_ADD_BLOCK_NUM,
|
||||
HASH_SQUEEZE_INIT_OVFLBUF_BLOCK_NUM,
|
||||
HASH_SQUEEZE_UPDATE_PREV_BLOCK_NUM,
|
||||
HASH_SQUEEZE_UPDATE_NEXT_BLOCK_NUM,
|
||||
HASH_SQUEEZE_UPDATE_BITMAP_BLOCK_NUM,
|
||||
HASH_SQUEEZE_UPDATE_META_BLOCK_NUM,
|
||||
}XLogHashSqueezePageEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_DELETE_BUK_BLOCK_NUM = 0,
|
||||
HASH_DELETE_OVFL_BLOCK_NUM,
|
||||
}XLogHashDeleteEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_SPLIT_CLEANUP_BLOCK_NUM,
|
||||
}XLogHashSplitCleanupEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_UPDATE_META_BLOCK_NUM,
|
||||
} XLogHashUpdateMateEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_VACUUM_PAGE_BLOCK_NUM = 0,
|
||||
HASH_VACUUM_META_BLOCK_NUM,
|
||||
} XLogHashVacuumPageEnum;
|
||||
|
||||
/*
|
||||
* xl_hash_split_allocate_page flag values, 8 bits are available.
|
||||
*/
|
||||
#define XLH_SPLIT_META_UPDATE_MASKS (1<<0)
|
||||
#define XLH_SPLIT_META_UPDATE_SPLITPOINT (1<<1)
|
||||
|
||||
/*
|
||||
* This is what we need to know about a HASH index create.
|
||||
*
|
||||
* Backup block 0: metapage
|
||||
*/
|
||||
typedef struct xl_hash_createidx
|
||||
{
|
||||
double num_tuples;
|
||||
RegProcedure procid;
|
||||
uint16 ffactor;
|
||||
} xl_hash_createidx;
|
||||
|
||||
#define SizeOfHashCreateIdx (offsetof(xl_hash_createidx, ffactor) + sizeof(uint16))
|
||||
|
||||
/*
|
||||
* This is what we need to know about simple (without split) insert.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_INSERT
|
||||
*
|
||||
* Backup Blk 0: original page (data contains the inserted tuple)
|
||||
* Backup Blk 1: metapage (HashMetaPageData)
|
||||
*/
|
||||
typedef struct xl_hash_insert
|
||||
{
|
||||
OffsetNumber offnum;
|
||||
} xl_hash_insert;
|
||||
|
||||
#define SizeOfHashInsert (offsetof(xl_hash_insert, offnum) + sizeof(OffsetNumber))
|
||||
|
||||
/*
|
||||
* This is what we need to know about addition of overflow page.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_ADD_OVFL_PAGE
|
||||
*
|
||||
* Backup Blk 0: newly allocated overflow page
|
||||
* Backup Blk 1: page before new overflow page in the bucket chain
|
||||
* Backup Blk 2: bitmap page
|
||||
* Backup Blk 3: new bitmap page
|
||||
* Backup Blk 4: metapage
|
||||
*/
|
||||
typedef struct xl_hash_add_ovfl_page
|
||||
{
|
||||
uint16 bmsize;
|
||||
bool bmpage_found;
|
||||
} xl_hash_add_ovfl_page;
|
||||
|
||||
#define SizeOfHashAddOvflPage \
|
||||
(offsetof(xl_hash_add_ovfl_page, bmpage_found) + sizeof(bool))
|
||||
|
||||
/*
|
||||
* This is what we need to know about allocating a page for split.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_SPLIT_ALLOCATE_PAGE
|
||||
*
|
||||
* Backup Blk 0: page for old bucket
|
||||
* Backup Blk 1: page for new bucket
|
||||
* Backup Blk 2: metapage
|
||||
*/
|
||||
typedef struct xl_hash_split_allocate_page
|
||||
{
|
||||
uint32 new_bucket;
|
||||
uint16 old_bucket_flag;
|
||||
uint16 new_bucket_flag;
|
||||
uint8 flags;
|
||||
} xl_hash_split_allocate_page;
|
||||
|
||||
#define SizeOfHashSplitAllocPage \
|
||||
(offsetof(xl_hash_split_allocate_page, flags) + sizeof(uint8))
|
||||
|
||||
/*
|
||||
* This is what we need to know about completing the split operation.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_SPLIT_COMPLETE
|
||||
*
|
||||
* Backup Blk 0: page for old bucket
|
||||
* Backup Blk 1: page for new bucket
|
||||
*/
|
||||
typedef struct xl_hash_split_complete
|
||||
{
|
||||
uint16 old_bucket_flag;
|
||||
uint16 new_bucket_flag;
|
||||
} xl_hash_split_complete;
|
||||
|
||||
#define SizeOfHashSplitComplete \
|
||||
(offsetof(xl_hash_split_complete, new_bucket_flag) + sizeof(uint16))
|
||||
|
||||
/*
|
||||
* This is what we need to know about move page contents required during
|
||||
* squeeze operation.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS
|
||||
*
|
||||
* Backup Blk 0: bucket page
|
||||
* Backup Blk 1: page containing moved tuples
|
||||
* Backup Blk 2: page from which tuples will be removed
|
||||
*/
|
||||
typedef struct xl_hash_move_page_contents
|
||||
{
|
||||
uint16 ntups;
|
||||
bool is_prim_bucket_same_wrt; /* true if the page to which
|
||||
* tuples are moved is same as
|
||||
* primary bucket page */
|
||||
} xl_hash_move_page_contents;
|
||||
|
||||
#define SizeOfHashMovePageContents \
|
||||
(offsetof(xl_hash_move_page_contents, is_prim_bucket_same_wrt) + sizeof(bool))
|
||||
|
||||
/*
|
||||
* This is what we need to know about the squeeze page operation.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_SQUEEZE_PAGE
|
||||
*
|
||||
* Backup Blk 0: page containing tuples moved from freed overflow page
|
||||
* Backup Blk 1: freed overflow page
|
||||
* Backup Blk 2: page previous to the freed overflow page
|
||||
* Backup Blk 3: page next to the freed overflow page
|
||||
* Backup Blk 4: bitmap page containing info of freed overflow page
|
||||
* Backup Blk 5: meta page
|
||||
*/
|
||||
typedef struct xl_hash_squeeze_page
|
||||
{
|
||||
BlockNumber prevblkno;
|
||||
BlockNumber nextblkno;
|
||||
uint16 ntups;
|
||||
bool is_prim_bucket_same_wrt; /* true if the page to which
|
||||
* tuples are moved is same as
|
||||
* primary bucket page */
|
||||
bool is_prev_bucket_same_wrt; /* true if the page to which
|
||||
* tuples are moved is the page
|
||||
* previous to the freed overflow
|
||||
* page */
|
||||
} xl_hash_squeeze_page;
|
||||
|
||||
#define SizeOfHashSqueezePage \
|
||||
(offsetof(xl_hash_squeeze_page, is_prev_bucket_same_wrt) + sizeof(bool))
|
||||
|
||||
/*
|
||||
* This is what we need to know about the deletion of index tuples from a page.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_DELETE
|
||||
*
|
||||
* Backup Blk 0: primary bucket page
|
||||
* Backup Blk 1: page from which tuples are deleted
|
||||
*/
|
||||
typedef struct xl_hash_delete
|
||||
{
|
||||
bool clear_dead_marking; /* true if this operation clears
|
||||
* LH_PAGE_HAS_DEAD_TUPLES flag */
|
||||
bool is_primary_bucket_page; /* true if the operation is for
|
||||
* primary bucket page */
|
||||
} xl_hash_delete;
|
||||
|
||||
#define SizeOfHashDelete \
|
||||
(offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool))
|
||||
|
||||
/*
|
||||
* This is what we need for metapage update operation.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_UPDATE_META_PAGE
|
||||
*
|
||||
* Backup Blk 0: meta page
|
||||
*/
|
||||
typedef struct xl_hash_update_meta_page
|
||||
{
|
||||
double ntuples;
|
||||
} xl_hash_update_meta_page;
|
||||
|
||||
#define SizeOfHashUpdateMetaPage \
|
||||
(offsetof(xl_hash_update_meta_page, ntuples) + sizeof(double))
|
||||
|
||||
/*
|
||||
* This is what we need to initialize metapage.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_INIT_META_PAGE
|
||||
*
|
||||
* Backup Blk 0: meta page
|
||||
*/
|
||||
typedef struct xl_hash_init_meta_page
|
||||
{
|
||||
double num_tuples;
|
||||
RegProcedure procid;
|
||||
uint16 ffactor;
|
||||
} xl_hash_init_meta_page;
|
||||
|
||||
#define SizeOfHashInitMetaPage \
|
||||
(offsetof(xl_hash_init_meta_page, ffactor) + sizeof(uint16))
|
||||
|
||||
/*
|
||||
* This is what we need to initialize bitmap page.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_INIT_BITMAP_PAGE
|
||||
*
|
||||
* Backup Blk 0: bitmap page
|
||||
* Backup Blk 1: meta page
|
||||
*/
|
||||
typedef struct xl_hash_init_bitmap_page
|
||||
{
|
||||
uint16 bmsize;
|
||||
} xl_hash_init_bitmap_page;
|
||||
|
||||
#define SizeOfHashInitBitmapPage \
|
||||
(offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16))
|
||||
|
||||
/*
|
||||
* This is what we need for index tuple deletion and to
|
||||
* update the meta page.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_VACUUM_ONE_PAGE
|
||||
*
|
||||
* Backup Blk 0: bucket page
|
||||
* Backup Blk 1: meta page
|
||||
*/
|
||||
typedef struct xl_hash_vacuum_one_page
|
||||
{
|
||||
RelFileNode hnode;
|
||||
int ntuples;
|
||||
|
||||
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
|
||||
} xl_hash_vacuum_one_page;
|
||||
|
||||
#define SizeOfHashVacuumOnePage \
|
||||
(offsetof(xl_hash_vacuum_one_page, ntuples) + sizeof(int))
|
||||
|
||||
extern void hash_redo(XLogReaderState *record);
|
||||
extern void hash_desc(StringInfo buf, XLogReaderState *record);
|
||||
extern const char *hash_identify(uint8 info);
|
||||
extern bool IsHashVacuumPages(XLogReaderState *record);
|
||||
|
||||
#endif /* HASH_XLOG_H */
|
|
@ -131,6 +131,22 @@ typedef struct {
|
|||
int offset; /* offset of field in result struct */
|
||||
} relopt_parse_elt;
|
||||
|
||||
struct TableCreateSupport {
|
||||
bool compressType;
|
||||
bool compressLevel;
|
||||
bool compressChunkSize;
|
||||
bool compressPreAllocChunks;
|
||||
bool compressByteConvert;
|
||||
bool compressDiffConvert;
|
||||
};
|
||||
|
||||
inline bool HasCompressOption(TableCreateSupport *tableCreateSupport)
|
||||
{
|
||||
return tableCreateSupport->compressLevel || tableCreateSupport->compressChunkSize ||
|
||||
tableCreateSupport->compressPreAllocChunks || tableCreateSupport->compressByteConvert ||
|
||||
tableCreateSupport->compressDiffConvert;
|
||||
}
|
||||
|
||||
/*
|
||||
* The following are the table append modes currently supported.
|
||||
* on: mark the table on-line scaleout mode, when it is set, later data write by append mode.
|
||||
|
@ -285,5 +301,8 @@ extern void forbid_to_set_options_for_timeseries_tbl(List* options);
|
|||
extern List* RemoveRelOption(List* options, const char* optName, bool* removed);
|
||||
void RowTblCheckCompressionOption(List *options, int8 rowCompress = REL_CMPRS_PAGE_PLAIN);
|
||||
void RowTblCheckHashBucketOption(List* options, StdRdOptions* std_opt);
|
||||
void SetOneOfCompressOption(const char *defname, TableCreateSupport *tableCreateSupport);
|
||||
void CheckCompressOption(TableCreateSupport *tableCreateSupport);
|
||||
void ForbidUserToSetCompressedOptions(List *options);
|
||||
#endif /* RELOPTIONS_H */
|
||||
|
||||
@ -20,6 +20,7 @@
|
|||
#include "storage/buf/block.h"
|
||||
#include "storage/buf/buf.h"
|
||||
#include "storage/buf/bufpage.h"
|
||||
#include "storage/page_compression.h"
|
||||
#include "storage/smgr/relfilenode.h"
|
||||
|
||||
struct XLogPhyBlock;
|
||||
@ -59,6 +59,7 @@ typedef void (*relasexlogreadstate)(void* record);
|
|||
#define XLogBlockHeadGetForkNum(blockhead) ((blockhead)->forknum)
|
||||
#define XLogBlockHeadGetBlockNum(blockhead) ((blockhead)->blkno)
|
||||
#define XLogBlockHeadGetBucketId(blockhead) ((blockhead)->bucketNode)
|
||||
#define XLogBlockHeadGetCompressOpt(blockhead) ((blockhead)->opt)
|
||||
#define XLogBlockHeadGetValidInfo(blockhead) ((blockhead)->block_valid)
|
||||
#define XLogBlockHeadGetPhysicalBlock(blockhead) ((blockhead)->pblk)
|
||||
/* for common blockhead end */
|
||||
|
@ -495,7 +496,8 @@ typedef struct {
|
|||
TransactionId xl_xid; /* xact id */
|
||||
Oid spcNode; /* tablespace */
|
||||
Oid dbNode; /* database */
|
||||
int4 bucketNode; /* bucket */
|
||||
int2 bucketNode; /* bucket */
|
||||
uint2 opt;
|
||||
XLogPhyBlock pblk;
|
||||
} XLogBlockHead;
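The hunk above narrows bucketNode from int4 to int2 and spends the two freed bytes on the new opt field, so the block head keeps its size. A static_assert sketch of that layout argument, with plain fixed-width types standing in for the openGauss typedefs and the remaining members omitted:

#include <cstdint>

struct OldHeadPrefix {
    uint64_t xl_xid;      /* TransactionId */
    uint32_t spcNode;
    uint32_t dbNode;
    int32_t  bucketNode;  /* old: int4 */
};

struct NewHeadPrefix {
    uint64_t xl_xid;
    uint32_t spcNode;
    uint32_t dbNode;
    int16_t  bucketNode;  /* new: int2 */
    uint16_t opt;         /* compression option reuses the freed bytes */
};

static_assert(sizeof(OldHeadPrefix) == sizeof(NewHeadPrefix),
              "shrinking bucketNode makes room for opt without growing the header");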
|
||||
|
||||
|
@@ -1002,6 +1004,47 @@ extern void UBTreeXlogUnlinkPageOperatorChildpage(RedoBufferInfo* cbuf, void* re

extern void UBTreeXlogClearIncompleteSplit(RedoBufferInfo* buffer);

void HashRedoInitMetaPageOperatorPage(RedoBufferInfo *metabuf, void *recorddata);

void HashRedoInitBitmapPageOperatorBitmapPage(RedoBufferInfo *bitmapbuf, void *recorddata);
void HashRedoInitBitmapPageOperatorMetaPage(RedoBufferInfo *metabuf);

void HashRedoInsertOperatorPage(RedoBufferInfo *buffer, void *recorddata, void *data, Size datalen);
void HashRedoInsertOperatorMetaPage(RedoBufferInfo *metabuf);

void HashRedoAddOvflPageOperatorOvflPage(RedoBufferInfo *ovflbuf, BlockNumber leftblk, void *data, Size datalen);
void HashRedoAddOvflPageOperatorLeftPage(RedoBufferInfo *ovflbuf, BlockNumber rightblk);
void HashRedoAddOvflPageOperatorMapPage(RedoBufferInfo *mapbuf, void *data);
void HashRedoAddOvflPageOperatorNewmapPage(RedoBufferInfo *newmapbuf, void *recorddata);
void HashRedoAddOvflPageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *data, Size datalen);

void HashRedoSplitAllocatePageOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata);
void HashRedoSplitAllocatePageOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata);
void HashRedoSplitAllocatePageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *blkdata);

void HashRedoSplitCompleteOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata);
void HashRedoSplitCompleteOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata);

void HashXlogMoveAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
void HashXlogMoveDeleteOvflPageOperatorPage(RedoBufferInfo *redobuffer, void *blkdata, Size len);

void HashXlogSqueezeAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
void HashXlogSqueezeInitOvflbufOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogSqueezeUpdatePrevPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogSqueezeUpdateNextPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogSqueezeUpdateBitmapOperatorPage(RedoBufferInfo *redobuffer, void *blkdata);
void HashXlogSqueezeUpdateMateOperatorPage(RedoBufferInfo *redobuffer, void *blkdata);

void HashXlogDeleteBlockOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);

void HashXlogSplitCleanupOperatorPage(RedoBufferInfo *redobuffer);

void HashXlogUpdateMetaOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);

void HashXlogVacuumOnePageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, Size len);

void HashXlogVacuumMateOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);

void XLogRecSetBlockCommonState(XLogReaderState* record, XLogBlockParseEnum blockvalid,
    RelFileNodeForkNum filenode, XLogRecParseState* recordblockstate, XLogPhyBlock *pblk = NULL);
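As a hedged illustration of how these per-page operators are typically driven, the sketch below dispatches on an info byte in the style of a redo routine; the XLOG_HASH_* record labels and the argument plumbing are assumptions, only the operator names come from the declarations above.

    /* Hedged sketch, not the real HashRedoDataBlock: the case labels are assumed. */
    static void HashRedoDataBlockSketch(uint8 info, RedoBufferInfo *buffer,
                                        void *recorddata, void *blkdata, Size len)
    {
        switch (info) {
            case XLOG_HASH_INSERT: /* assumed record type */
                HashRedoInsertOperatorPage(buffer, recorddata, blkdata, len);
                break;
            case XLOG_HASH_SQUEEZE_PAGE: /* assumed record type */
                HashXlogSqueezeAddPageOperatorPage(buffer, recorddata, blkdata, len);
                break;
            default:
                break; /* other hash record types would call the matching operator */
        }
    }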
@@ -1047,6 +1090,7 @@ extern void UBTreeRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* bl
extern void UBTree2RedoDataBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec,
    RedoBufferInfo *bufferinfo);

extern void HashRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
XLogRecParseState* XactXlogCsnlogParseToBlock(XLogReaderState* record, uint32* blocknum, TransactionId xid,
    int nsubxids, TransactionId* subxids, CommitSeqNo csn, XLogRecParseState* recordstatehead);
extern void XLogRecSetVmBlockState(XLogReaderState* record, uint32 blockid, XLogRecParseState* recordblockstate);

@@ -1189,6 +1233,7 @@ extern void XLogBlockSegDdlDoRealAction(XLogBlockHead* blockhead, void* blockrec
extern void GinRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
extern void GistRedoDataBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, RedoBufferInfo *bufferinfo);
extern bool IsCheckPoint(const XLogRecParseState *parseState);

void redo_atomic_xlog_dispatch(uint8 opCode, RedoBufferInfo *redo_buf, const char *data);
void seg_redo_new_page_copy_and_flush(BufferTag *tag, char *data, XLogRecPtr lsn);
@@ -37,7 +37,8 @@
 */
#define XLR_SPECIAL_REL_UPDATE 0x01
#define XLR_BTREE_UPGRADE_FLAG 0x02

/* If xlog record is the compress table creation */
#define XLR_REL_COMPRESS 0X04
/* If xlog record is from toast page */
#define XLR_IS_TOAST 0X08
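These flags are plain bit masks on the record's info byte, so testing them is a bitwise AND; a minimal hedged sketch, assuming the caller already has the raw info value:

    /* Hedged sketch: only the mask names come from the hunk above. */
    static inline bool XLogRecIsCompressedTableCreate(uint8 xlInfo)
    {
        return (xlInfo & XLR_REL_COMPRESS) != 0; /* record created a compressed table */
    }
    static inline bool XLogRecIsFromToastPage(uint8 xlInfo)
    {
        return (xlInfo & XLR_IS_TOAST) != 0;     /* record came from a toast page */
    }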
@@ -84,7 +85,7 @@ typedef struct XLogRecordBlockHeader {
#define BKID_HAS_TDE_PAGE (0x40)
#define BKID_GET_BKID(id) (id & 0x3F)

/*
 * In segment-page storage, the RelFileNode and block number are logical for XLog, so the
 * physical location must also be recorded in the xlog. This macro checks whether that is the case.
 */
@@ -80,12 +80,14 @@ extern Relation heap_create(const char *relname,
    bool mapped_relation,
    bool allow_system_table_mods,
    int8 row_compress,
    Datum reloptions,
    Oid ownerid,
    bool skip_create_storage,
    TableAmType tam_type,
    int8 relindexsplit = 0,
    StorageType storage_type = HEAP_DISK,
    bool newcbi = false);
    bool newcbi = false,
    Oid accessMethodObjectId = 0);

extern bool heap_is_matview_init_state(Relation rel);
@@ -98,7 +100,9 @@ heapCreatePartition(const char* part_name,
    Oid bucketOid,
    Oid ownerid,
    StorageType storage_type,
    bool newcbi = false);
    bool newcbi = false,
    Datum reloptions = Datum(0));

extern Oid heap_create_with_catalog(const char *relname,
    Oid relnamespace,
@@ -120,7 +124,7 @@ extern Oid heap_create_with_catalog(const char *relname,
    bool use_user_acl,
    bool allow_system_table_mods,
    PartitionState *partTableState,
    int8 row_compress,
    HashBucketInfo *bucketinfo,
    bool record_dependce = true,
    List* ceLst = NULL,
@@ -200,7 +204,7 @@ extern void CheckAttributeType(const char *attname, Oid atttypid, Oid attcollati
#ifdef PGXC
/* Functions related to distribution data of relations */
extern void AddRelationDistribution(const char *relname, Oid relid, DistributeBy *distributeby,
    PGXCSubCluster *subcluster, List *parentOids, TupleDesc descriptor, bool isinstallationgroup,
    bool isbucket = false, int bucketmaplen = 0);
extern void GetRelationDistributionItems(Oid relid, DistributeBy *distributeby, TupleDesc descriptor, char *locatortype,
    int *hashalgorithm, int *hashbuckets, AttrNumber *attnum);
@@ -20,6 +20,7 @@
#include "utils/tuplesort.h"

#define DEFAULT_INDEX_TYPE "btree"
#define DEFAULT_HASH_INDEX_TYPE "hash"
#define DEFAULT_CSTORE_INDEX_TYPE "psort"
#define DEFAULT_GIST_INDEX_TYPE "gist"
#define CSTORE_BTREE_INDEX_TYPE "cbtree"
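For context, a hedged sketch of how callers might pick one of these default access-method names; the chooser function is hypothetical and not part of the commit.

    /* Hedged sketch: a hypothetical helper, only the macro names are from the hunk. */
    static const char *DefaultIndexTypeFor(bool wantHash, bool isColumnStore)
    {
        if (isColumnStore)
            return DEFAULT_CSTORE_INDEX_TYPE;                            /* "psort" */
        return wantHash ? DEFAULT_HASH_INDEX_TYPE : DEFAULT_INDEX_TYPE;  /* "hash" or "btree" */
    }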
@@ -38,11 +38,23 @@ typedef struct xl_smgr_create {
    ForkNumber forkNum;
} xl_smgr_create;

typedef struct xl_smgr_create_compress {
    xl_smgr_create xlrec;
    uint2 pageCompressOpts;
} xl_smgr_create_compress;

typedef struct xl_smgr_truncate {
    BlockNumber blkno;
    RelFileNodeOld rnode;
} xl_smgr_truncate;

typedef struct xl_smgr_truncate_compress {
    xl_smgr_truncate xlrec;
    uint2 pageCompressOpts;
} xl_smgr_truncate_compress;

extern void log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum);

extern void smgr_redo(XLogReaderState *record);
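The *_compress variants simply wrap the existing smgr WAL records and append the page compression options; a minimal hedged sketch of filling one in (the helper is hypothetical, and the relfilenode member of the embedded xl_smgr_create is not shown in this hunk, so it is left untouched):

    /* Hedged sketch: only the struct layouts above come from the commit. */
    static void FillSmgrCreateCompress(xl_smgr_create_compress *xlrec,
                                       ForkNumber forkNum, uint2 pageCompressOpts)
    {
        xlrec->xlrec.forkNum = forkNum;             /* plain xl_smgr_create payload */
        xlrec->pageCompressOpts = pageCompressOpts; /* compression options appended by this commit */
    }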
@@ -0,0 +1,2 @@
DROP FUNCTION IF EXISTS pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8) CASCADE;
DROP FUNCTION IF EXISTS pg_catalog.gs_read_block_from_remote(int4, int4, int4, int2, int2, int4, xid, int4, xid, boolean, int4) CASCADE;

@@ -0,0 +1,2 @@
DROP FUNCTION IF EXISTS pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8) CASCADE;
DROP FUNCTION IF EXISTS pg_catalog.gs_read_block_from_remote(int4, int4, int4, int2, int2, int4, xid, int4, xid, boolean, int4) CASCADE;
@@ -0,0 +1,23 @@
SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 4768;
CREATE OR REPLACE FUNCTION pg_catalog.gs_read_block_from_remote
( int4,
  int4,
  int4,
  int2,
  int2,
  int4,
  xid,
  int4,
  xid,
  boolean,
  int4)
RETURNS SETOF record LANGUAGE INTERNAL ROWS 1 STRICT as 'gs_read_block_from_remote_compress';
-- pg_read_binary_file_blocks()
--
SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 8413;
CREATE FUNCTION pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8,
    OUT path text,
    OUT blocknum int4,
    OUT len int4,
    OUT data bytea)
AS 'pg_read_binary_file_blocks' LANGUAGE INTERNAL IMMUTABLE STRICT;
@@ -0,0 +1,23 @@
SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 4768;
CREATE OR REPLACE FUNCTION pg_catalog.gs_read_block_from_remote
( int4,
  int4,
  int4,
  int2,
  int2,
  int4,
  xid,
  int4,
  xid,
  boolean,
  int4)
RETURNS SETOF record LANGUAGE INTERNAL ROWS 1 STRICT as 'gs_read_block_from_remote_compress';
-- pg_read_binary_file_blocks()
--
SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 8413;
CREATE FUNCTION pg_catalog.pg_read_binary_file_blocks(IN inputpath text, IN startblocknum int8, IN count int8,
    OUT path text,
    OUT blocknum int4,
    OUT len int4,
    OUT data bytea)
AS 'pg_read_binary_file_blocks' LANGUAGE INTERNAL IMMUTABLE STRICT;
@@ -1199,6 +1199,7 @@ typedef struct knl_instance_context {
    knl_g_archive_context archive_obs_cxt;
    knl_g_archive_thread_info archive_thread_info;
    struct HTAB* ngroup_hash_table;
    struct HTAB* mmapCache;
    knl_g_hypo_context hypo_cxt;

    knl_g_segment_context segment_cxt;
@@ -88,6 +88,7 @@ extern const uint32 SUPPORT_DATA_REPAIR;
extern const uint32 SCAN_BATCH_MODE_VERSION_NUM;
extern const uint32 PUBLICATION_VERSION_NUM;
extern const uint32 ANALYZER_HOOK_VERSION_NUM;
extern const uint32 SUPPORT_HASH_XLOG_VERSION_NUM;

extern void register_backend_version(uint32 backend_version);
extern bool contain_backend_version(uint32 version_number);
@@ -1336,6 +1336,8 @@ typedef enum WaitEventIO {
    WAIT_EVENT_OBS_READ,
    WAIT_EVENT_OBS_WRITE,
    WAIT_EVENT_LOGCTRL_SLEEP,
    WAIT_EVENT_COMPRESS_ADDRESS_FILE_FLUSH,
    WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC,
    IO_EVENT_NUM = WAIT_EVENT_LOGCTRL_SLEEP - WAIT_EVENT_BUFFILE_READ + 1 // MUST be last, DO NOT use this value.
} WaitEventIO;
@@ -96,6 +96,13 @@ typedef struct buftag {
    BlockNumber blockNum; /* blknum relative to begin of reln */
} BufferTag;

typedef struct buftagnocompress {
    RelFileNodeV2 rnode;
    ForkNumber forkNum;
    BlockNumber blockNum; /* blknum relative to begin of reln */
} BufferTagSecondVer;

typedef struct buftagnohbkt {
    RelFileNodeOld rnode; /* physical relation identifier */
    ForkNumber forkNum;
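A hedged sketch of populating the compression-era tag layout declared above; the initializer is hypothetical and mirrors what an INIT_BUFFERTAG-style helper would do, using only the members visible in the hunk.

    /* Hedged sketch: RelFileNodeV2 internals are not shown here, so the node is copied whole. */
    static void InitBufferTagSecondVer(BufferTagSecondVer *tag, const RelFileNodeV2 *rnode,
                                       ForkNumber forkNum, BlockNumber blockNum)
    {
        tag->rnode = *rnode;
        tag->forkNum = forkNum;
        tag->blockNum = blockNum;
    }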
@@ -325,6 +325,7 @@ extern bool ConditionalLockBuffer(Buffer buffer);
extern void LockBufferForCleanup(Buffer buffer);
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
extern bool ConditionalLockUHeapBufferForCleanup(Buffer buffer);
extern bool IsBufferCleanupOK(Buffer buffer);
extern bool HoldingBufferPinThatDelaysRecovery(void);
extern void AsyncUnpinBuffer(volatile void* bufHdr, bool forgetBuffer);
extern void AsyncCompltrPinBuffer(volatile void* bufHdr);
Some files were not shown because too many files have changed in this diff.