diff --git a/contrib/pg_xlogdump/rmgrdesc.cpp b/contrib/pg_xlogdump/rmgrdesc.cpp index 5605d0046..ce834d139 100644 --- a/contrib/pg_xlogdump/rmgrdesc.cpp +++ b/contrib/pg_xlogdump/rmgrdesc.cpp @@ -13,6 +13,7 @@ #include "access/gin.h" #include "access/gist_private.h" #include "access/hash.h" +#include "access/hash_xlog.h" #include "access/heapam.h" #include "access/multixact.h" #include "access/nbtree.h" diff --git a/contrib/pgstattuple/pgstattuple.cpp b/contrib/pgstattuple/pgstattuple.cpp index 6b18b9a72..0760c570e 100644 --- a/contrib/pgstattuple/pgstattuple.cpp +++ b/contrib/pgstattuple/pgstattuple.cpp @@ -363,7 +363,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b Page page; OffsetNumber maxoff; - _hash_getlock(rel, blkno, HASH_SHARE); buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy); page = BufferGetPage(buf); @@ -390,7 +389,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b } _hash_relbuf(rel, buf); - _hash_droplock(rel, blkno, HASH_SHARE); } /* diff --git a/src/common/backend/parser/parse_utilcmd.cpp b/src/common/backend/parser/parse_utilcmd.cpp index 49df36e08..d10b5a89d 100644 --- a/src/common/backend/parser/parse_utilcmd.cpp +++ b/src/common/backend/parser/parse_utilcmd.cpp @@ -3353,12 +3353,21 @@ IndexStmt* transformIndexStmt(Oid relid, IndexStmt* stmt, const char* queryStrin if (!isColStore && (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_INDEX_TYPE)) && (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIN_INDEX_TYPE)) && - (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIST_INDEX_TYPE))) { - /* row store only support btree/gin/gist index */ + (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIST_INDEX_TYPE)) && + (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE))) { + /* row store only support btree/gin/gist/hash index */ ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("access method \"%s\" does not support row store", stmt->accessMethod))); } + + if (0 == pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE) && + t_thrd.proc->workingVersionNum < SUPPORT_HASH_XLOG_VERSION_NUM) { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not support row store", stmt->accessMethod))); + } + if (isColStore && (!isPsortMothed && !isCBtreeMethod && !isCGinBtreeMethod)) { /* column store support psort/cbtree/gin index */ ereport(ERROR, diff --git a/src/common/backend/utils/init/globals.cpp b/src/common/backend/utils/init/globals.cpp index 7f11d712b..637fc4e47 100644 --- a/src/common/backend/utils/init/globals.cpp +++ b/src/common/backend/utils/init/globals.cpp @@ -59,7 +59,7 @@ bool open_join_children = true; bool will_shutdown = false; /* hard-wired binary version number */ -const uint32 GRAND_VERSION_NUM = 92308; +const uint32 GRAND_VERSION_NUM = 92309; const uint32 MATVIEW_VERSION_NUM = 92213; const uint32 PARTIALPUSH_VERSION_NUM = 92087; @@ -79,6 +79,7 @@ const uint32 ML_OPT_MODEL_VERSION_NUM = 92284; const uint32 FIX_SQL_ADD_RELATION_REF_COUNT = 92291; const uint32 GENERATED_COL_VERSION_NUM = 92303; const uint32 ANALYZER_HOOK_VERSION_NUM = 92306; +const uint32 SUPPORT_HASH_XLOG_VERSION_NUM = 92309; /* This variable indicates wheather the instance is in progress of upgrade as a whole */ uint32 volatile WorkingGrandVersionNum = GRAND_VERSION_NUM; diff --git a/src/common/backend/utils/resowner/resowner.cpp b/src/common/backend/utils/resowner/resowner.cpp index 6599642e8..2bc0f262e 100644 --- 
a/src/common/backend/utils/resowner/resowner.cpp +++ b/src/common/backend/utils/resowner/resowner.cpp @@ -389,9 +389,6 @@ static void ResourceOwnerReleaseInternal( MemoryContextDelete(memContext); ResourceOwnerForgetGMemContext(t_thrd.utils_cxt.TopTransactionResourceOwner, memContext); } - - /* Clean up index scans too */ - ReleaseResources_hash(); } /* Let add-on modules get a chance too */ diff --git a/src/common/backend/utils/sort/tuplesort.cpp b/src/common/backend/utils/sort/tuplesort.cpp index 764c4d2d9..1a88db43e 100644 --- a/src/common/backend/utils/sort/tuplesort.cpp +++ b/src/common/backend/utils/sort/tuplesort.cpp @@ -109,6 +109,7 @@ #include #include "access/nbtree.h" +#include "access/hash.h" #include "access/tableam.h" #include "catalog/index.h" #include "commands/tablespace.h" @@ -389,6 +390,7 @@ struct Tuplesortstate { * These variables are specific to the IndexTuple case; they are set by * tuplesort_begin_index_xxx and used only by the IndexTuple routines. */ + Relation heapRel; /* table the index is being built on */ Relation indexRel; /* index being built */ /* These are specific to the index_btree subcase: */ @@ -396,7 +398,9 @@ struct Tuplesortstate { bool enforceUnique; /* complain if we find duplicate tuples */ /* These are specific to the index_hash subcase: */ - uint32 hash_mask; /* mask for sortable part of hash code */ + uint32 high_mask; /* masks for sortable part of hash code */ + uint32 low_mask; + uint32 max_buckets; /* * These variables are specific to the Datum case; they are set by @@ -917,7 +921,8 @@ Tuplesortstate* tuplesort_begin_index_btree( } Tuplesortstate* tuplesort_begin_index_hash( - Relation indexRel, uint32 hash_mask, int workMem, bool randomAccess, int maxMem) + Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask, + uint32 max_buckets, int workMem, bool randomAccess, int maxMem) { Tuplesortstate* state = tuplesort_begin_common(workMem, randomAccess); MemoryContext oldcontext; @@ -927,11 +932,12 @@ Tuplesortstate* tuplesort_begin_index_hash( #ifdef TRACE_SORT if (u_sess->attr.attr_common.trace_sort) { elog(LOG, - "begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c, maxMem = %d", - hash_mask, - workMem, - randomAccess ? 't' : 'f', - maxMem); + "begin index sort: high_mask = 0x%x, low_mask = 0x%x, " + "max_buckets = 0x%x, workMem = %d, randomAccess = %c", + high_mask, + low_mask, + max_buckets, + workMem, randomAccess ? 't' : 'f'); } #endif @@ -946,9 +952,12 @@ Tuplesortstate* tuplesort_begin_index_hash( #endif state->reversedirection = reversedirection_index_hash; + state->heapRel = heapRel; state->indexRel = indexRel; - state->hash_mask = hash_mask; + state->high_mask = high_mask; + state->low_mask = low_mask; + state->max_buckets = max_buckets; state->maxMem = maxMem * 1024L; (void)MemoryContextSwitchTo(oldcontext); @@ -3610,8 +3619,8 @@ static int comparetup_index_btree(const SortTuple* a, const SortTuple* b, Tuples static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tuplesortstate* state) { - uint32 hash1; - uint32 hash2; + Bucket bucket1; + Bucket bucket2; IndexTuple tuple1; IndexTuple tuple2; @@ -3620,13 +3629,17 @@ static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tupleso * that the first column of the index tuple is the hash key. 
*/ Assert(!a->isnull1); - hash1 = DatumGetUInt32(a->datum1) & state->hash_mask; + bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1), + state->max_buckets, state->high_mask, + state->low_mask); Assert(!b->isnull1); - hash2 = DatumGetUInt32(b->datum1) & state->hash_mask; + bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1), + state->max_buckets, state->high_mask, + state->low_mask); - if (hash1 > hash2) { + if (bucket1 > bucket2) { return 1; - } else if (hash1 < hash2) { + } else if (bucket1 < bucket2) { return -1; } diff --git a/src/gausskernel/storage/access/hash/Makefile b/src/gausskernel/storage/access/hash/Makefile index 0ca2fdd60..c9b9e8c83 100644 --- a/src/gausskernel/storage/access/hash/Makefile +++ b/src/gausskernel/storage/access/hash/Makefile @@ -9,7 +9,7 @@ ifneq "$(MAKECMDGOALS)" "clean" endif endif endif -OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \ - hashsearch.o hashsort.o hashutil.o +OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o\ + hashsort.o hashutil.o hash_xlog.o include $(top_srcdir)/src/gausskernel/common.mk diff --git a/src/gausskernel/storage/access/hash/README b/src/gausskernel/storage/access/hash/README index da68545e2..be5491be4 100644 --- a/src/gausskernel/storage/access/hash/README +++ b/src/gausskernel/storage/access/hash/README @@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to move primary bucket pages around after they are created. Primary bucket pages (henceforth just "bucket pages") are allocated in -power-of-2 groups, called "split points" in the code. Buckets 0 and 1 -are created when the index is initialized. At the first split, buckets 2 -and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated; -when bucket 8 is needed, buckets 8-15 are allocated; etc. All the bucket -pages of a power-of-2 group appear consecutively in the index. This -addressing scheme allows the physical location of a bucket page to be -computed from the bucket number relatively easily, using only a small -amount of control information. We take the log2() of the bucket number -to determine which split point S the bucket belongs to, and then simply -add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the -metapage) to compute the physical address. hashm_spares[S] can be -interpreted as the total number of overflow pages that have been allocated -before the bucket pages of splitpoint S. hashm_spares[0] is always 0, -so that buckets 0 and 1 (which belong to splitpoint 0) always appear at -block numbers 1 and 2, just after the meta page. We always have -hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the -former. The difference between the two represents the number of overflow -pages appearing between the bucket page groups of splitpoints N and N+1. - +power-of-2 groups, called "split points" in the code. That means at every new +splitpoint we double the existing number of buckets. Allocating huge chunks +of bucket pages all at once isn't optimal and we will take ages to consume +those. To avoid this exponential growth of index size, we did use a trick to +break up allocation of buckets at the splitpoint into 4 equal phases. If +(2 ^ x) are the total buckets need to be allocated at a splitpoint (from now on +we shall call this as a splitpoint group), then we allocate 1/4th (2 ^ (x - 2)) +of total buckets at each phase of splitpoint group. 
Next quarter of allocation +will only happen if buckets of the previous phase have been already consumed. +For the initial splitpoint groups < 10 we will allocate all of their buckets in +single phase only, as number of buckets allocated at initial groups are small +in numbers. And for the groups >= 10 the allocation process is distributed +among four equal phases. At group 10 we allocate (2 ^ 9) buckets in 4 +different phases {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, the numbers in curly braces +indicate the number of buckets allocated within each phase of splitpoint group +10. And, for splitpoint group 11 and 12 allocation phases will be +{2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9} respectively. We +can see that at each splitpoint group we double the total number of buckets +from the previous group but in an incremental phase. The bucket pages +allocated within one phase of a splitpoint group will appear consecutively in +the index. This addressing scheme allows the physical location of a bucket +page to be computed from the bucket number relatively easily, using only a +small amount of control information. If we look at the function +_hash_spareindex for a given bucket number we first compute the +splitpoint group it belongs to and then the phase to which the bucket belongs +to. Adding them we get the global splitpoint phase number S to which the +bucket belongs and then simply add "hashm_spares[S] + 1" (where hashm_spares[] +is an array stored in the metapage) with given bucket number to compute its +physical address. The hashm_spares[S] can be interpreted as the total number +of overflow pages that have been allocated before the bucket pages of +splitpoint phase S. The hashm_spares[0] is always 0, so that buckets 0 and 1 +always appear at block numbers 1 and 2, just after the meta page. We always +have hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the +former. The difference between the two represents the number of overflow pages +appearing between the bucket page groups of splitpoints phase N and N+1. (Note: the above describes what happens when filling an initially minimally -sized hash index. In practice, we try to estimate the required index size -and allocate a suitable number of splitpoints immediately, to avoid +sized hash index. In practice, we try to estimate the required index size and +allocate a suitable number of splitpoints phases immediately, to avoid expensive re-splitting during initial index build.) When S splitpoints exist altogether, the array entries hashm_spares[0] through hashm_spares[S] are valid; hashm_spares[S] records the current total number of overflow pages. New overflow pages are created as needed at the end of the index, and recorded by incrementing hashm_spares[S]. -When it is time to create a new splitpoint's worth of bucket pages, we +When it is time to create a new splitpoint phase's worth of bucket pages, we copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is stored in the hashm_ovflpoint field of the meta page). This has the effect of reserving the correct number of bucket pages at the end of the @@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during an index extension we crash after allocating filesystem space and before updating the metapage. 
Note that on filesystems that allow "holes" in files, it's entirely likely that pages before the logical EOF are not yet -allocated: when we allocate a new splitpoint's worth of bucket pages, we +allocated: when we allocate a new splitpoint phase's worth of bucket pages, we physically zero the last such page to force the EOF up, and the first such page will be used immediately, but the intervening pages are not written until needed. @@ -126,61 +142,98 @@ the initially created buckets. Lock Definitions ---------------- -We use both lmgr locks ("heavyweight" locks) and buffer context locks -(LWLocks) to control access to a hash index. lmgr locks are needed for -long-term locking since there is a (small) risk of deadlock, which we must -be able to detect. Buffer context locks are used for short-term access -control to individual pages of the index. +Concurrency control for hash indexes is provided using buffer content +locks, buffer pins, and cleanup locks. Here as elsewhere in PostgreSQL, +cleanup lock means that we hold an exclusive lock on the buffer and have +observed at some point after acquiring the lock that we hold the only pin +on that buffer. For hash indexes, a cleanup lock on a primary bucket page +represents the right to perform an arbitrary reorganization of the entire +bucket. Therefore, scans retain a pin on the primary bucket page for the +bucket they are currently scanning. Splitting a bucket requires a cleanup +lock on both the old and new primary bucket pages. VACUUM therefore takes +a cleanup lock on every bucket page in order to remove tuples. It can also +remove tuples copied to a new bucket by any previous split operation, because +the cleanup lock taken on the primary bucket page guarantees that no scans +which started prior to the most recent split can still be in progress. After +cleaning each page individually, it attempts to take a cleanup lock on the +primary bucket page in order to "squeeze" the bucket down to the minimum +possible number of pages. -We define the following lmgr locks for a hash index: +To avoid deadlocks, we must be consistent about the lock order in which we +lock the buckets for operations that requires locks on two different buckets. +We choose to always lock the lower-numbered bucket first. The metapage is +only ever locked after all bucket locks have been taken. -LockPage(rel, 0) represents the right to modify the hash-code-to-bucket -mapping. A process attempting to enlarge the hash table by splitting a -bucket must exclusive-lock this lock before modifying the metapage data -representing the mapping. Processes intending to access a particular -bucket must share-lock this lock until they have acquired lock on the -correct target bucket. -LockPage(rel, page), where page is the page number of a hash bucket page, -represents the right to split or compact an individual bucket. A process -splitting a bucket must exclusive-lock both old and new halves of the -bucket until it is done. A process doing VACUUM must exclusive-lock the -bucket it is currently purging tuples from. Processes doing scans or -insertions must share-lock the bucket they are scanning or inserting into. -(It is okay to allow concurrent scans and insertions.) +Metapage Caching +---------------- -The lmgr lock IDs corresponding to overflow pages are currently unused. -These are available for possible future refinements. +Both scanning the index and inserting tuples require locating the bucket +where a given tuple ought to be located. 
To do this, we need the bucket +count, highmask, and lowmask from the metapage; however, it's undesirable +for performance reasons to have to lock and pin the metapage for +every such operation. Instead, we retain a cached copy of the metapage +in each backend's relcache entry. This will produce the correct +bucket mapping as long as the target bucket hasn't been split since the +last cache refresh. -Note that these lock definitions are conceptually distinct from any sort -of lock on the pages whose numbers they share. A process must also obtain -read or write buffer lock on the metapage or bucket page before accessing -said page. +To guard against the possibility that such a split has occurred, the +primary page of each bucket chain stores the number of buckets that +existed as of the time the bucket was last split, or if never split as +of the time it was created, in the space normally used for the +previous block number (that is, hasho_prevblkno). This doesn't cost +anything because the primary bucket page is always the first page in +the chain, and the previous block number is therefore always, in +reality, InvalidBlockNumber. -Processes performing hash index scans must hold share lock on the bucket -they are scanning throughout the scan. This seems to be essential, since -there is no reasonable way for a scan to cope with its bucket being split -underneath it. This creates a possibility of deadlock external to the -hash index code, since a process holding one of these locks could block -waiting for an unrelated lock held by another process. If that process -then does something that requires exclusive lock on the bucket, we have -deadlock. Therefore the bucket locks must be lmgr locks so that deadlock -can be detected and recovered from. This also forces the page-zero lock -to be an lmgr lock, because as we'll see below it is held while attempting -to acquire a bucket lock, and so it could also participate in a deadlock. +After computing the ostensibly-correct bucket number based on our cached +copy of the metapage, we lock the corresponding primary bucket page and +check whether the bucket count stored in hasho_prevblkno is greater than +the number of buckets stored in our cached copy of the metapage. If +so, the bucket has certainly been split, because the count must originally +have been less than the number of buckets that existed at that time and +can't have increased except due to a split. If not, the bucket can't have +been split, because a split would have created a new bucket with a higher +bucket number than any we'd seen previously. In the latter case, we've +locked the correct bucket and can proceed; in the former case, we must +release the lock on this bucket, lock the metapage, update our cache, +unlock the metapage, and retry.
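To make the cached-metapage lookup concrete, here is a minimal, self-contained C sketch. The type and function names are invented for the example; only hasho_prevblkno, the maxbucket/highmask/lowmask fields, and the comparison rule described above are taken from the design, so treat it as an illustration rather than the kernel code.

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint32_t Bucket;

    /* Map a hash key to a bucket number using the cached metapage fields. */
    static Bucket
    hashkey_to_bucket(uint32_t hashkey, uint32_t maxbucket,
                      uint32_t highmask, uint32_t lowmask)
    {
        Bucket bucket = hashkey & highmask;

        if (bucket > maxbucket)
            bucket = bucket & lowmask;   /* that bucket doesn't exist yet; wrap to the smaller table */
        return bucket;
    }

    /*
     * After locking the primary bucket page: decide whether the cached copy of
     * the metapage is stale.  bucket_count_at_split stands for the value the
     * bucket page keeps in hasho_prevblkno.
     */
    static bool
    cached_metapage_is_stale(uint32_t bucket_count_at_split, uint32_t cached_maxbucket)
    {
        /* a later split stored a bucket count larger than anything we cached */
        return bucket_count_at_split > cached_maxbucket;
    }

If the check reports a stale cache, the caller releases the bucket lock, refreshes the cache from the metapage, and repeats the mapping.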
+Needing to retry occasionally might seem expensive, but the number of times +any given bucket can be split is limited to a few dozen no matter how +many times the hash index is accessed, because the total number of +buckets is limited to less than 2^32. On the other hand, the number of +times we access a bucket is unbounded and will be several orders of +magnitude larger even in unsympathetic cases. +(The metapage cache is new in v10. Older hash indexes had the primary +bucket page's hasho_prevblkno initialized to InvalidBuffer.) Pseudocode Algorithms --------------------- +Various flags that are used in hash index operations are described below: + +The bucket-being-split and bucket-being-populated flags indicate that a split +operation is in progress for a bucket. During a split operation, the +bucket-being-split flag is set on the old bucket and the bucket-being-populated +flag is set on the new bucket. These flags are cleared once the split operation +is finished. + +The split-cleanup flag indicates that a bucket which has been recently split +still contains tuples that were also copied to the new bucket; it essentially +marks the split as incomplete. Once we're certain that no scans which +started before the new bucket was fully populated are still in progress, we +can remove the copies from the old bucket and clear the flag. We insist that +this flag must be clear before splitting a bucket; thus, a bucket can't be +split again until the previous split is totally complete. + +The moved-by-split flag on a tuple indicates that the tuple was moved from the +old to the new bucket. Concurrent scans will skip such tuples until the split +operation is finished. Once the tuple is marked as moved-by-split, it will +remain so forever, but that does no harm. We intentionally do not clear it, +since doing so would generate additional, unnecessary I/O. + The operations we need to support are: readers scanning the index for entries of a particular hash code (which by definition are all in the same bucket); insertion of a new tuple into the correct bucket; enlarging the @@ -195,57 +248,75 @@ track of available overflow pages.
The reader algorithm is: - share-lock page 0 (to prevent active split) - read/sharelock meta page - compute bucket number for target hash key - release meta page - share-lock bucket page (to prevent split/compact of this bucket) - release page 0 share-lock + lock the primary bucket page of the target bucket + if the target bucket is still being populated by a split: + release the buffer content lock on current bucket page + pin and acquire the buffer content lock on old bucket in shared mode + release the buffer content lock on old bucket, but not pin + retake the buffer content lock on new bucket + arrange to scan the old bucket normally and the new bucket for + tuples which are not moved-by-split -- then, per read request: - read/sharelock current page of bucket - step to next page if necessary (no chaining of locks) + reacquire content lock on current page + step to next page if necessary (no chaining of content locks, but keep + the pin on the primary bucket throughout the scan; we also maintain + a pin on the page currently being scanned) get tuple - release current page + release content lock -- at scan shutdown: - release bucket share-lock + release all pins still held -By holding the page-zero lock until lock on the target bucket is obtained, -the reader ensures that the target bucket calculation is valid (otherwise -the bucket might be split before the reader arrives at it, and the target -entries might go into the new bucket). Holding the bucket sharelock for -the remainder of the scan prevents the reader's current-tuple pointer from -being invalidated by splits or compactions. Notice that the reader's lock -does not prevent other buckets from being split or compacted. +Holding the buffer pin on the primary bucket page for the whole scan prevents +the reader's current-tuple pointer from being invalidated by splits or +compactions. (Of course, other buckets can still be split or compacted.) To keep concurrency reasonably good, we require readers to cope with concurrent insertions, which means that they have to be able to re-find -their current scan position after re-acquiring the page sharelock. Since -deletion is not possible while a reader holds the bucket sharelock, and -we assume that heap tuple TIDs are unique, this can be implemented by +their current scan position after re-acquiring the buffer content lock on +the page. Since deletion is not possible while a reader holds the pin on the +bucket, and we assume that heap tuple TIDs are unique, this can be implemented by searching for the same heap tuple TID previously returned. Insertion does not move index entries across pages, so the previously-returned index entry should always be on the same page, at the same or higher offset number, as it was before. +To allow for scans during a bucket split, if at the start of the scan the +bucket is marked as bucket-being-populated, it scans all the tuples in that +bucket except for those that are marked as moved-by-split. Once it finishes +the scan of all the tuples in the current bucket, it scans the old bucket from +which this bucket was formed by the split.
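To make the split-aware scan concrete, here is a toy, self-contained C model of the behaviour just described. The array-based types and names are invented for illustration; the real scan walks index pages and heap TIDs rather than in-memory arrays.

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct {
        int  key;
        bool moved_by_split;          /* set on tuples copied in by an in-progress split */
    } EntryModel;

    typedef struct {
        EntryModel entries[8];
        int        nentries;
        bool       being_populated;   /* bucket-being-populated flag */
    } BucketModel;

    /* Emit what a scan positioned on 'target' would return. */
    static void
    scan_bucket_during_split(const BucketModel *target, const BucketModel *old_bucket)
    {
        bool split_in_flight = target->being_populated;

        for (int i = 0; i < target->nentries; i++) {
            if (split_in_flight && target->entries[i].moved_by_split)
                continue;             /* skip the half-moved copies for now */
            printf("return key %d from the target bucket\n", target->entries[i].key);
        }

        /* pick the skipped tuples up from the bucket this one was split from */
        if (split_in_flight && old_bucket != NULL) {
            for (int i = 0; i < old_bucket->nentries; i++)
                printf("return key %d from the old bucket\n", old_bucket->entries[i].key);
        }
    }

Because the moved-by-split copies are skipped in the new bucket and read from the old bucket instead, a scan that starts while the split is in flight still sees each tuple exactly once.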
+ The insertion algorithm is rather similar: - share-lock page 0 (to prevent active split) - read/sharelock meta page - compute bucket number for target hash key - release meta page - share-lock bucket page (to prevent split/compact of this bucket) - release page 0 share-lock --- (so far same as reader) - read/exclusive-lock current page of bucket - if full, release, read/exclusive-lock next page; repeat as needed + lock the primary bucket page of the target bucket +-- (so far same as reader, except for acquisition of buffer content lock in + exclusive mode on primary bucket page) + if the bucket-being-split flag is set for a bucket and pin count on it is + one, then finish the split + release the buffer content lock on current bucket + get the "new" bucket which was being populated by the split + scan the new bucket and form the hash table of TIDs + conditionally get the cleanup lock on old and new buckets + if we get the lock on both the buckets + finish the split using algorithm mentioned below for split + release the pin on old bucket and restart the insert from beginning. + if current page is full, first check if this page contains any dead tuples. + if yes, remove dead tuples from the current page and again check for the + availability of the space. If enough space found, insert the tuple else + release lock but not pin, read/exclusive-lock + next page; repeat as needed >> see below if no space in any page of bucket + take buffer content lock in exclusive mode on metapage insert tuple at appropriate place in page - write/release current page - release bucket share-lock - read/exclusive-lock meta page + mark current page dirty increment tuple count, decide if split needed - write/release meta page - done if no split needed, else enter Split algorithm below + mark meta page dirty + write WAL for insertion of tuple + release the buffer content lock on metapage + release buffer content lock on current page + if current page is not a bucket page, release the pin on bucket page + if split is needed, enter Split algorithm below + release the pin on metapage To speed searches, the index entries within any individual index page are kept sorted by hash code; the insertion code must take care to insert new @@ -254,11 +325,13 @@ bucket that is being actively scanned, because readers can cope with this as explained above. We only need the short-term buffer locks to ensure that readers do not see a partially-updated page. -It is clearly impossible for readers and inserters to deadlock, and in -fact this algorithm allows them a very high degree of concurrency. -(The exclusive metapage lock taken to update the tuple count is stronger -than necessary, since readers do not care about the tuple count, but the -lock is held for such a short time that this is probably not an issue.) +To avoid deadlock between readers and inserters, whenever there is a need +to lock multiple buckets, we always take in the order suggested in Lock +Definitions above. This algorithm allows them a very high degree of +concurrency. (The exclusive metapage lock taken to update the tuple count +is stronger than necessary, since readers do not care about the tuple count, +but the lock is held for such a short time that this is probably not an +issue.) When an inserter cannot find space in any existing page of a bucket, it must obtain an overflow page and add that page to the bucket's chain. @@ -269,82 +342,95 @@ index is overfull (has a higher-than-wanted ratio of tuples to buckets). 
The algorithm attempts, but does not necessarily succeed, to split one existing bucket in two, thereby lowering the fill ratio: - exclusive-lock page 0 (assert the right to begin a split) - read/exclusive-lock meta page - check split still needed - if split not needed anymore, drop locks and exit - decide which bucket to split - Attempt to X-lock old bucket number (definitely could fail) - Attempt to X-lock new bucket number (shouldn't fail, but...) - if above fail, drop locks and exit - update meta page to reflect new number of buckets - write/release meta page - release X-lock on page 0 - -- now, accesses to all other buckets can proceed. - Perform actual split of bucket, moving tuples as needed - >> see below about acquiring needed extra space - Release X-locks of old and new buckets + pin meta page and take buffer content lock in exclusive mode + check split still needed + if split not needed anymore, drop buffer content lock and pin and exit + decide which bucket to split + try to take a cleanup lock on that bucket; if fail, give up + if that bucket is still being split or has split-cleanup work: + try to finish the split and the cleanup work + if that succeeds, start over; if it fails, give up + mark the old and new buckets indicating split is in progress + mark both old and new buckets as dirty + write WAL for allocation of new page for split + copy the tuples that belongs to new bucket from old bucket, marking + them as moved-by-split + write WAL record for moving tuples to new page once the new page is full + or all the pages of old bucket are finished + release lock but not pin for primary bucket page of old bucket, + read/shared-lock next page; repeat as needed + clear the bucket-being-split and bucket-being-populated flags + mark the old bucket indicating split-cleanup + write WAL for changing the flags on both old and new buckets -Note the page zero and metapage locks are not held while the actual tuple -rearrangement is performed, so accesses to other buckets can proceed in -parallel; in fact, it's possible for multiple bucket splits to proceed -in parallel. - -Split's attempt to X-lock the old bucket number could fail if another -process holds S-lock on it. We do not want to wait if that happens, first -because we don't want to wait while holding the metapage exclusive-lock, -and second because it could very easily result in deadlock. (The other -process might be out of the hash AM altogether, and could do something -that blocks on another lock this process holds; so even if the hash -algorithm itself is deadlock-free, a user-induced deadlock could occur.) -So, this is a conditional LockAcquire operation, and if it fails we just -abandon the attempt to split. This is all right since the index is -overfull but perfectly functional. Every subsequent inserter will try to -split, and eventually one will succeed. If multiple inserters failed to -split, the index might still be overfull, but eventually, the index will +The split operation's attempt to acquire cleanup-lock on the old bucket number +could fail if another process holds any lock or pin on it. We do not want to +wait if that happens, because we don't want to wait while holding the metapage +exclusive-lock. So, this is a conditional LWLockAcquire operation, and if +it fails we just abandon the attempt to split. This is all right since the +index is overfull but perfectly functional. Every subsequent inserter will +try to split, and eventually one will succeed. 
If multiple inserters failed +to split, the index might still be overfull, but eventually, the index will not be overfull and split attempts will stop. (We could make a successful splitter loop to see if the index is still overfull, but it seems better to distribute the split overhead across successive insertions.) -A problem is that if a split fails partway through (eg due to insufficient -disk space) the index is left corrupt. The probability of that could be -made quite low if we grab a free page or two before we update the meta -page, but the only real solution is to treat a split as a WAL-loggable, -must-complete action. I'm not planning to teach hash about WAL in this -go-round. +If a split fails partway through (e.g. due to insufficient disk space or an +interrupt), the index will not be corrupted. Instead, we'll retry the split +every time a tuple is inserted into the old bucket prior to inserting the new +tuple; eventually, we should succeed. The fact that a split is left +unfinished doesn't prevent subsequent buckets from being split, but we won't +try to split the bucket again until the prior split is finished. In other +words, a bucket can be in the middle of being split for some time, but it can't +be in the middle of two splits at the same time. The fourth operation is garbage collection (bulk deletion): next bucket := 0 - read/sharelock meta page + pin metapage and take buffer content lock in exclusive mode fetch current max bucket number - release meta page + release meta page buffer content lock and pin while next bucket <= max bucket do - Acquire X lock on target bucket - Scan and remove tuples, compact free space as needed - Release X lock + acquire cleanup lock on primary bucket page + loop: + scan and remove tuples + mark the target page dirty + write WAL for deleting tuples from target page + if this is the last bucket page, break out of loop + pin and x-lock next page + release prior lock and pin (except keep pin on primary bucket page) + if the page we have locked is not the primary bucket page: + release lock and take exclusive lock on primary bucket page + if there are no other pins on the primary bucket page: + squeeze the bucket to remove free space + release the pin on primary bucket page next bucket ++ end loop - exclusive-lock meta page + pin metapage and take buffer content lock in exclusive mode check if number of buckets changed - if so, release lock and return to for-each-bucket loop + if so, release content lock and pin and return to for-each-bucket loop else update metapage tuple count - write/release meta page + mark meta page dirty and write WAL for update of metapage + release buffer content lock and pin -Note that this is designed to allow concurrent splits. If a split occurs, -tuples relocated into the new bucket will be visited twice by the scan, -but that does no harm. (We must however be careful about the statistics +Note that this is designed to allow concurrent splits and scans. If a split +occurs, tuples relocated into the new bucket will be visited twice by the +scan, but that does no harm. As we release the lock on bucket page during +cleanup scan of a bucket, it will allow concurrent scan to start on a bucket +and ensures that scan will always be behind cleanup. It is must to keep scans +behind cleanup, else vacuum could decrease the TIDs that are required to +complete the scan. 
Now, as the scan that returns multiple tuples from the +same bucket page always expect next valid TID to be greater than or equal to +the current TID, it might miss the tuples. This holds true for backward scans +as well (backward scans first traverse each bucket starting from first bucket +to last overflow page in the chain). We must be careful about the statistics reported by the VACUUM operation. What we can do is count the number of -tuples scanned, and believe this in preference to the stored tuple count -if the stored tuple count and number of buckets did *not* change at any -time during the scan. This provides a way of correcting the stored tuple -count if it gets out of sync for some reason. But if a split or insertion -does occur concurrently, the scan count is untrustworthy; instead, -subtract the number of tuples deleted from the stored tuple count and -use that.) - -The exclusive lock request could deadlock in some strange scenarios, but -we can just error out without any great harm being done. +tuples scanned, and believe this in preference to the stored tuple count if +the stored tuple count and number of buckets did *not* change at any time +during the scan. This provides a way of correcting the stored tuple count if +it gets out of sync for some reason. But if a split or insertion does occur +concurrently, the scan count is untrustworthy; instead, subtract the number of +tuples deleted from the stored tuple count and use that. Free Space Management @@ -360,25 +446,23 @@ overflow page to the free pool. Obtaining an overflow page: - read/exclusive-lock meta page + take metapage content lock in exclusive mode determine next bitmap page number; if none, exit loop - release meta page lock - read/exclusive-lock bitmap page + release meta page content lock + pin bitmap page and take content lock in exclusive mode search for a free page (zero bit in bitmap) if found: set bit in bitmap - write/release bitmap page - read/exclusive-lock meta page + mark bitmap page dirty + take metapage buffer content lock in exclusive mode if first-free-bit value did not change, - update it and write meta page - release meta page - return page number + update it and mark meta page dirty else (not found): - release bitmap page + release bitmap page buffer content lock loop back to try next bitmap page, if any -- here when we have checked all bitmap pages; we hold meta excl. lock extend index to add another overflow page; update meta information - write/release meta page + mark meta page dirty return page number It is slightly annoying to release and reacquire the metapage lock @@ -398,12 +482,17 @@ like this: -- having determined that no space is free in the target bucket: remember last page of bucket, drop write lock on it - call free-page-acquire routine re-write-lock last page of bucket if it is not last anymore, step to the last page - update (former) last page to point to new page + execute free-page-acquire (obtaining an overflow page) mechanism + described above + update (former) last page to point to the new page and mark buffer dirty write-lock and initialize new page, with back link to former last page - write and release former last page + write WAL for addition of overflow page + release the locks on meta page and bitmap page acquired in + free-page-acquire algorithm + release the lock on former last page + release the lock on new overflow page insert tuple into new page -- etc. @@ -418,27 +507,27 @@ free page; there can be no other process holding lock on it. 
Bucket splitting uses a similar algorithm if it has to extend the new bucket, but it need not worry about concurrent extension since it has -exclusive lock on the new bucket. +buffer content lock in exclusive mode on the new bucket. -Freeing an overflow page is done by garbage collection and by bucket -splitting (the old bucket may contain no-longer-needed overflow pages). -In both cases, the process holds exclusive lock on the containing bucket, -so need not worry about other accessors of pages in the bucket. The -algorithm is: +Freeing an overflow page requires the process to hold buffer content lock in +exclusive mode on the containing bucket, so it need not worry about other +accessors of pages in the bucket. The algorithm is: delink overflow page from bucket chain (this requires read/update/write/release of fore and aft siblings) - read/share-lock meta page + pin meta page and take buffer content lock in shared mode determine which bitmap page contains the free space bit for page - release meta page + release meta page buffer content lock - read/exclusive-lock bitmap page + pin bitmap page and take buffer content lock in exclusive mode + retake meta page buffer content lock in exclusive mode + move (insert) tuples that belong to the overflow page being freed update bitmap bit - write/release bitmap page - if page number is less than what we saw as first-free-bit in meta: - read/exclusive-lock meta page + mark bitmap page dirty if page number is still less than first-free-bit, - update first-free-bit field and write meta page - release meta page + update first-free-bit field and mark meta page dirty + write WAL for delinking overflow page operation + release buffer content lock and pin + release meta page buffer content lock and pin We have to do it this way because we must clear the bitmap bit before changing the first-free-bit field (hashm_firstfree). It is possible that @@ -448,21 +537,96 @@ page acquirer will scan more bitmap bits than he needs to. What must be avoided is having first-free-bit greater than the actual first free bit, because then that free page would never be found by searchers. -All the freespace operations should be called while holding no buffer -locks. Since they need no lmgr locks, deadlock is not possible. +The reason for moving tuples from the overflow page while delinking the latter +is to make that a single atomic operation. Not doing so could lead to spurious +reads on standby. Basically, the user might see the same tuple twice. + + +WAL Considerations +------------------ + +The hash index operations like create index, insert, delete, bucket split, +allocate overflow page, and squeeze in themselves don't guarantee hash index +consistency after a crash. To provide robustness, we write WAL for each of +these operations. + +CREATE INDEX writes multiple WAL records. First, we write a record to cover +the initialization of the metapage, followed by one for each new bucket +created, followed by one for the initial bitmap page. It's not important for +index creation to appear atomic, because the index isn't yet visible to any +other transaction, and the creating transaction will roll back in the event of +a crash. It would be difficult to cover the whole operation with a single +write-ahead log record anyway, because we can log only a fixed number of +pages, as given by XLR_MAX_BLOCK_ID (32), with current XLog machinery. + +Ordinary item insertions (that don't force a page split or need a new overflow +page) are single WAL entries.
They touch a single bucket page and the +metapage. The metapage is updated during replay as it is updated during +original operation. + +If an insertion causes the addition of an overflow page, there will be one +WAL entry for the new overflow page and second entry for insert itself. + +If an insertion causes a bucket split, there will be one WAL entry for insert +itself, followed by a WAL entry for allocating a new bucket, followed by a WAL +entry for each overflow bucket page in the new bucket to which the tuples are +moved from old bucket, followed by a WAL entry to indicate that split is +complete for both old and new buckets. A split operation which requires +overflow pages to complete the operation will need to write a WAL record for +each new allocation of an overflow page. + +As splitting involves multiple atomic actions, it's possible that the system +crashes between moving tuples from bucket pages of the old bucket to new +bucket. In such a case, after recovery, the old and new buckets will be +marked with bucket-being-split and bucket-being-populated flags respectively +which indicates that split is in progress for those buckets. The reader +algorithm works correctly, as it will scan both the old and new buckets when +the split is in progress as explained in the reader algorithm section above. + +We finish the split at next insert or split operation on the old bucket as +explained in insert and split algorithm above. It could be done during +searches, too, but it seems best not to put any extra updates in what would +otherwise be a read-only operation (updating is not possible in hot standby +mode anyway). It would seem natural to complete the split in VACUUM, but since +splitting a bucket might require allocating a new page, it might fail if you +run out of disk space. That would be bad during VACUUM - the reason for +running VACUUM in the first place might be that you run out of disk space, +and now VACUUM won't finish because you're out of disk space. In contrast, +an insertion can require enlarging the physical file anyway. + +Deletion of tuples from a bucket is performed for two reasons: to remove dead +tuples, and to remove tuples that were moved by a bucket split. A WAL entry +is made for each bucket page from which tuples are removed, and then another +WAL entry is made when we clear the needs-split-cleanup flag. If dead tuples +are removed, a separate WAL entry is made to update the metapage. + +As deletion involves multiple atomic operations, it is quite possible that +system crashes after (a) removing tuples from some of the bucket pages, (b) +before clearing the garbage flag, or (c) before updating the metapage. If the +system crashes before completing (b), it will again try to clean the bucket +during next vacuum or insert after recovery which can have some performance +impact, but it will work fine. If the system crashes before completing (c), +after recovery there could be some additional splits until the next vacuum +updates the metapage, but the other operations like insert, delete and scan +will work correctly. We can fix this problem by actually updating the +metapage based on delete operation during replay, but it's not clear whether +it's worth the complication. + +A squeeze operation moves tuples from one of the buckets later in the chain to +one of the bucket earlier in chain and writes WAL record when either the +bucket to which it is writing tuples is filled or bucket from which it +is removing the tuples becomes empty. 
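To illustrate the batching rule above, here is a toy, self-contained C model. The page capacity, the names, and emit_wal_record() are invented stand-ins; the real squeeze code moves index tuples between buffer pages and logs them with the new hash WAL records, but the record boundary follows the same rule.

    #include <stdio.h>

    #define PAGE_CAPACITY 4           /* arbitrary, for the model only */

    typedef struct {
        int ntuples;
    } PageModel;

    /* stand-in for writing one WAL record covering a batch of moved tuples */
    static void
    emit_wal_record(int moved)
    {
        printf("WAL: logged a batch of %d moved tuples\n", moved);
    }

    /*
     * Move tuples from the tail page being drained ('rpage') to an earlier page
     * with free space ('wpage'); a record is emitted exactly when the write
     * page fills up or the read page becomes empty.
     */
    static void
    squeeze_step(PageModel *wpage, PageModel *rpage)
    {
        int moved = 0;

        while (rpage->ntuples > 0 && wpage->ntuples < PAGE_CAPACITY) {
            rpage->ntuples--;
            wpage->ntuples++;
            moved++;
        }
        if (moved > 0)
            emit_wal_record(moved);
    }

    int
    main(void)
    {
        PageModel wpage = { 1 };      /* earlier page in the chain, has free space */
        PageModel rpage = { 3 };      /* last overflow page being drained */

        squeeze_step(&wpage, &rpage);
        return 0;
    }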
+ +As a squeeze operation involves writing multiple atomic operations, it is +quite possible that the system crashes before completing the operation on +entire bucket. After recovery, the operations will work correctly, but +the index will remain bloated and this can impact performance of read and +insert operations until the next vacuum squeeze the bucket completely. Other Notes ----------- -All the shenanigans with locking prevent a split occurring while *another* -process is stopped in a given bucket. They do not ensure that one of -our *own* backend's scans is not stopped in the bucket, because lmgr -doesn't consider a process's own locks to conflict. So the Split -algorithm must check for that case separately before deciding it can go -ahead with the split. VACUUM does not have this problem since nothing -else can be happening within the vacuuming backend. - -Should we instead try to fix the state of any conflicting local scan? -Seems mighty ugly --- got to move the held bucket S-lock as well as lots -of other messiness. For now, just punt and don't split. +Clean up locks prevent a split from occurring while *another* process is stopped +in a given bucket. It also ensures that one of our *own* backend's scans is not +stopped in the bucket. diff --git a/src/gausskernel/storage/access/hash/hash.cpp b/src/gausskernel/storage/access/hash/hash.cpp index 915925d45..e02025876 100644 --- a/src/gausskernel/storage/access/hash/hash.cpp +++ b/src/gausskernel/storage/access/hash/hash.cpp @@ -3,8 +3,8 @@ * hash.cpp * Implementation of Margo Seltzer's Hashing package for postgres. * - * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -20,6 +20,8 @@ #include "knl/knl_variable.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/xloginsert.h" #include "access/tableam.h" #include "access/relscan.h" #include "catalog/index.h" @@ -34,6 +36,7 @@ typedef struct { HSpool *spool; /* NULL if not using spooling */ double indtuples; /* # tuples accepted into index */ + Relation heapRel; /* heap relation descriptor */ } HashBuildState; static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, const bool *isnull, bool tupleIsAlive, @@ -52,6 +55,7 @@ Datum hashbuild(PG_FUNCTION_ARGS) double reltuples; double allvisfrac; uint32 num_buckets; + long sort_threshold; HashBuildState buildstate; /* @@ -66,7 +70,7 @@ Datum hashbuild(PG_FUNCTION_ARGS) estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac, NULL); /* Initialize the hash index metadata page and initial buckets */ - num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM); + num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); /* * If we just insert the tuples into the index in scan order, then * (assuming their hash codes are pretty random) there will be no locality @@ -74,25 +78,38 @@ Datum hashbuild(PG_FUNCTION_ARGS) * then we'll thrash horribly. To prevent that scenario, we can sort the * tuples by (expected) bucket number. However, such a sort is useless * overhead when the index does fit in RAM. We choose to sort if the - * initial index size exceeds NBuffers. + * initial index size exceeds maintenance_work_mem, or the number of + * buffers usable for the index, whichever is less. 
(Limiting by the + * number of buffers should reduce thrashing between PG buffers and kernel + * buffers, which seems useful even if no physical I/O results. Limiting + * by maintenance_work_mem is useful to allow easy testing of the sort + * code path, and may be useful to DBAs as an additional control knob.) * * NOTE: this test will need adjustment if a bucket is ever different from - * one page. + * one page. Also, "initial index size" accounting does not include the + * metapage, nor the first bitmap page. */ - if (num_buckets >= (uint32)g_instance.attr.attr_storage.NBuffers) - buildstate.spool = _h_spoolinit(index, num_buckets, &indexInfo->ii_desc); + sort_threshold = (u_sess->attr.attr_memory.maintenance_work_mem * 1024L) / BLCKSZ; + if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP) + sort_threshold = Min(sort_threshold, g_instance.attr.attr_storage.NBuffers); + else + sort_threshold = Min(sort_threshold, u_sess->storage_cxt.NLocBuffer); + + if (num_buckets >= (uint32)sort_threshold) + buildstate.spool = _h_spoolinit(heap, index, num_buckets, &indexInfo->ii_desc); else buildstate.spool = NULL; /* prepare to build the index */ buildstate.indtuples = 0; + buildstate.heapRel = heap; /* do the heap scan */ reltuples = tableam_index_build_scan(heap, index, indexInfo, true, hashbuildCallback, (void*)&buildstate); if (buildstate.spool != NULL) { /* sort the tuples and insert them into the index */ - _h_indexbuild(buildstate.spool); + _h_indexbuild(buildstate.spool, buildstate.heapRel); _h_spooldestroy(buildstate.spool); } @@ -114,7 +131,7 @@ Datum hashbuildempty(PG_FUNCTION_ARGS) { Relation index = (Relation)PG_GETARG_POINTER(0); - _hash_metapinit(index, 0, INIT_FORKNUM); + _hash_init(index, 0, INIT_FORKNUM); PG_RETURN_VOID(); } @@ -126,21 +143,24 @@ static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, con void *state) { HashBuildState *buildstate = (HashBuildState *)state; + Datum index_values[1]; + bool index_isnull[1]; IndexTuple itup; - /* Hash indexes don't index nulls, see notes in hashinsert */ - if (isnull[0]) { + /* convert data to a hash key; on failure, do not insert anything */ + if (!_hash_convert_tuple(index, + values, isnull, + index_values, index_isnull)) return; - } /* Either spool the tuple for sorting, or just put it into the index */ if (buildstate->spool != NULL) { - _h_spool(buildstate->spool, &htup->t_self, values, isnull); + _h_spool(buildstate->spool, &htup->t_self, index_values, index_isnull); } else { /* form an index tuple and point it at the heap tuple */ - itup = _hash_form_tuple(index, values, isnull); + itup = index_form_tuple(RelationGetDescr(index), index_values, index_isnull); itup->t_tid = htup->t_self; - _hash_doinsert(index, itup); + _hash_doinsert(index, itup, buildstate->heapRel); pfree(itup); } @@ -159,30 +179,22 @@ Datum hashinsert(PG_FUNCTION_ARGS) Datum *values = (Datum *)PG_GETARG_POINTER(1); bool *isnull = (bool *)PG_GETARG_POINTER(2); ItemPointer ht_ctid = (ItemPointer)PG_GETARG_POINTER(3); - -#ifdef NOT_USED Relation heapRel = (Relation)PG_GETARG_POINTER(4); - IndexUniqueCheck checkUnique = (IndexUniqueCheck)PG_GETARG_INT32(5); -#endif + Datum index_values[1]; + bool index_isnull[1]; IndexTuple itup; - /* - * If the single index key is null, we don't insert it into the index. - * Hash tables support scans on '='. Relational algebra says that A = B - * returns null if either A or B is null. This means that no - * qualification used in an index scan could ever return true on a null - * attribute. 
It also means that indices can't be used by ISNULL or - * NOTNULL scans, but that's an artifact of the strategy map architecture - * chosen in 1986, not of the way nulls are handled here. - */ - if (isnull[0]) - PG_RETURN_BOOL(false); + /* convert data to a hash key; on failure, do not insert anything */ + if (!_hash_convert_tuple(rel, + values, isnull, + index_values, index_isnull)) + return false; - /* generate an index tuple */ - itup = _hash_form_tuple(rel, values, isnull); + /* form an index tuple and point it at the heap tuple */ + itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull); itup->t_tid = *ht_ctid; - _hash_doinsert(rel, itup); + _hash_doinsert(rel, itup, heapRel); pfree(itup); @@ -212,7 +224,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS) * Reacquire the read lock here. */ if (BufferIsValid(so->hashso_curbuf)) - _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); /* * If we've already initialized this scan, we can just advance it in the @@ -224,16 +236,21 @@ Datum hashgettuple(PG_FUNCTION_ARGS) /* * An insertion into the current index page could have happened while * we didn't have read lock on it. Re-find our position by looking - * for the TID we previously returned. (Because we hold share lock on - * the bucket, no deletions or splits could have occurred; therefore - * we can expect that the TID still exists in the current index page, - * at an offset >= where we were.) + * for the TID we previously returned. (Because we hold a pin on the + * primary bucket page, no deletions or splits could have occurred; + * therefore we can expect that the TID still exists in the current + * index page, at an offset >= where we were.) */ OffsetNumber maxoffnum; buf = so->hashso_curbuf; Assert(BufferIsValid(buf)); page = BufferGetPage(buf); + + /* + * We don't need test for old snapshot here as the current buffer is + * pinned, so vacuum can't clean the page. + */ maxoffnum = PageGetMaxOffsetNumber(page); for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; @@ -253,14 +270,22 @@ Datum hashgettuple(PG_FUNCTION_ARGS) */ if (scan->kill_prior_tuple) { /* - * Yes, so mark it by setting the LP_DEAD state in the item flags. + * Yes, so remember it for later. (We'll deal with all such tuples + * at once right after leaving the index page or at end of scan.) + * In case if caller reverses the indexscan direction it is quite + * possible that the same item might get entered multiple times. + * But, we don't detect that; instead, we just forget any excess + * entries. */ - ItemIdMarkDead(PageGetItemId(page, offnum)); + if (so->killedItems == NULL) + so->killedItems = (HashScanPosItem *)palloc(MaxIndexTuplesPerPage * sizeof(HashScanPosItem)); - /* - * Since this can be redone later if needed, mark as a hint. 
- */ - MarkBufferDirtyHint(buf, true); + if (so->numKilled < MaxIndexTuplesPerPage) { + so->killedItems[so->numKilled].heapTid = so->hashso_heappos; + so->killedItems[so->numKilled].indexOffset = + ItemPointerGetOffsetNumber(&(so->hashso_curpos)); + so->numKilled++; + } } /* @@ -285,7 +310,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS) /* Release read lock on current buffer, but keep it pinned */ if (BufferIsValid(so->hashso_curbuf)) - _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); /* Return current heap TID on success */ scan->xs_ctup.t_self = so->hashso_heappos; @@ -353,17 +378,20 @@ Datum hashbeginscan(PG_FUNCTION_ARGS) scan = RelationGetIndexScan(rel, nkeys, norderbys); so = (HashScanOpaque)palloc(sizeof(HashScanOpaqueData)); - so->hashso_bucket_valid = false; - so->hashso_bucket_blkno = 0; so->hashso_curbuf = InvalidBuffer; + so->hashso_bucket_buf = InvalidBuffer; + so->hashso_split_bucket_buf = InvalidBuffer; /* set position invalid (this will cause _hash_first call) */ ItemPointerSetInvalid(&(so->hashso_curpos)); ItemPointerSetInvalid(&(so->hashso_heappos)); - scan->opaque = so; + so->hashso_buc_populated = false; + so->hashso_buc_split = false; - /* register scan in case we change pages it's using */ - _hash_regscan(scan); + so->killedItems = NULL; + so->numKilled = 0; + + scan->opaque = so; PG_RETURN_POINTER(scan); } @@ -381,14 +409,13 @@ Datum hashrescan(PG_FUNCTION_ARGS) Relation rel = scan->indexRelation; /* release any pin we still hold */ - if (BufferIsValid(so->hashso_curbuf)) - _hash_dropbuf(rel, so->hashso_curbuf); - so->hashso_curbuf = InvalidBuffer; + if (so->numKilled > 0) { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + _hash_kill_items(scan); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); + } - /* release lock on bucket, too */ - if (so->hashso_bucket_blkno) - _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE); - so->hashso_bucket_blkno = 0; + _hash_dropscanbuf(rel, so); /* set position invalid (this will cause _hash_first call) */ ItemPointerSetInvalid(&(so->hashso_curpos)); @@ -400,10 +427,11 @@ Datum hashrescan(PG_FUNCTION_ARGS) rc = memmove_s(scan->keyData, (unsigned)scan->numberOfKeys * sizeof(ScanKeyData), scankey, (unsigned)scan->numberOfKeys * sizeof(ScanKeyData)); securec_check(rc, "", ""); - - so->hashso_bucket_valid = false; } + so->hashso_buc_populated = false; + so->hashso_buc_split = false; + PG_RETURN_VOID(); } @@ -416,18 +444,20 @@ Datum hashendscan(PG_FUNCTION_ARGS) HashScanOpaque so = (HashScanOpaque)scan->opaque; Relation rel = scan->indexRelation; - /* don't need scan registered anymore */ - _hash_dropscan(scan); + /* + * Before leaving current page, deal with any killed items. Also, ensure + * that we acquire lock on current page before calling _hash_kill_items. 
+ */ + if (so->numKilled > 0) { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + _hash_kill_items(scan); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); + } - /* release any pin we still hold */ - if (BufferIsValid(so->hashso_curbuf)) - _hash_dropbuf(rel, so->hashso_curbuf); - so->hashso_curbuf = InvalidBuffer; + _hash_dropscanbuf(rel, so); - /* release lock on bucket, too */ - if (so->hashso_bucket_blkno) - _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE); - so->hashso_bucket_blkno = 0; + if (so->killedItems != NULL) + pfree(so->killedItems); pfree(so); scan->opaque = NULL; @@ -458,6 +488,9 @@ Datum hashrestrpos(PG_FUNCTION_ARGS) * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * + * This function also deletes the tuples that are moved by split to other + * bucket. + * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum hashbulkdelete(PG_FUNCTION_ARGS) @@ -473,29 +506,24 @@ Datum hashbulkdelete(PG_FUNCTION_ARGS) Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; - Buffer metabuf; + Buffer metabuf = InvalidBuffer; HashMetaPage metap; - HashMetaPageData local_metapage; - errno_t rc; + HashMetaPage cachedmetap; tuples_removed = 0; num_index_tuples = 0; /* - * Read the metapage to fetch original bucket and tuple counts. Also, we - * keep a copy of the last-seen metapage so that we can use its - * hashm_spares[] values to compute bucket page addresses. This is a bit - * hokey but perfectly safe, since the interesting entries in the spares - * array cannot change under us; and it beats rereading the metapage for - * each bucket. + * We need a copy of the metapage so that we can use its hashm_spares[] + * values to compute bucket page addresses, but a cached copy should be + * good enough. (If not, we'll detect that further down and refresh the + * cache as necessary.) 
*/ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - metap = HashPageGetMeta(BufferGetPage(metabuf)); - orig_maxbucket = metap->hashm_maxbucket; - orig_ntuples = metap->hashm_ntuples; - rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage)); - securec_check(rc, "", ""); - _hash_relbuf(rel, metabuf); + cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); + Assert(cachedmetap != NULL); + + orig_maxbucket = cachedmetap->hashm_maxbucket; + orig_ntuples = cachedmetap->hashm_ntuples; /* Scan the buckets that we know exist */ cur_bucket = 0; @@ -505,90 +533,85 @@ loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; - bool bucket_dirty = false; + Buffer bucket_buf; + Buffer buf; + HashPageOpaque bucket_opaque; + Page page; + bool split_cleanup = false; /* Get address of bucket's start page */ - bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket); + bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); - /* Exclusive-lock the bucket so we can shrink it */ - _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE); - - /* Shouldn't have any active scans locally, either */ - if (_hash_has_active_scan(rel, cur_bucket)) - ereport(ERROR, - (errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), (errmsg("hash index has active scan during VACUUM.")))); - - /* Scan each page in bucket */ blkno = bucket_blkno; - while (BlockNumberIsValid(blkno)) { - Buffer buf; - Page page; - HashPageOpaque opaque; - OffsetNumber offno; - OffsetNumber maxoffno; - OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable = 0; - vacuum_delay_point(); + /* + * We need to acquire a cleanup lock on the primary bucket page to out + * wait concurrent scans before deleting the dead tuples. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); + LockBufferForCleanup(buf); + _hash_checkpage(rel, buf, LH_BUCKET_PAGE); - buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy); - page = BufferGetPage(buf); - opaque = (HashPageOpaque)PageGetSpecialPointer(page); - Assert(opaque->hasho_bucket == cur_bucket); + page = BufferGetPage(buf); + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); - /* Scan each tuple in page */ - maxoffno = PageGetMaxOffsetNumber(page); - for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { - IndexTuple itup; - ItemPointer htup; - - itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offno)); - htup = &(itup->t_tid); - if (callback(htup, callback_state, InvalidOid)) { - /* mark the item for deletion */ - deletable[ndeletable++] = offno; - tuples_removed += 1; - } else - num_index_tuples += 1; - } + /* + * If the bucket contains tuples that are moved by split, then we need + * to delete such tuples. We can't delete such tuples if the split + * operation on bucket is not finished as those are needed by scans. + */ + if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) { + split_cleanup = true; /* - * Apply deletions and write page if needed, advance to next page. + * This bucket might have been split since we last held a lock on + * the metapage. If so, hashm_maxbucket, hashm_highmask and + * hashm_lowmask might be old enough to cause us to fail to remove + * tuples left behind by the most recent split. 
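The cached-metapage scheme used here boils down to: read the metapage once, keep a private snapshot of hashm_maxbucket and hashm_spares[], and force a re-read only when a staleness test (just below, the primary page's hasho_prevblkno exceeding the cached maxbucket) says the cache may predate a split. A generic sketch of that shape, with invented MetaSnapshot/MetaCache types standing in for the real structures behind _hash_getcachedmetap:

    #include <stdint.h>

    typedef struct {
        uint32_t maxbucket;
        uint32_t spares[32];              /* enough for this illustration */
    } MetaSnapshot;

    typedef struct {
        MetaSnapshot snap;
        int          valid;
    } MetaCache;

    /* Return the cached snapshot, re-reading only when empty or explicitly forced. */
    static const MetaSnapshot *get_cached_meta(MetaCache *cache, int force_refresh,
                                               void (*read_meta)(MetaSnapshot *out))
    {
        if (!cache->valid || force_refresh) {
            read_meta(&cache->snap);      /* stands in for re-reading the metapage */
            cache->valid = 1;
        }
        return &cache->snap;
    }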
To prevent that, + * now that the primary page of the target bucket has been locked + * (and thus can't be further split), check whether we need to + * update our cached metapage data. */ - blkno = opaque->hasho_nextblkno; - - if (ndeletable > 0) { - PageIndexMultiDelete(page, deletable, ndeletable); - _hash_wrtbuf(rel, buf); - bucket_dirty = true; - } else - _hash_relbuf(rel, buf); + Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber); + if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) { + cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); + Assert(cachedmetap != NULL); + } } - /* If we deleted anything, try to compact free space */ - if (bucket_dirty) - _hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy); + bucket_buf = buf; - /* Release bucket lock */ - _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE); + hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, + cachedmetap->hashm_maxbucket, + cachedmetap->hashm_highmask, + cachedmetap->hashm_lowmask, &tuples_removed, + &num_index_tuples, split_cleanup, + callback, callback_state); + + _hash_dropbuf(rel, bucket_buf); /* Advance to next bucket */ cur_bucket++; } + if (BufferIsInvalid(metabuf)) + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); + /* Write-lock metapage and check for split since we started */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); metap = HashPageGetMeta(BufferGetPage(metabuf)); + if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ - cur_maxbucket = metap->hashm_maxbucket; - rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage)); - securec_check(rc, "", ""); - _hash_relbuf(rel, metabuf); + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); + Assert(cachedmetap != NULL); + cur_maxbucket = cachedmetap->hashm_maxbucket; goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ + START_CRIT_SECTION(); if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so @@ -609,7 +632,27 @@ loop_top: num_index_tuples = metap->hashm_ntuples; } - _hash_wrtbuf(rel, metabuf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { + xl_hash_update_meta_page xlrec; + XLogRecPtr recptr; + + xlrec.ntuples = metap->hashm_ntuples; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage); + + XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + _hash_relbuf(rel, metabuf); /* return statistics */ if (stats == NULL) @@ -645,9 +688,244 @@ Datum hashvacuumcleanup(PG_FUNCTION_ARGS) PG_RETURN_POINTER(stats); } -void hash_redo(XLogReaderState *record) +/* + * Helper function to perform deletion of index entries from a bucket. + * + * This function expects that the caller has acquired a cleanup lock on the + * primary bucket page, and will return with a write lock again held on the + * primary bucket page. The lock won't necessarily be held continuously, + * though, because we'll release it when visiting overflow pages. 
+ * + * It would be very bad if this function cleaned a page while some other + * backend was in the midst of scanning it, because hashgettuple assumes + * that the next valid TID will be greater than or equal to the current + * valid TID. There can't be any concurrent scans in progress when we first + * enter this function because of the cleanup lock we hold on the primary + * bucket page, but as soon as we release that lock, there might be. We + * handle that by conspiring to prevent those scans from passing our cleanup + * scan. To do that, we lock the next page in the bucket chain before + * releasing the lock on the previous page. (This type of lock chaining is + * not ideal, so we might want to look for a better solution at some point.) + * + * We need to retain a pin on the primary bucket to ensure that no concurrent + * split can start. + */ +void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, + BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, + uint32 maxbucket, uint32 highmask, uint32 lowmask, + double *tuples_removed, double *num_index_tuples, + bool split_cleanup, + IndexBulkDeleteCallback callback, void *callback_state) { - ereport(PANIC, (errmsg("hash_redo: unimplemented"))); + BlockNumber blkno; + Buffer buf; + Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; + bool bucket_dirty = false; + + blkno = bucket_blkno; + buf = bucket_buf; + + if (split_cleanup) + new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, + lowmask, maxbucket); + + /* Scan each page in bucket */ + for (;;) { + HashPageOpaque opaque; + OffsetNumber offno; + OffsetNumber maxoffno; + Buffer next_buf; + Page page; + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + bool retain_pin = false; + bool clear_dead_marking = false; + + vacuum_delay_point(); + + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* Scan each tuple in page */ + maxoffno = PageGetMaxOffsetNumber(page); + for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { + ItemPointer htup; + IndexTuple itup; + Bucket bucket; + bool kill_tuple = false; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); + htup = &(itup->t_tid); + + /* + * To remove the dead tuples, we strictly want to rely on results + * of callback function. refer btvacuumpage for detailed reason. + */ + if (callback && callback(htup, callback_state, InvalidOid)) { + kill_tuple = true; + if (tuples_removed) + *tuples_removed += 1; + } else if (split_cleanup) { + /* delete the tuples that are moved by split. */ + bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), + maxbucket, highmask, lowmask); + /* mark the item for deletion */ + if (bucket != cur_bucket) { + /* + * We expect tuples to either belong to current bucket or + * new_bucket. This is ensured because we don't allow + * further splits from bucket that contains garbage. See + * comments in _hash_expandtable. + */ + Assert(bucket == new_bucket); + kill_tuple = true; + } + } + + if (kill_tuple) { + /* mark the item for deletion */ + deletable[ndeletable++] = offno; + } else { + /* we're keeping it, so count it */ + if (num_index_tuples) + *num_index_tuples += 1; + } + } + + /* retain the pin on primary bucket page till end of bucket scan */ + if (blkno == bucket_blkno) + retain_pin = true; + else + retain_pin = false; + + blkno = opaque->hasho_nextblkno; + + /* + * Apply deletions, advance to next page and write page if needed. 
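The split-cleanup branch above recomputes each tuple's bucket from its stored hash key to spot entries that now belong to the new bucket. The maxbucket/highmask/lowmask arithmetic behind _hash_hashkey2bucket is small enough to restate as a standalone demo (shown only to make the masking concrete; the sample mask values in main() are assumptions for the example):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t Bucket;

    /* Map a hash key to its bucket given the table's current masks. */
    static Bucket hashkey_to_bucket(uint32_t hashkey, uint32_t maxbucket,
                                    uint32_t highmask, uint32_t lowmask)
    {
        Bucket bucket = hashkey & highmask;

        if (bucket > maxbucket)
            bucket = bucket & lowmask;    /* target bucket not created yet: fold down */
        return bucket;
    }

    int main(void)
    {
        /* five buckets (0..4) exist: highmask = 7, lowmask = 3 */
        printf("%u\n", hashkey_to_bucket(0x2D, 4, 7, 3));   /* 45 & 7 = 5 > 4, 5 & 3 = 1 */
        return 0;
    }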
+ */ + if (ndeletable > 0) { + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + bucket_dirty = true; + + /* + * Let us mark the page as clean if vacuum removes the DEAD tuples + * from an index page. We do this by clearing + * LH_PAGE_HAS_DEAD_TUPLES flag. + */ + if (tuples_removed && *tuples_removed > 0 && H_HAS_DEAD_TUPLES(opaque)) { + opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + clear_dead_marking = true; + } + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { + xl_hash_delete xlrec; + XLogRecPtr recptr; + + xlrec.clear_dead_marking = clear_dead_marking; + xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashDelete); + + /* + * bucket buffer needs to be registered to ensure that we can + * acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_primary_bucket_page) { + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + } + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); + if (!xlrec.is_primary_bucket_page) { + PageSetLSN(BufferGetPage(bucket_buf), recptr); + } + PageSetLSN(BufferGetPage(buf), recptr); + } + + END_CRIT_SECTION(); + } + + /* bail out if there are no more pages to scan. */ + if (!BlockNumberIsValid(blkno)) + break; + + next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + + /* + * release the lock on previous page after acquiring the lock on next + * page + */ + if (retain_pin) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, buf); + + buf = next_buf; + } + + /* + * lock the bucket page to clear the garbage flag and squeeze the bucket. + * if the current buffer is same as bucket buffer, then we already have + * lock on bucket page. + */ + if (buf != bucket_buf) { + _hash_relbuf(rel, buf); + LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); + } + + /* + * Clear the garbage flag from bucket after deleting the tuples that are + * moved by split. We purposefully clear the flag before squeeze bucket, + * so that after restart, vacuum shouldn't again try to delete the moved + * by split tuples. + */ + if (split_cleanup) { + HashPageOpaque bucket_opaque; + Page page; + + page = BufferGetPage(bucket_buf); + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; + MarkBufferDirty(bucket_buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + } + + /* + * If we have deleted anything, try to compact free space. For squeezing + * the bucket, we must have a cleanup lock, else it can impact the + * ordering of tuples for a scan that has started before it. 
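The overflow-chain walk just above is plain lock coupling (the "lock chaining" the header comment mentions): take the next page's lock before giving up the previous one, so a concurrent scan can never overtake the cleanup. A self-contained illustration with pthread mutexes; the Node list is invented for the demo and merely stands in for the pages of a bucket chain:

    #include <pthread.h>
    #include <stddef.h>

    typedef struct Node {
        pthread_mutex_t lock;
        struct Node    *next;
        int             payload;
    } Node;

    /* Walk the chain under lock coupling, "cleaning" each node. */
    static void clean_chain(Node *head)
    {
        pthread_mutex_lock(&head->lock);
        for (Node *cur = head; cur != NULL; ) {
            Node *next;

            cur->payload = 0;                         /* stand-in for deleting tuples */

            next = cur->next;
            if (next != NULL)
                pthread_mutex_lock(&next->lock);      /* take the next lock first... */
            pthread_mutex_unlock(&cur->lock);         /* ...then release the previous one */
            cur = next;
        }
    }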
+ */ + if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) + _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy); + else + LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); } Datum hashmerge(PG_FUNCTION_ARGS) diff --git a/src/gausskernel/storage/access/hash/hash_xlog.cpp b/src/gausskernel/storage/access/hash/hash_xlog.cpp new file mode 100644 index 000000000..444729bf7 --- /dev/null +++ b/src/gausskernel/storage/access/hash/hash_xlog.cpp @@ -0,0 +1,861 @@ +/* ------------------------------------------------------------------------- + * + * hash_xlog.cpp + * WAL replay logic for hash index. + * + * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/gausskernel/storage/access/hash/hash_xlog.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "access/xlogproc.h" +#include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/xlogutils.h" +#include "access/xlog.h" +#include "access/transam.h" +#include "access/xlogproc.h" +#include "storage/procarray.h" +#include "miscadmin.h" + +/* + * replay a hash index meta page + */ +static void hash_xlog_init_meta_page(XLogReaderState *record) +{ + RedoBufferInfo metabuf; + ForkNumber forknum; + + /* create the index' metapage */ + XLogInitBufferForRedo(record, 0, &metabuf); + Assert(BufferIsValid(metabuf.buf)); + HashRedoInitMetaPageOperatorPage(&metabuf, XLogRecGetData(record)); + MarkBufferDirty(metabuf.buf); + + /* + * Force the on-disk state of init forks to always be in sync with the + * state in shared buffers. See XLogReadBufferForRedoExtended. We need + * special handling for init forks as create index operations don't log a + * full page image of the metapage. + */ + XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); + if (forknum == INIT_FORKNUM) + FlushOneBuffer(metabuf.buf); + + /* all done */ + UnlockReleaseBuffer(metabuf.buf); +} + +/* + * replay a hash index bitmap page + */ +static void hash_xlog_init_bitmap_page(XLogReaderState *record) +{ + RedoBufferInfo bitmapbuf; + RedoBufferInfo metabuf; + ForkNumber forknum; + + /* + * Initialize bitmap page + */ + XLogInitBufferForRedo(record, 0, &bitmapbuf); + HashRedoInitBitmapPageOperatorBitmapPage(&bitmapbuf, XLogRecGetData(record)); + MarkBufferDirty(bitmapbuf.buf); + + /* + * Force the on-disk state of init forks to always be in sync with the + * state in shared buffers. See XLogReadBufferForRedoExtended. We need + * special handling for init forks as create index operations don't log a + * full page image of the metapage. + */ + XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); + if (forknum == INIT_FORKNUM) + FlushOneBuffer(bitmapbuf.buf); + UnlockReleaseBuffer(bitmapbuf.buf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the bitmap page. But during replay it's not + * necessary to hold that lock, since nobody can see it yet; the + * creating transaction hasn't yet committed. 
+ */ + HashRedoInitBitmapPageOperatorMetaPage(&metabuf); + MarkBufferDirty(metabuf.buf); + + XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL); + if (forknum == INIT_FORKNUM) + FlushOneBuffer(metabuf.buf); + } + if (BufferIsValid(metabuf.buf)) + UnlockReleaseBuffer(metabuf.buf); +} + +/* + * replay a hash index insert without split + */ +static void hash_xlog_insert(XLogReaderState *record) +{ + RedoBufferInfo buffer; + RedoBufferInfo metabuf; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { + Size datalen; + char *datapos = XLogRecGetBlockData(record, 0, &datalen); + + HashRedoInsertOperatorPage(&buffer, XLogRecGetData(record), datapos, datalen); + MarkBufferDirty(buffer.buf); + } + if (BufferIsValid(buffer.buf)) + UnlockReleaseBuffer(buffer.buf); + + if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the page we inserted into. But during replay it's + * not necessary to hold that lock, since no other index updates can + * be happening concurrently. + */ + HashRedoInsertOperatorMetaPage(&metabuf); + MarkBufferDirty(metabuf.buf); + } + if (BufferIsValid(metabuf.buf)) + UnlockReleaseBuffer(metabuf.buf); +} + +/* + * replay addition of overflow page for hash index + */ +static void hash_xlog_add_ovfl_page(XLogReaderState* record) +{ + RedoBufferInfo leftbuf; + RedoBufferInfo ovflbuf; + RedoBufferInfo metabuf; + BlockNumber leftblk; + BlockNumber rightblk; + char *data = NULL; + Size datalen; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); + XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); + + XLogInitBufferForRedo(record, 0, &ovflbuf); + Assert(BufferIsValid(ovflbuf.buf)); + + data = XLogRecGetBlockData(record, 0, &datalen); + HashRedoAddOvflPageOperatorOvflPage(&ovflbuf, leftblk, data, datalen); + MarkBufferDirty(ovflbuf.buf); + + if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) { + HashRedoAddOvflPageOperatorLeftPage(&leftbuf, rightblk); + MarkBufferDirty(leftbuf.buf); + } + + if (BufferIsValid(leftbuf.buf)) + UnlockReleaseBuffer(leftbuf.buf); + UnlockReleaseBuffer(ovflbuf.buf); + + /* + * Note: in normal operation, we'd update the bitmap and meta page while + * still holding lock on the overflow pages. But during replay it's not + * necessary to hold those locks, since no other index updates can be + * happening concurrently. 
+ */ + if (XLogRecHasBlockRef(record, 2)) { + RedoBufferInfo mapbuffer; + + if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) { + data = XLogRecGetBlockData(record, 2, &datalen); + + HashRedoAddOvflPageOperatorMapPage(&mapbuffer, data); + MarkBufferDirty(mapbuffer.buf); + } + if (BufferIsValid(mapbuffer.buf)) + UnlockReleaseBuffer(mapbuffer.buf); + } + + if (XLogRecHasBlockRef(record, 3)) { + RedoBufferInfo newmapbuf; + + XLogInitBufferForRedo(record, 3, &newmapbuf); + + HashRedoAddOvflPageOperatorNewmapPage(&newmapbuf, XLogRecGetData(record)); + MarkBufferDirty(newmapbuf.buf); + + UnlockReleaseBuffer(newmapbuf.buf); + } + + if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) { + data = XLogRecGetBlockData(record, 4, &datalen); + + HashRedoAddOvflPageOperatorMetaPage(&metabuf, XLogRecGetData(record), data, datalen); + MarkBufferDirty(metabuf.buf); + } + if (BufferIsValid(metabuf.buf)) + UnlockReleaseBuffer(metabuf.buf); +} + +/* + * replay allocation of page for split operation + */ +static void hash_xlog_split_allocate_page(XLogReaderState *record) +{ + RedoBufferInfo oldbuf; + RedoBufferInfo newbuf; + RedoBufferInfo metabuf; + Size datalen PG_USED_FOR_ASSERTS_ONLY; + char *data = NULL; + XLogRedoAction action; + + /* + * To be consistent with normal operation, here we take cleanup locks on + * both the old and new buckets even though there can't be any concurrent + * inserts. + */ + + /* replay the record for old bucket */ + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the special space is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) { + HashRedoSplitAllocatePageOperatorObukPage(&oldbuf, XLogRecGetData(record)); + MarkBufferDirty(oldbuf.buf); + } + + /* replay the record for new bucket */ + XLogInitBufferForRedo(record, 1, &newbuf); + HashRedoSplitAllocatePageOperatorNbukPage(&newbuf, XLogRecGetData(record)); + if (!IsBufferCleanupOK(newbuf.buf)) + elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock"); + MarkBufferDirty(newbuf.buf); + + /* + * We can release the lock on old bucket early as well but doing here to + * consistent with normal operation. + */ + if (BufferIsValid(oldbuf.buf)) + UnlockReleaseBuffer(oldbuf.buf); + if (BufferIsValid(newbuf.buf)) + UnlockReleaseBuffer(newbuf.buf); + + /* + * Note: in normal operation, we'd update the meta page while still + * holding lock on the old and new bucket pages. But during replay it's + * not necessary to hold those locks, since no other bucket splits can be + * happening concurrently. 
+ */ + + /* replay the record for metapage changes */ + if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) { + data = XLogRecGetBlockData(record, 2, &datalen); + + HashRedoSplitAllocatePageOperatorMetaPage(&metabuf, XLogRecGetData(record), data); + MarkBufferDirty(metabuf.buf); + } + + if (BufferIsValid(metabuf.buf)) + UnlockReleaseBuffer(metabuf.buf); +} + +/* + * replay of split operation + */ +static void hash_xlog_split_page(XLogReaderState *record) +{ + RedoBufferInfo buf; + + if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED) + elog(ERROR, "Hash split record did not contain a full-page image"); + + if (BufferIsValid(buf.buf)) + UnlockReleaseBuffer(buf.buf); +} + +/* + * replay completion of split operation + */ +static void hash_xlog_split_complete(XLogReaderState *record) +{ + RedoBufferInfo oldbuf; + RedoBufferInfo newbuf; + XLogRedoAction action; + + /* replay the record for old bucket */ + action = XLogReadBufferForRedo(record, 0, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) { + HashRedoSplitCompleteOperatorObukPage(&oldbuf, XLogRecGetData(record)); + MarkBufferDirty(oldbuf.buf); + } + if (BufferIsValid(oldbuf.buf)) + UnlockReleaseBuffer(oldbuf.buf); + + /* replay the record for new bucket */ + action = XLogReadBufferForRedo(record, 1, &newbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) { + HashRedoSplitCompleteOperatorNbukPage(&newbuf, XLogRecGetData(record)); + MarkBufferDirty(newbuf.buf); + } + if (BufferIsValid(newbuf.buf)) + UnlockReleaseBuffer(newbuf.buf); +} + +/* + * replay move of page contents for squeeze operation of hash index + */ +static void hash_xlog_move_page_contents(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record); + RedoBufferInfo bucketbuf; + RedoBufferInfo writebuf; + RedoBufferInfo deletebuf; + XLogRedoAction action; + + bucketbuf.buf = InvalidBuffer; + writebuf.buf = InvalidBuffer; + deletebuf.buf = InvalidBuffer; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_prim_bucket_same_wrt) { + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + } else { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. 
+ */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + PageSetLSN(bucketbuf.pageinfo.page, lsn); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) { + char *data = NULL; + Size datalen; + + data = XLogRecGetBlockData(record, 1, &datalen); + + HashXlogMoveAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen); + + MarkBufferDirty(writebuf.buf); + } + + /* replay the record for deleting entries from overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) { + char *ptr = NULL; + Size len; + + ptr = XLogRecGetBlockData(record, 2, &len); + + HashXlogMoveDeleteOvflPageOperatorPage(&deletebuf, (void *)ptr, len); + + MarkBufferDirty(deletebuf.buf); + } + + /* + * Replay is complete, now we can release the buffers. We release locks at + * end of replay operation to ensure that we hold lock on primary bucket + * page till end of operation. We can optimize by releasing the lock on + * write buffer as soon as the operation for same is complete, if it is + * not same as primary bucket page, but that doesn't seem to be worth + * complicating the code. + */ + if (BufferIsValid(deletebuf.buf)) + UnlockReleaseBuffer(deletebuf.buf); + + if (BufferIsValid(writebuf.buf)) + UnlockReleaseBuffer(writebuf.buf); + + if (BufferIsValid(bucketbuf.buf)) + UnlockReleaseBuffer(bucketbuf.buf); +} + +/* + * replay squeeze page operation of hash index + */ +static void hash_xlog_squeeze_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); + RedoBufferInfo bucketbuf; + RedoBufferInfo writebuf; + RedoBufferInfo ovflbuf; + RedoBufferInfo prevbuf; + RedoBufferInfo mapbuf; + XLogRedoAction action; + + bucketbuf.buf = InvalidBuffer; + prevbuf.buf = InvalidBuffer; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_prim_bucket_same_wrt) { + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + } else { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. 
+ */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + PageSetLSN(bucketbuf.pageinfo.page, lsn); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) { + char *data = NULL; + Size datalen; + + data = XLogRecGetBlockData(record, 1, &datalen); + + HashXlogSqueezeAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen); + + MarkBufferDirty(writebuf.buf); + } + + /* replay the record for initializing overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) { + HashXlogSqueezeInitOvflbufOperatorPage(&ovflbuf, XLogRecGetData(record)); + + MarkBufferDirty(ovflbuf.buf); + } + if (BufferIsValid(ovflbuf.buf)) + UnlockReleaseBuffer(ovflbuf.buf); + + /* replay the record for page previous to the freed overflow page */ + if (!xldata->is_prev_bucket_same_wrt && + XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) { + HashXlogSqueezeUpdatePrevPageOperatorPage(&prevbuf, XLogRecGetData(record)); + + MarkBufferDirty(prevbuf.buf); + } + if (BufferIsValid(prevbuf.buf)) + UnlockReleaseBuffer(prevbuf.buf); + + /* replay the record for page next to the freed overflow page */ + if (XLogRecHasBlockRef(record, 4)) { + RedoBufferInfo nextbuf; + + if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) { + HashXlogSqueezeUpdateNextPageOperatorPage(&nextbuf, XLogRecGetData(record)); + + MarkBufferDirty(nextbuf.buf); + } + if (BufferIsValid(nextbuf.buf)) + UnlockReleaseBuffer(nextbuf.buf); + } + + if (BufferIsValid(writebuf.buf)) + UnlockReleaseBuffer(writebuf.buf); + + if (BufferIsValid(bucketbuf.buf)) + UnlockReleaseBuffer(bucketbuf.buf); + + /* + * Note: in normal operation, we'd update the bitmap and meta page while + * still holding lock on the primary bucket page and overflow pages. But + * during replay it's not necessary to hold those locks, since no other + * index updates can be happening concurrently. + */ + /* replay the record for bitmap page */ + if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) { + char *data = NULL; + Size datalen; + + data = XLogRecGetBlockData(record, 5, &datalen); + HashXlogSqueezeUpdateBitmapOperatorPage(&mapbuf, (void *)data); + + MarkBufferDirty(mapbuf.buf); + } + if (BufferIsValid(mapbuf.buf)) + UnlockReleaseBuffer(mapbuf.buf); + + /* replay the record for meta page */ + if (XLogRecHasBlockRef(record, 6)) { + RedoBufferInfo metabuf; + + if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) { + char *data = NULL; + Size datalen; + + data = XLogRecGetBlockData(record, 6, &datalen); + HashXlogSqueezeUpdateMateOperatorPage(&metabuf, (void *)data); + + MarkBufferDirty(metabuf.buf); + } + if (BufferIsValid(metabuf.buf)) + UnlockReleaseBuffer(metabuf.buf); + } +} + +/* + * replay delete operation of hash index + */ +static void hash_xlog_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record); + RedoBufferInfo bucketbuf; + RedoBufferInfo deletebuf; + XLogRedoAction action; + + bucketbuf.buf = InvalidBuffer; + + /* + * Ensure we have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. 
+ */ + if (xldata->is_primary_bucket_page) { + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf); + } else { + /* + * we don't care for return value as the purpose of reading bucketbuf + * is to ensure a cleanup lock on primary bucket page. + */ + (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); + + PageSetLSN(bucketbuf.pageinfo.page, lsn); + + action = XLogReadBufferForRedo(record, 1, &deletebuf); + } + + /* replay the record for deleting entries in bucket page */ + if (action == BLK_NEEDS_REDO) { + char *ptr = NULL; + Size len; + + ptr = XLogRecGetBlockData(record, 1, &len); + + HashXlogDeleteBlockOperatorPage(&deletebuf, XLogRecGetData(record), (void *)ptr, len); + + MarkBufferDirty(deletebuf.buf); + } + if (BufferIsValid(deletebuf.buf)) + UnlockReleaseBuffer(deletebuf.buf); + + if (BufferIsValid(bucketbuf.buf)) + UnlockReleaseBuffer(bucketbuf.buf); +} + +/* + * replay split cleanup flag operation for primary bucket page. + */ +static void hash_xlog_split_cleanup(XLogReaderState *record) +{ + RedoBufferInfo buffer; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { + HashXlogSplitCleanupOperatorPage(&buffer); + + MarkBufferDirty(buffer.buf); + } + if (BufferIsValid(buffer.buf)) + UnlockReleaseBuffer(buffer.buf); +} + +/* + * replay for update meta page + */ +static void hash_xlog_update_meta_page(XLogReaderState *record) +{ + RedoBufferInfo metabuf; + + if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) { + HashXlogUpdateMetaOperatorPage(&metabuf, XLogRecGetData(record)); + + MarkBufferDirty(metabuf.buf); + } + if (BufferIsValid(metabuf.buf)) + UnlockReleaseBuffer(metabuf.buf); +} + +/* + * Get the latestRemovedXid from the heap pages pointed at by the index + * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid, + * on which this function is based. + */ +static TransactionId hash_xlog_vacuum_get_latestRemovedXid(XLogReaderState *record) +{ + xl_hash_vacuum_one_page *xlrec; + OffsetNumber *unused = NULL; + Buffer ibuffer; + Buffer hbuffer; + Page ipage; + Page hpage; + RelFileNode rnode; + BlockNumber blkno; + ItemId iitemid; + ItemId hitemid; + IndexTuple itup; + BlockNumber hblkno; + OffsetNumber hoffnum; + TransactionId latestRemovedXid = InvalidTransactionId; + int i; + + xlrec = (xl_hash_vacuum_one_page *) XLogRecGetData(record); + + /* + * If there's nothing running on the standby we don't need to derive a + * full latestRemovedXid value, so use a fast path out of here. This + * returns InvalidTransactionId, and so will conflict with all HS + * transactions; but since we just worked out that that's zero people, + * it's OK. + * + * XXX There is a race condition here, which is that a new backend might + * start just after we look. If so, it cannot need to conflict, but this + * coding will result in throwing a conflict anyway. + */ + if (CountDBBackends(InvalidOid) == 0) + return latestRemovedXid; + + /* + * Check if WAL replay has reached a consistent database state. If not, we + * must PANIC. See the definition of + * btree_xlog_delete_get_latestRemovedXid for more details. + */ + if (!t_thrd.xlog_cxt.reachedConsistency) + elog(PANIC, "hash_xlog_vacuum_get_latestRemovedXid: cannot operate with inconsistent data"); + + /* + * Get index page. If the DB is consistent, this should not fail, nor + * should any of the heap page fetches below. If one does, we return + * InvalidTransactionId to cancel all HS transactions. 
That's probably + * overkill, but it's safe, and certainly better than panicking here. + */ + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL); + + if (!BufferIsValid(ibuffer)) + return InvalidTransactionId; + LockBuffer(ibuffer, HASH_READ); + ipage = (Page) BufferGetPage(ibuffer); + + /* + * Loop through the deleted index items to obtain the TransactionId from + * the heap items they point to. + */ + unused = (OffsetNumber *) ((char *) xlrec + SizeOfHashVacuumOnePage); + + for (i = 0; i < xlrec->ntuples; i++) { + /* + * Identify the index tuple about to be deleted. + */ + iitemid = PageGetItemId(ipage, unused[i]); + itup = (IndexTuple) PageGetItem(ipage, iitemid); + + /* + * Locate the heap page that the index tuple points at + */ + hblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); + hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL); + + if (!BufferIsValid(hbuffer)) { + UnlockReleaseBuffer(ibuffer); + return InvalidTransactionId; + } + LockBuffer(hbuffer, HASH_READ); + hpage = (Page) BufferGetPage(hbuffer); + + /* + * Look up the heap tuple header that the index tuple points at by + * using the heap node supplied with the xlrec. We can't use + * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer. + * Note that we are not looking at tuple data here, just headers. + */ + hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid)); + hitemid = PageGetItemId(hpage, hoffnum); + + /* + * Follow any redirections until we find something useful. + */ + while (ItemIdIsRedirected(hitemid)) { + hoffnum = ItemIdGetRedirect(hitemid); + hitemid = PageGetItemId(hpage, hoffnum); + CHECK_FOR_INTERRUPTS(); + } + + /* + * If the heap item has storage, then read the header and use that to + * set latestRemovedXid. + * + * Some LP_DEAD items may not be accessible, so we ignore them. + */ + if (ItemIdHasStorage(hitemid)) { + HeapTupleData tuple; + tuple.t_data = (HeapTupleHeader) PageGetItem(hpage, hitemid); + HeapTupleCopyBaseFromPage(&tuple, &hpage); + HeapTupleHeaderAdvanceLatestRemovedXid(&tuple, &latestRemovedXid); + } else if (ItemIdIsDead(hitemid)) { + /* + * Conjecture: if hitemid is dead then it had xids before the xids + * marked on LP_NORMAL items. So we just ignore this item and move + * onto the next, for the purposes of calculating + * latestRemovedxids. + */ + } else + Assert(!ItemIdIsUsed(hitemid)); + + UnlockReleaseBuffer(hbuffer); + } + + UnlockReleaseBuffer(ibuffer); + + /* + * If all heap tuples were LP_DEAD then we will be returning + * InvalidTransactionId here, which avoids conflicts. This matches + * existing logic which assumes that LP_DEAD tuples must already be older + * than the latestRemovedXid on the cleanup record that set them as + * LP_DEAD, hence must already have generated a conflict. + */ + return latestRemovedXid; +} + +/* + * replay delete operation in hash index to remove + * tuples marked as DEAD during index tuple insertion. + */ +static void hash_xlog_vacuum_one_page(XLogReaderState *record) +{ + RedoBufferInfo buffer; + RedoBufferInfo metabuf; + XLogRedoAction action; + + /* + * If we have any conflict processing to do, it must happen before we + * update the page. + * + * Hash index records that are marked as LP_DEAD and being removed during + * hash index tuple insertion can conflict with standby queries. You might + * think that vacuum records would conflict as well, but we've handled + * that already. 
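In hash_xlog_vacuum_get_latestRemovedXid above, HeapTupleHeaderAdvanceLatestRemovedXid keeps the newest xmax among the tuples being removed. "Newest" has to be judged with wraparound-aware (modulo 2^32) arithmetic rather than a plain comparison; a simplified standalone sketch of that test, ignoring the permanent/special xids the real code must treat separately:

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint32_t TransactionId;
    #define INVALID_XID 0                 /* stand-in for InvalidTransactionId */

    /* true if id1 is logically later than id2 under modulo-2^32 xid arithmetic */
    static bool xid_follows(TransactionId id1, TransactionId id2)
    {
        return (int32_t) (id1 - id2) > 0;
    }

    /* Track the newest removed xid seen so far. */
    static void advance_latest_removed(TransactionId *latest, TransactionId candidate)
    {
        if (*latest == INVALID_XID || xid_follows(candidate, *latest))
            *latest = candidate;
    }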
XLOG_HEAP2_CLEANUP_INFO records provide the highest xid + * cleaned by the vacuum of the heap and so we can resolve any conflicts + * just once when that arrives. After that we know that no conflicts + * exist from individual hash index vacuum records on that index. + */ + if (InHotStandby) { + TransactionId latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record); + RelFileNode rnode; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); + } + + action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer); + + if (action == BLK_NEEDS_REDO) { + Size len; + + len = XLogRecGetDataLen(record); + HashXlogVacuumOnePageOperatorPage(&buffer, XLogRecGetData(record), len); + + MarkBufferDirty(buffer.buf); + } + if (BufferIsValid(buffer.buf)) + UnlockReleaseBuffer(buffer.buf); + + if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) { + HashXlogVacuumMateOperatorPage(&metabuf, XLogRecGetData(record)); + MarkBufferDirty(metabuf.buf); + } + if (BufferIsValid(metabuf.buf)) + UnlockReleaseBuffer(metabuf.buf); +} + +void hash_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) { + case XLOG_HASH_INIT_META_PAGE: + hash_xlog_init_meta_page(record); + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + hash_xlog_init_bitmap_page(record); + break; + case XLOG_HASH_INSERT: + hash_xlog_insert(record); + break; + case XLOG_HASH_ADD_OVFL_PAGE: + hash_xlog_add_ovfl_page(record); + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + hash_xlog_split_allocate_page(record); + break; + case XLOG_HASH_SPLIT_PAGE: + hash_xlog_split_page(record); + break; + case XLOG_HASH_SPLIT_COMPLETE: + hash_xlog_split_complete(record); + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + hash_xlog_move_page_contents(record); + break; + case XLOG_HASH_SQUEEZE_PAGE: + hash_xlog_squeeze_page(record); + break; + case XLOG_HASH_DELETE: + hash_xlog_delete(record); + break; + case XLOG_HASH_SPLIT_CLEANUP: + hash_xlog_split_cleanup(record); + break; + case XLOG_HASH_UPDATE_META_PAGE: + hash_xlog_update_meta_page(record); + break; + case XLOG_HASH_VACUUM_ONE_PAGE: + hash_xlog_vacuum_one_page(record); + break; + default: + elog(PANIC, "hash_redo: unknown op code %u", info); + } +} + +bool IsHashVacuumPages(XLogReaderState *record) +{ + uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK)); + + if (XLogRecGetRmid(record) == RM_HASH_ID) { + if (info == XLOG_HASH_DELETE) { + return true; + } + } + + return false; +} diff --git a/src/gausskernel/storage/access/hash/hashinsert.cpp b/src/gausskernel/storage/access/hash/hashinsert.cpp index 60a87e67f..6c28075a2 100644 --- a/src/gausskernel/storage/access/hash/hashinsert.cpp +++ b/src/gausskernel/storage/access/hash/hashinsert.cpp @@ -3,8 +3,8 @@ * hashinsert.cpp * Item insertion in hash tables for Postgres. * - * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd. 
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -17,21 +17,30 @@ #include "knl/knl_variable.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/heapam.h" +#include "access/xloginsert.h" +#include "miscadmin.h" #include "utils/rel.h" #include "utils/rel_gs.h" +#include "storage/lock/lwlock.h" +#include "storage/buf/buf_internals.h" + +static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode); /* * _hash_doinsert() -- Handle insertion of a single index tuple. * - * This routine is called by the public interface routines, hashbuild - * and hashinsert. By here, itup is completely filled in. + * This routine is called by the public interface routines, hashbuild + * and hashinsert. By here, itup is completely filled in. */ -void _hash_doinsert(Relation rel, IndexTuple itup) +void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel) { Buffer buf; + Buffer bucket_buf; Buffer metabuf; HashMetaPage metap; - BlockNumber blkno; + HashMetaPage usedmetap = NULL; Page metapage; Page page; HashPageOpaque pageopaque; @@ -39,7 +48,7 @@ void _hash_doinsert(Relation rel, IndexTuple itup) bool do_expand = false; uint32 hashkey; Bucket bucket; - + OffsetNumber itup_off; /* * Get the hash key for the item (it's stored in the index tuple itself). */ @@ -49,16 +58,16 @@ void _hash_doinsert(Relation rel, IndexTuple itup) itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ - /* - * Acquire shared split lock so we can compute the target bucket safely - * (see README). - */ - _hash_getlock(rel, 0, HASH_SHARE); - /* Read the metapage */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); +restart_insert: + + /* + * Read the metapage. We don't lock it yet; HashMaxItemSize() will + * examine pd_pagesize_version, but that can't change so we can examine it + * without a lock. + */ + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); metapage = BufferGetPage(metabuf); - metap = HashPageGetMeta(metapage); /* * Check whether the item can fit on a hash page at all. (Eventually, we @@ -73,87 +82,154 @@ void _hash_doinsert(Relation rel, IndexTuple itup) (unsigned long)HashMaxItemSize(metapage)), errhint("Values larger than a buffer page cannot be indexed."))); - /* - * Compute the target bucket number, and convert to block number. - */ - bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); + /* Lock the primary bucket page for the target bucket. */ + buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE, &usedmetap); + Assert(usedmetap != NULL); - blkno = BUCKET_TO_BLKNO(metap, bucket); + /* remember the primary bucket buffer to release the pin on it at end. */ + bucket_buf = buf; - /* release lock on metapage, but keep pin since we'll need it again */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); - - /* - * Acquire share lock on target bucket; then we can release split lock. 
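Note on the size check above: _hash_doinsert rounds the tuple size up with MAXALIGN before any free-space tests so that the test matches what PageAddItem will actually consume. The rounding is just "round up to the platform's maximum alignment"; a standalone restatement, assuming an 8-byte MAXIMUM_ALIGNOF for the example:

    #include <stddef.h>
    #include <stdio.h>

    #define MAX_ALIGN 8   /* assumption: typical MAXIMUM_ALIGNOF; the real value is platform-defined */

    static size_t max_align(size_t len)
    {
        return (len + (MAX_ALIGN - 1)) & ~((size_t) (MAX_ALIGN - 1));
    }

    int main(void)
    {
        printf("%zu %zu %zu\n", max_align(20), max_align(24), max_align(25));  /* 24 24 32 */
        return 0;
    }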
- */ - _hash_getlock(rel, blkno, HASH_SHARE); - - _hash_droplock(rel, 0, HASH_SHARE); - - /* Fetch the primary bucket page for the bucket */ - buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE); page = BufferGetPage(buf); - pageopaque = (HashPageOpaque)PageGetSpecialPointer(page); - Assert(pageopaque->hasho_bucket == bucket); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket = pageopaque->hasho_bucket; + + /* + * If this bucket is in the process of being split, try to finish the + * split before inserting, because that might create room for the + * insertion to proceed without allocating an additional overflow page. + * It's only interesting to finish the split if we're trying to insert + * into the bucket from which we're removing tuples (the "old" bucket), + * not if we're trying to insert into the bucket into which tuples are + * being moved (the "new" bucket). + */ + if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf)) { + /* release the lock on bucket buffer, before completing the split. */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + _hash_finish_split(rel, metabuf, buf, bucket, + usedmetap->hashm_maxbucket, + usedmetap->hashm_highmask, + usedmetap->hashm_lowmask); + + /* release the pin on old and meta buffer. retry for insert. */ + _hash_dropbuf(rel, buf); + _hash_dropbuf(rel, metabuf); + goto restart_insert; + } /* Do the insertion */ while (PageGetFreeSpace(page) < itemsz) { + BlockNumber nextblkno; + + /* + * Check if current page has any DEAD tuples. If yes, delete these + * tuples and see if we can get a space for the new item to be + * inserted before moving to the next page in the bucket chain. + */ + if (H_HAS_DEAD_TUPLES(pageopaque)) { + if (IsBufferCleanupOK(buf)) { + _hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node); + + if (PageGetFreeSpace(page) >= itemsz) + break; /* OK, now we have enough space */ + } + } + /* * no space on this page; check for an overflow page */ - BlockNumber nextblkno = pageopaque->hasho_nextblkno; + nextblkno = pageopaque->hasho_nextblkno; if (BlockNumberIsValid(nextblkno)) { /* * ovfl page exists; go get it. if it doesn't have room, we'll - * find out next pass through the loop test above. + * find out next pass through the loop test above. we always + * release both the lock and pin if this is an overflow page, but + * only the lock if this is the primary bucket page, since the pin + * on the primary bucket must be retained throughout the scan. */ - _hash_relbuf(rel, buf); + if (buf != bucket_buf) + _hash_relbuf(rel, buf); + else + LockBuffer(buf, BUFFER_LOCK_UNLOCK); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); page = BufferGetPage(buf); } else { /* * we're at the end of the bucket chain and we haven't found a * page with enough room. allocate a new overflow page. - * - * release our write lock without modifying buffer */ - _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); + + /* release our write lock without modifying buffer */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* chain to a new overflow page */ - buf = _hash_addovflpage(rel, metabuf, buf); + buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? 
true : false); page = BufferGetPage(buf); /* should fit now, given test above */ Assert(PageGetFreeSpace(page) >= itemsz); } - pageopaque = (HashPageOpaque)PageGetSpecialPointer(page); - Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE); Assert(pageopaque->hasho_bucket == bucket); } - /* found page with enough space, so add the item here */ - (void)_hash_pgaddtup(rel, buf, itemsz, itup); - - /* write and release the modified page */ - _hash_wrtbuf(rel, buf); - - /* We can drop the bucket lock now */ - _hash_droplock(rel, blkno, HASH_SHARE); - /* * Write-lock the metapage so we can increment the tuple count. After * incrementing it, check to see if it's time for a split. */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + /* Do the update. No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* found page with enough space, so add the item here */ + itup_off = _hash_pgaddtup(rel, buf, itemsz, itup); + MarkBufferDirty(buf); + + /* metapage operations */ + metap = HashPageGetMeta(metapage); metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ do_expand = metap->hashm_ntuples > (double)metap->hashm_ffactor * (metap->hashm_maxbucket + 1); - /* Write out the metapage and drop lock, but keep pin */ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { + xl_hash_insert xlrec; + XLogRecPtr recptr; + + xlrec.offnum = itup_off; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInsert); + + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT); + + PageSetLSN(BufferGetPage(buf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* drop lock on metapage, but keep pin */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* + * Release the modified page and ensure to release the pin on primary + * page. + */ + _hash_relbuf(rel, buf); + if (buf != bucket_buf) + _hash_dropbuf(rel, bucket_buf); /* Attempt to split if a split is needed */ if (do_expand) @@ -192,3 +268,130 @@ OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple return itup_off; } + +/* + * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the index. + * + * This routine has same requirements for locking and tuple ordering as + * _hash_pgaddtup(). + * + * Returns the offset number array at which the tuples were inserted. 
+ */ +void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, OffsetNumber *itup_offsets, uint16 nitups) +{ + OffsetNumber itup_off; + Page page; + uint32 hashkey; + int i; + + _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + page = BufferGetPage(buf); + + for (i = 0; i < nitups; i++) { + Size itemsize; + + itemsize = IndexTupleDSize(*itups[i]); + itemsize = MAXALIGN(itemsize); + + /* Find where to insert the tuple (preserving page's hashkey ordering) */ + hashkey = _hash_get_indextuple_hashkey(itups[i]); + itup_off = _hash_binsearch(page, hashkey); + + itup_offsets[i] = itup_off; + + if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); + } +} + +/* + * _hash_vacuum_one_page - vacuum just one index page. + * + * Try to remove LP_DEAD items from the given page. We must acquire cleanup + * lock on the page being modified before calling this function. + */ + +static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode) +{ + OffsetNumber deletable[MaxOffsetNumber]; + int ndeletable = 0; + OffsetNumber offnum; + OffsetNumber maxoff; + Page page = BufferGetPage(buf); + HashPageOpaque pageopaque; + HashMetaPage metap; + + /* Scan each tuple in page to see if it is marked as LP_DEAD */ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsDead(itemId)) + deletable[ndeletable++] = offnum; + } + + if (ndeletable > 0) { + /* + * Write-lock the meta page so that we can decrement tuple count. + */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + PageIndexMultiDelete(page, deletable, ndeletable); + + /* + * Mark the page as not containing any LP_DEAD items. This is not + * certainly true (there might be some that have recently been marked, + * but weren't included in our target-item list), but it will almost + * always be true and it doesn't seem worth an additional page scan to + * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint + * anyway. + */ + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + + metap = HashPageGetMeta(BufferGetPage(metabuf)); + metap->hashm_ntuples -= ndeletable; + + MarkBufferDirty(buf); + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { + xl_hash_vacuum_one_page xlrec; + XLogRecPtr recptr; + + xlrec.hnode = hnode; + xlrec.ntuples = ndeletable; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage); + + /* + * We need the target-offsets array whether or not we store the + * whole buffer, to allow us to find the latestRemovedXid on a + * standby server. + */ + XLogRegisterData((char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE); + + PageSetLSN(BufferGetPage(buf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* + * Releasing write lock on meta page as we have updated the tuple + * count. 
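_hash_pgaddmultitup above leans on _hash_binsearch to find the slot that keeps each page ordered by hash key. At its core that is a bounded binary search for the first position whose key is not less than the new key; a generic standalone version over a plain array (array indices stand in for page offsets, and the real routine works on line pointers rather than a bare key array):

    #include <stddef.h>
    #include <stdint.h>

    /* Return the first index whose key is >= 'key', i.e. where 'key' should be inserted. */
    static size_t lower_bound(const uint32_t *keys, size_t nkeys, uint32_t key)
    {
        size_t lo = 0;
        size_t hi = nkeys;

        while (lo < hi) {
            size_t mid = lo + (hi - lo) / 2;

            if (keys[mid] < key)
                lo = mid + 1;
            else
                hi = mid;
        }
        return lo;                        /* == nkeys when key is larger than all entries */
    }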
+ */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + } +} diff --git a/src/gausskernel/storage/access/hash/hashovfl.cpp b/src/gausskernel/storage/access/hash/hashovfl.cpp index d1c3368ea..eee71ea2d 100644 --- a/src/gausskernel/storage/access/hash/hashovfl.cpp +++ b/src/gausskernel/storage/access/hash/hashovfl.cpp @@ -3,8 +3,8 @@ * hashovfl.cpp * Overflow page management code for the Postgres hash access method * - * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -20,10 +20,12 @@ #include "knl/knl_variable.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" #include "utils/rel.h" #include "utils/rel_gs.h" -static Buffer _hash_getovflpage(Relation rel, Buffer metabuf); static uint32 _hash_firstfreebit(uint32 map); /* @@ -46,13 +48,13 @@ static BlockNumber bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum) * Convert to absolute page number by adding the number of bucket pages * that exist before this split point. */ - return (BlockNumber)(((uint32)1 << i) + ovflbitnum); + return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum); } /* * Convert overflow page block number to bit number for free-page bitmap. */ -static uint32 blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) +uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) { uint32 splitnum = metap->hashm_ovflpoint; uint32 i; @@ -60,54 +62,84 @@ static uint32 blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) /* Determine the split number containing this page */ for (i = 1; i <= splitnum; i++) { - if (ovflblkno <= (BlockNumber)((uint32)1 << i)) + if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i)) break; /* oops */ - bitnum = ovflblkno - ((uint32)1 << i); - if (bitnum <= metap->hashm_spares[i]) - return bitnum - 1; /* -1 to convert 1-based to 0-based */ + bitnum = ovflblkno - _hash_get_totalbuckets(i); + + /* + * bitnum has to be greater than number of overflow page added in + * previous split point. The overflow page at this splitnum (i) if any + * should start from (_hash_get_totalbuckets(i) + + * metap->hashm_spares[i - 1] + 1). + */ + if (bitnum > metap->hashm_spares[i - 1] && bitnum <= metap->hashm_spares[i]) + return bitnum - 1; /* -1 to convert 1-based to 0-based */ } - ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid overflow block number %u", ovflblkno))); + + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid overflow block number %u", ovflblkno))); return 0; /* keep compiler quiet */ } /* * _hash_addovflpage * - * Add an overflow page to the bucket whose last page is pointed to by 'buf'. + * Add an overflow page to the bucket whose last page is pointed to by 'buf'. * - * On entry, the caller must hold a pin but no lock on 'buf'. The pin is - * dropped before exiting (we assume the caller is not interested in 'buf' - * anymore). The returned overflow page will be pinned and write-locked; - * it is guaranteed to be empty. + * On entry, the caller must hold a pin but no lock on 'buf'. The pin is + * dropped before exiting (we assume the caller is not interested in 'buf' + * anymore) if not asked to retain. The pin will be retained only for the + * primary bucket. 
The returned overflow page will be pinned and + * write-locked; it is guaranteed to be empty. * - * The caller must hold a pin, but no lock, on the metapage buffer. - * That buffer is returned in the same state. - * - * The caller must hold at least share lock on the bucket, to ensure that - * no one else tries to compact the bucket meanwhile. This guarantees that - * 'buf' won't stop being part of the bucket while it's unlocked. + * The caller must hold a pin, but no lock, on the metapage buffer. + * That buffer is returned in the same state. * * NB: since this could be executed concurrently by multiple processes, * one should not assume that the returned overflow page will be the * immediate successor of the originally passed 'buf'. Additional overflow * pages might have been added to the bucket chain in between. */ -Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) +Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) { Buffer ovflbuf; Page page; Page ovflpage; HashPageOpaque pageopaque; HashPageOpaque ovflopaque; - - /* allocate and lock an empty overflow page */ - ovflbuf = _hash_getovflpage(rel, metabuf); + HashMetaPage metap; + Buffer mapbuf = InvalidBuffer; + Buffer newmapbuf = InvalidBuffer; + BlockNumber blkno; + BlockNumber newmap_blkno = InvalidBlockNumber; + uint32 orig_firstfree; + uint32 splitnum; + uint32 *freep = NULL; + uint32 max_ovflpg; + uint32 bit; + uint32 bitmap_page_bit; + uint32 first_page; + uint32 last_bit; + uint32 last_page; + uint32 i; + uint32 j; + bool page_found = false; /* - * Write-lock the tail page. It is okay to hold two buffer locks here - * since there cannot be anyone else contending for access to ovflbuf. + * Write-lock the tail page. Here, we need to maintain locking order such + * that, first acquire the lock on tail page of bucket, then on meta page + * to find and lock the bitmap page and if it is found, then lock on meta + * page is released, then finally acquire the lock on new overflow buffer. + * We need this locking order to avoid deadlock with backends that are + * doing inserts. + * + * Note: We could have avoided locking many buffers here if we made two + * WAL records for acquiring an overflow page (one to allocate an overflow + * page and another to add it to overflow bucket chain). However, doing + * so can leak an overflow page, if the system crashes after allocation. + * Needless to say, it is better to have a single record from a + * performance point of view as well. */ - _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* probably redundant... 
*/ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); @@ -124,55 +156,21 @@ Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) break; /* we assume we do not need to write the unmodified page */ - _hash_relbuf(rel, buf); + if (retain_pin) { + /* pin will be retained only for the primary bucket page */ + Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } else { + _hash_relbuf(rel, buf); + } + + retain_pin = false; buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); } - /* now that we have correct backlink, initialize new overflow page */ - ovflpage = BufferGetPage(ovflbuf); - ovflopaque = (HashPageOpaque)PageGetSpecialPointer(ovflpage); - ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); - ovflopaque->hasho_nextblkno = InvalidBlockNumber; - ovflopaque->hasho_bucket = pageopaque->hasho_bucket; - ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; - ovflopaque->hasho_page_id = HASHO_PAGE_ID; - - MarkBufferDirty(ovflbuf); - - /* logically chain overflow page to previous page */ - pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); - _hash_wrtbuf(rel, buf); - - return ovflbuf; -} - -/* - * Find an available overflow page and return it. The returned buffer - * is pinned and write-locked, and has had _hash_pageinit() applied, - * but it is caller's responsibility to fill the special space. - * - * The caller must hold a pin, but no lock, on the metapage buffer. - * That buffer is left in the same state at exit. - */ -static Buffer _hash_getovflpage(Relation rel, Buffer metabuf) -{ - HashMetaPage metap; - Buffer mapbuf = 0; - Buffer newbuf; - BlockNumber blkno; - uint32 orig_firstfree; - uint32 splitnum; - uint32 *freep = NULL; - uint32 max_ovflpg; - uint32 bit; - uint32 first_page; - uint32 last_bit; - uint32 last_page; - uint32 i, j; - /* Get exclusive lock on the meta page */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); @@ -209,25 +207,44 @@ static Buffer _hash_getovflpage(Relation rel, Buffer metabuf) last_inpage = BMPGSZ_BIT(metap) - 1; /* Release exclusive lock on metapage while reading bitmap page */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) { - if (freep[j] != ALL_SET) + if (freep[j] != ALL_SET) { + page_found = true; + + /* Reacquire exclusive lock on the meta page */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* convert bit to bit number within page */ + bit += _hash_firstfreebit(freep[j]); + bitmap_page_bit = bit; + + /* convert bit to absolute bit number */ + bit += (i << BMPG_SHIFT(metap)); + /* Calculate address of the recycled overflow page */ + blkno = bitno_to_blkno(metap, bit); + + /* Fetch and init the recycled page */ + ovflbuf = _hash_getinitbuf(rel, blkno); + goto found; + } } /* No free space here, try to advance to next map page */ _hash_relbuf(rel, mapbuf); + mapbuf = InvalidBuffer; i++; j = 0; /* scan from start of next map page */ bit = 0; /* Reacquire exclusive lock on the meta page */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); } /* @@ -244,8 +261,15 @@ static Buffer _hash_getovflpage(Relation rel, 
Buffer metabuf) * convenient to pre-mark them as "in use" too. */ bit = metap->hashm_spares[splitnum]; - _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM); - metap->hashm_spares[splitnum]++; + + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); + + newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM); } else { /* * Nothing to do here; since the page will be past the last used page, @@ -254,7 +278,8 @@ static Buffer _hash_getovflpage(Relation rel, Buffer metabuf) } /* Calculate address of the new overflow page */ - bit = metap->hashm_spares[splitnum]; + bit = BufferIsValid(newmapbuf) ? + metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum]; blkno = bitno_to_blkno(metap, bit); /* @@ -262,60 +287,140 @@ static Buffer _hash_getovflpage(Relation rel, Buffer metabuf) * relation length stays in sync with ours. XXX It's annoying to do this * with metapage write lock held; would be better to use a lock that * doesn't block incoming searches. + * + * It is okay to hold two buffer locks here (one on tail page of bucket + * and other on new overflow page) since there cannot be anyone else + * contending for access to ovflbuf. */ - newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); - - metap->hashm_spares[splitnum]++; - - /* - * Adjust hashm_firstfree to avoid redundant searches. But don't risk - * changing it if someone moved it while we were searching bitmap pages. - */ - if (metap->hashm_firstfree == orig_firstfree) - metap->hashm_firstfree = bit + 1; - - /* Write updated metapage and release lock, but not pin */ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); - - return newbuf; + ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); found: - /* convert bit to bit number within page */ - bit += _hash_firstfreebit(freep[j]); - - /* mark page "in use" in the bitmap */ - SETBIT(freep, bit); - _hash_wrtbuf(rel, mapbuf); - - /* Reacquire exclusive lock on the meta page */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); - - /* convert bit to absolute bit number */ - bit += (i << BMPG_SHIFT(metap)); - - /* Calculate address of the recycled overflow page */ - blkno = bitno_to_blkno(metap, bit); /* - * Adjust hashm_firstfree to avoid redundant searches. But don't risk + * Do the update. No ereport(ERROR) until changes are logged. We want to + * log the changes for bitmap page and overflow page together to avoid + * loss of pages in case the new page is added. + */ + START_CRIT_SECTION(); + + if (page_found) { + Assert(BufferIsValid(mapbuf)); + + /* mark page "in use" in the bitmap */ + SETBIT(freep, bitmap_page_bit); + MarkBufferDirty(mapbuf); + } else { + /* update the count to indicate new overflow page is added */ + metap->hashm_spares[splitnum]++; + + if (BufferIsValid(newmapbuf)) { + _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false); + MarkBufferDirty(newmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + newmap_blkno = BufferGetBlockNumber(newmapbuf); + metap->hashm_mapp[metap->hashm_nmaps] = newmap_blkno; + metap->hashm_nmaps++; + metap->hashm_spares[splitnum]++; + MarkBufferDirty(metabuf); + } + + /* + * for new overflow page, we don't need to explicitly set the bit in + * bitmap page, as by default that will be set to "in use". 
+ */ + } + + /* + * Adjust hashm_firstfree to avoid redundant searches. But don't risk * changing it if someone moved it while we were searching bitmap pages. */ if (metap->hashm_firstfree == orig_firstfree) { metap->hashm_firstfree = bit + 1; - - /* Write updated metapage and release lock, but not pin */ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); - } else { - /* We didn't change the metapage, so no need to write */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + MarkBufferDirty(metabuf); } - /* Fetch, init, and return the recycled page */ - return _hash_getinitbuf(rel, blkno); + /* initialize new overflow page */ + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = pageopaque->hasho_bucket; + ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(ovflbuf); + + /* logically chain overflow page to previous page */ + pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { + XLogRecPtr recptr; + xl_hash_add_ovfl_page xlrec; + + xlrec.bmpage_found = page_found; + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage); + + XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT); + XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket)); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + + if (BufferIsValid(mapbuf)) { + XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32)); + } + + if (BufferIsValid(newmapbuf)) + XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT); + + XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32)); + if (BufferIsValid(newmapbuf)) + XLogRegisterBufData(4, (char *) &newmap_blkno, sizeof(BlockNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE); + + PageSetLSN(BufferGetPage(ovflbuf), recptr); + PageSetLSN(BufferGetPage(buf), recptr); + + if (BufferIsValid(mapbuf)) + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (BufferIsValid(newmapbuf)) + PageSetLSN(BufferGetPage(newmapbuf), recptr); + + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + if (retain_pin) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, buf); + + if (BufferIsValid(mapbuf)) + _hash_relbuf(rel, mapbuf); + + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + if (BufferIsValid(newmapbuf)) + _hash_relbuf(rel, newmapbuf); + + return ovflbuf; } /* - * Return the number of the first bit that is not set in the word 'map'. + * _hash_firstfreebit() + * + * Return the number of the first bit that is not set in the word 'map'. */ static uint32 _hash_firstfreebit(uint32 map) { @@ -334,20 +439,31 @@ static uint32 _hash_firstfreebit(uint32 map) } /* - * Remove this overflow page from its bucket's chain, and mark the page as - * free. On entry, ovflbuf is write-locked; it is released before exiting. + * _hash_freeovflpage() * - * Since this function is invoked in VACUUM, we provide an access strategy - * parameter that controls fetches of the bucket pages. + * Remove this overflow page from its bucket's chain, and mark the page as + * free. On entry, ovflbuf is write-locked; it is released before exiting. 
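/*
 * Sketch of the per-word search done once the bitmap scan above finds a word
 * with a zero bit (freep[j] != ALL_SET): _hash_firstfreebit returns the index
 * of the lowest clear bit, which the caller then converts to a page-relative
 * and an absolute overflow bit number.  BITS_PER_WORD stands in for the
 * BITS_PER_MAP constant used by the real code.
 */
#include <stdint.h>
#include <assert.h>

#define BITS_PER_WORD 32u

static uint32_t
first_free_bit(uint32_t map)
{
    uint32_t mask = 1;

    for (uint32_t i = 0; i < BITS_PER_WORD; i++) {
        if ((map & mask) == 0)
            return i;               /* bit i is clear: that overflow page is free */
        mask <<= 1;
    }
    assert(!"no free bit in word"); /* caller already checked map != ALL_SET */
    return 0;
}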
* - * Returns the block number of the page that followed the given page - * in the bucket, or InvalidBlockNumber if no following page. + * Add the tuples (itups) to wbuf in this function. We could do that in the + * caller as well, but the advantage of doing it here is we can easily write + * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and + * removal of overflow page has to done as an atomic operation, otherwise + * during replay on standby users might find duplicate records. * - * NB: caller must not hold lock on metapage, nor on either page that's - * adjacent in the bucket chain. The caller had better hold exclusive lock - * on the bucket, too. + * Since this function is invoked in VACUUM, we provide an access strategy + * parameter that controls fetches of the bucket pages. + * + * Returns the block number of the page that followed the given page + * in the bucket, or InvalidBlockNumber if no following page. + * + * NB: caller must not hold lock on metapage, nor on page, that's next to + * ovflbuf in the bucket chain. We don't acquire the lock on page that's + * prior to ovflbuf in chain if it is same as wbuf because the caller already + * has a lock on same. */ -BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy) +BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, + Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, + Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy) { HashMetaPage metap; Buffer metabuf; @@ -356,13 +472,18 @@ BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrateg BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; + BlockNumber writeblkno; HashPageOpaque ovflopaque; Page ovflpage; Page mappage; uint32 *freep = NULL; uint32 ovflbitno; - int32 bitmappage, bitmapbit; + int32 bitmappage; + int32 bitmapbit; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; + Buffer prevbuf = InvalidBuffer; + Buffer nextbuf = InvalidBuffer; + bool update_metap = false; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); @@ -371,51 +492,40 @@ BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrateg ovflopaque = (HashPageOpaque)PageGetSpecialPointer(ovflpage); nextblkno = ovflopaque->hasho_nextblkno; prevblkno = ovflopaque->hasho_prevblkno; + writeblkno = BufferGetBlockNumber(wbuf); bucket = ovflopaque->hasho_bucket; - /* - * Zero the page for debugging's sake; then write and release it. (Note: - * if we failed to zero the page here, we'd have problems with the Assert - * in _hash_pageinit() when the page is reused.) - */ - MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf)); - _hash_wrtbuf(rel, ovflbuf); - /* * Fix up the bucket chain. this is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being - * deleted. No concurrency issues since we hold exclusive lock on the - * entire bucket. + * deleted. Concurrency issues are avoided by using lock chaining as + * described atop hashbucketcleanup. 
*/ if (BlockNumberIsValid(prevblkno)) { - Buffer prevbuf = _hash_getbuf_with_strategy(rel, prevblkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, - bstrategy); - Page prevpage = BufferGetPage(prevbuf); - HashPageOpaque prevopaque = (HashPageOpaque)PageGetSpecialPointer(prevpage); - - Assert(prevopaque->hasho_bucket == bucket); - prevopaque->hasho_nextblkno = nextblkno; - _hash_wrtbuf(rel, prevbuf); + if (prevblkno == writeblkno) + prevbuf = wbuf; + else + prevbuf = _hash_getbuf_with_strategy(rel, + prevblkno, + HASH_WRITE, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, + bstrategy); } - if (BlockNumberIsValid(nextblkno)) { - Buffer nextbuf = _hash_getbuf_with_strategy(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); - Page nextpage = BufferGetPage(nextbuf); - HashPageOpaque nextopaque = (HashPageOpaque)PageGetSpecialPointer(nextpage); + if (BlockNumberIsValid(nextblkno)) + nextbuf = _hash_getbuf_with_strategy(rel, + nextblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); - Assert(nextopaque->hasho_bucket == bucket); - nextopaque->hasho_prevblkno = prevblkno; - _hash_wrtbuf(rel, nextbuf); - } + /* Note: bstrategy is intentionally not used for metapage and bitmap */ - /* - * Note: bstrategy is intentionally not used for metapage and bitmap - * Read the metapage so we can determine which bitmap page to use - */ + /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* Identify which bit to set */ - ovflbitno = blkno_to_bitno(metap, ovflblkno); + ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno); bitmappage = ovflbitno >> BMPG_SHIFT(metap); bitmapbit = ovflbitno & BMPG_MASK(metap); @@ -425,109 +535,241 @@ BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrateg blkno = metap->hashm_mapp[bitmappage]; /* Release metapage lock while we access the bitmap page */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); - /* Clear the bitmap bit to indicate that this overflow page is free */ + /* read the bitmap page to clear the bitmap bit */ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); Assert(ISSET(freep, bitmapbit)); - CLRBIT(freep, bitmapbit); - _hash_wrtbuf(rel, mapbuf); /* Get write-lock on metapage to update firstfree */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* This operation needs to log multiple tuples, prepare WAL for that */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being careful to preserve + * hashkey ordering. (If we insert many tuples into the same "write" page + * it would be worth qsort'ing them). + */ + if (nitups > 0) { + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + } + + /* + * Reinitialize the freed overflow page. Just zeroing the page won't + * work, because WAL replay routines expect pages to be initialized. See + * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are + * careful to make the special space valid here so that tools like + * pageinspect won't get confused. 
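/*
 * Sketch of the bitmap addressing used just above: an overflow page's 0-based
 * bit number selects both the bitmap page (high bits) and the bit within that
 * page (low bits).  'bmshift' stands in for hashm_bmshift, the log2 of the
 * number of bits one bitmap page can hold; the word array mirrors what
 * HashPageGetBitmap returns.
 */
#include <stdint.h>

static void
locate_bitmap_bit(uint32_t ovflbitno, uint32_t bmshift,
                  uint32_t *bitmap_page, uint32_t *bit_in_page)
{
    *bitmap_page = ovflbitno >> bmshift;              /* index into hashm_mapp[] */
    *bit_in_page = ovflbitno & ((1u << bmshift) - 1); /* bit position on that page */
}

/* Clearing the bit, as CLRBIT() does on the bitmap page, marks the page free. */
static void
mark_overflow_free(uint32_t *freep, uint32_t bit_in_page)
{
    freep[bit_in_page / 32] &= ~(1u << (bit_in_page % 32));
}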
+ */ + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = -1; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(ovflbuf); + + if (BufferIsValid(prevbuf)) { + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + Assert(prevopaque->hasho_bucket == bucket); + prevopaque->hasho_nextblkno = nextblkno; + MarkBufferDirty(prevbuf); + } + if (BufferIsValid(nextbuf)) { + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + Assert(nextopaque->hasho_bucket == bucket); + nextopaque->hasho_prevblkno = prevblkno; + MarkBufferDirty(nextbuf); + } + + /* Clear the bitmap bit to indicate that this overflow page is free */ + CLRBIT(freep, bitmapbit); + MarkBufferDirty(mapbuf); /* if this is now the first free page, update hashm_firstfree */ if (ovflbitno < metap->hashm_firstfree) { metap->hashm_firstfree = ovflbitno; - _hash_wrtbuf(rel, metabuf); - } else { - /* no need to change metapage */ - _hash_relbuf(rel, metabuf); + update_metap = true; + MarkBufferDirty(metabuf); } + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { + xl_hash_squeeze_page xlrec; + XLogRecPtr recptr; + int i; + + xlrec.prevblkno = prevblkno; + xlrec.nextblkno = nextblkno; + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf); + xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf); + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage); + + /* + * bucket buffer needs to be registered to ensure that we can acquire + * a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + if (xlrec.ntups > 0) { + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + } + + XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD); + + /* + * If prevpage and the writepage (block in which we are moving tuples + * from overflow) are same, then no need to separately register + * prevpage. During replay, we can directly update the nextblock in + * writepage. 
+ */ + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD); + + if (BufferIsValid(nextbuf)) + XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD); + + XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32)); + + if (update_metap) { + XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32)); + } + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE); + + if (!xlrec.is_prim_bucket_same_wrt) { + PageSetLSN(BufferGetPage(bucketbuf), recptr); + } + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(ovflbuf), recptr); + + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + PageSetLSN(BufferGetPage(prevbuf), recptr); + if (BufferIsValid(nextbuf)) + PageSetLSN(BufferGetPage(nextbuf), recptr); + + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (update_metap) + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* release previous bucket if it is not same as write bucket */ + if (BufferIsValid(prevbuf) && prevblkno != writeblkno) + _hash_relbuf(rel, prevbuf); + + if (BufferIsValid(ovflbuf)) + _hash_relbuf(rel, ovflbuf); + + if (BufferIsValid(nextbuf)) + _hash_relbuf(rel, nextbuf); + + _hash_relbuf(rel, mapbuf); + _hash_relbuf(rel, metabuf); + return nextblkno; } /* - * Initialize a new bitmap page. The metapage has a write-lock upon - * entering the function, and must be written by caller after return. + * _hash_initbitmapbuffer() * - * 'blkno' is the block number of the new bitmap page. - * - * All bits in the new bitmap page are set to "1", indicating "in use". + * Initialize a new bitmap page. All bits in the new bitmap page are set to + * "1", indicating "in use". */ -void _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, ForkNumber forkNum) +void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage) { - Buffer buf; Page pg; HashPageOpaque op; uint32 *freep = NULL; - /* - * It is okay to write-lock the new bitmap page while holding metapage - * write lock, because no one else could be contending for the new page. - * Also, the metapage lock makes it safe to extend the index using - * _hash_getnewbuf. - * - * There is some loss of concurrency in possibly doing I/O for the new - * page while holding the metapage lock, but this path is taken so seldom - * that it's not worth worrying about. 
- */ - buf = _hash_getnewbuf(rel, blkno, forkNum); pg = BufferGetPage(buf); + /* initialize the page */ + if (initpage) + _hash_pageinit(pg, BufferGetPageSize(buf)); + /* initialize the page's special space */ - op = (HashPageOpaque)PageGetSpecialPointer(pg); + op = (HashPageOpaque) PageGetSpecialPointer(pg); op->hasho_prevblkno = InvalidBlockNumber; op->hasho_nextblkno = InvalidBlockNumber; - op->hasho_bucket = INVALID_BUCKET_NUM; + op->hasho_bucket = -1; op->hasho_flag = LH_BITMAP_PAGE; op->hasho_page_id = HASHO_PAGE_ID; /* set all of the bits to 1 */ freep = HashPageGetBitmap(pg); - errno_t rc = memset_s(freep, HashGetMaxBitmapSize(pg), 0xFF, BMPGSZ_BYTE(metap)); - securec_check(rc, "", ""); + MemSet(freep, 0xFF, bmsize); - /* write out the new bitmap page (releasing write lock and pin) */ - _hash_wrtbuf(rel, buf); - - /* add the new bitmap page to the metapage's list of bitmaps */ - /* metapage already has a write lock */ - if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) - ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("out of overflow pages in hash index \"%s\"", RelationGetRelationName(rel)))); - - metap->hashm_mapp[metap->hashm_nmaps] = blkno; - - metap->hashm_nmaps++; + /* + * Set pd_lower just past the end of the bitmap page data. We could even + * set pd_lower equal to pd_upper, but this is more precise and makes the + * page look compressible to xlog.c. + */ + ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg; } + /* - * Try to squeeze the tuples onto pages occurring earlier in the - * bucket chain in an attempt to free overflow pages. When we start - * the "squeezing", the page from which we start taking tuples (the - * "read" page) is the last bucket in the bucket chain and the page - * onto which we start squeezing tuples (the "write" page) is the - * first page in the bucket chain. The read page works backward and - * the write page works forward; the procedure terminates when the - * read page and write page are the same page. + * _hash_squeezebucket(rel, bucket) * - * At completion of this procedure, it is guaranteed that all pages in - * the bucket are nonempty, unless the bucket is totally empty (in - * which case all overflow pages will be freed). The original implementation - * required that to be true on entry as well, but it's a lot easier for - * callers to leave empty overflow pages and let this guy clean it up. + * Try to squeeze the tuples onto pages occurring earlier in the + * bucket chain in an attempt to free overflow pages. When we start + * the "squeezing", the page from which we start taking tuples (the + * "read" page) is the last bucket in the bucket chain and the page + * onto which we start squeezing tuples (the "write" page) is the + * first page in the bucket chain. The read page works backward and + * the write page works forward; the procedure terminates when the + * read page and write page are the same page. * - * Caller must hold exclusive lock on the target bucket. This allows - * us to safely lock multiple pages in the bucket. + * At completion of this procedure, it is guaranteed that all pages in + * the bucket are nonempty, unless the bucket is totally empty (in + * which case all overflow pages will be freed). The original implementation + * required that to be true on entry as well, but it's a lot easier for + * callers to leave empty overflow pages and let this guy clean it up. 
* - * Since this function is invoked in VACUUM, we provide an access strategy - * parameter that controls fetches of the bucket pages. + * Caller must acquire cleanup lock on the primary page of the target + * bucket to exclude any scans that are in progress, which could easily + * be confused into returning the same tuple more than once or some tuples + * not at all by the rearrangement we are performing here. To prevent + * any concurrent scan to cross the squeeze scan we use lock chaining + * similar to hasbucketcleanup. Refer comments atop hashbucketcleanup. + * + * We need to retain a pin on the primary bucket to ensure that no concurrent + * split can start. + * + * Since this function is invoked in VACUUM, we provide an access strategy + * parameter that controls fetches of the bucket pages. */ -void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy) +void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, + Buffer bucket_buf, BufferAccessStrategy bstrategy) { BlockNumber wblkno; BlockNumber rblkno; @@ -537,20 +779,21 @@ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; - bool wbuf_dirty = false; /* - * start squeezing into the base bucket page. + * start squeezing into the primary bucket page. */ wblkno = bucket_blkno; - wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE, bstrategy); + wbuf = bucket_buf; wpage = BufferGetPage(wbuf); - wopaque = (HashPageOpaque)PageGetSpecialPointer(wpage); + wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); + /* - * if there aren't any overflow pages, there's nothing to squeeze. + * if there aren't any overflow pages, there's nothing to squeeze. caller + * is responsible for releasing the pin on primary bucket page. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) { - _hash_relbuf(rel, wbuf); + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); return; } @@ -566,76 +809,187 @@ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, rblkno = ropaque->hasho_nextblkno; if (rbuf != InvalidBuffer) _hash_relbuf(rel, rbuf); - rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); + rbuf = _hash_getbuf_with_strategy(rel, + rblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); rpage = BufferGetPage(rbuf); - ropaque = (HashPageOpaque)PageGetSpecialPointer(rpage); + ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); /* * squeeze the tuples. 
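/*
 * Toy model of the squeeze pass described above: the "write" cursor starts at
 * the head of the bucket chain, the "read" cursor at the tail, and tuples are
 * moved forward until the two meet; emptied tail pages are the ones
 * _hash_freeovflpage releases.  Everything page-specific (hashkey ordering,
 * WAL logging, lock chaining) is deliberately left out; toy_page and PAGE_CAP
 * are invented for the illustration.
 */
#include <stddef.h>
#include <stdint.h>

#define PAGE_CAP 4                  /* toy capacity; real pages hold many tuples */

struct toy_page {
    uint32_t items[PAGE_CAP];
    size_t   nitems;
};

/* Returns the number of pages retained at the front of the chain. */
static size_t
toy_squeeze(struct toy_page *chain, size_t npages)
{
    size_t w = 0;                   /* "write" page: front of the chain */
    size_t r = npages - 1;          /* "read" page: tail of the chain */

    while (w < r) {
        while (chain[r].nitems > 0 && chain[w].nitems < PAGE_CAP)
            chain[w].items[chain[w].nitems++] = chain[r].items[--chain[r].nitems];

        if (chain[w].nitems == PAGE_CAP)
            w++;                    /* write page is full: advance toward the tail */
        else
            r--;                    /* read page is empty: it could be freed */
    }
    return r + 1;
}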
*/ - wbuf_dirty = false; for (;;) { OffsetNumber roffnum; OffsetNumber maxroffnum; OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable = 0; + IndexTuple itups[MaxIndexTuplesPerPage]; + Size tups_size[MaxIndexTuplesPerPage]; + OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; + uint16 ndeletable = 0; + uint16 nitups = 0; + Size all_tups_size = 0; + int i; + bool retain_pin = false; +readpage: /* Scan each tuple in "read" page */ maxroffnum = PageGetMaxOffsetNumber(rpage); for (roffnum = FirstOffsetNumber; roffnum <= maxroffnum; roffnum = OffsetNumberNext(roffnum)) { IndexTuple itup; Size itemsz; - itup = (IndexTuple)PageGetItem(rpage, PageGetItemId(rpage, roffnum)); + /* skip dead tuples */ + if (ItemIdIsDead(PageGetItemId(rpage, roffnum))) + continue; + + itup = (IndexTuple) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* * Walk up the bucket chain, looking for a page big enough for - * this item. Exit if we reach the read page. + * this item and all other accumulated items. Exit if we reach + * the read page. */ - while (PageGetFreeSpace(wpage) < itemsz) { + while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz)) { + Buffer next_wbuf = InvalidBuffer; + bool tups_moved = false; + Assert(!PageIsEmpty(wpage)); + if (wblkno == bucket_blkno) + retain_pin = true; + wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); - if (wbuf_dirty) - _hash_wrtbuf(rel, wbuf); + /* don't need to move to next page if we reached the read page */ + if (wblkno != rblkno) + next_wbuf = _hash_getbuf_with_strategy(rel, + wblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); + + if (nitups > 0) { + Assert(nitups == ndeletable); + + /* + * This operation needs to log multiple tuples, prepare + * WAL for that. + */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(0, 3 + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being + * careful to preserve hashkey ordering. (If we insert + * many tuples into the same "write" page it would be + * worth qsort'ing them). + */ + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + + /* Delete tuples we already moved off read page */ + PageIndexMultiDelete(rpage, deletable, ndeletable); + MarkBufferDirty(rbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { + XLogRecPtr recptr; + xl_hash_move_page_contents xlrec; + + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents); + + /* + * bucket buffer needs to be registered to ensure that + * we can acquire a cleanup lock on it during replay. 
+ */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS); + + if (!xlrec.is_prim_bucket_same_wrt) { + PageSetLSN(BufferGetPage(bucket_buf), recptr); + } + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(rbuf), recptr); + } + + END_CRIT_SECTION(); + + tups_moved = true; + } + + /* + * release the lock on previous page after acquiring the lock + * on next page + */ + if (retain_pin) + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, wbuf); /* nothing more to do if we reached the read page */ if (rblkno == wblkno) { - if (ndeletable > 0) { - /* Delete tuples we already moved off read page */ - PageIndexMultiDelete(rpage, deletable, ndeletable); - _hash_wrtbuf(rel, rbuf); - } else - _hash_relbuf(rel, rbuf); + _hash_relbuf(rel, rbuf); return; } - wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); + wbuf = next_wbuf; wpage = BufferGetPage(wbuf); - wopaque = (HashPageOpaque)PageGetSpecialPointer(wpage); + wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); - wbuf_dirty = false; - } + retain_pin = false; - /* - * we have found room so insert on the "write" page, being careful - * to preserve hashkey ordering. (If we insert many tuples into - * the same "write" page it would be worth qsort'ing instead of - * doing repeated _hash_pgaddtup.) - */ - (void)_hash_pgaddtup(rel, wbuf, itemsz, itup); - wbuf_dirty = true; + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + nitups = 0; + all_tups_size = 0; + ndeletable = 0; + + /* + * after moving the tuples, rpage would have been compacted, + * so we need to rescan it. + */ + if (tups_moved) + goto readpage; + } /* remember tuple for deletion from "read" page */ deletable[ndeletable++] = roffnum; + + /* + * we need a copy of index tuples as they can be freed as part of + * overflow page, however we need them to write a WAL record in + * _hash_freeovflpage. + */ + itups[nitups] = CopyIndexTuple(itup); + tups_size[nitups++] = itemsz; + all_tups_size += itemsz; } /* @@ -647,31 +1001,36 @@ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, * Tricky point here: if our read and write pages are adjacent in the * bucket chain, our write lock on wbuf will conflict with * _hash_freeovflpage's attempt to update the sibling links of the - * removed page. However, in that case we are done anyway, so we can - * simply drop the write lock before calling _hash_freeovflpage. + * removed page. In that case, we don't need to lock it again. */ rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); + /* free this overflow page (releases rbuf) */ + _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets, + tups_size, nitups, bstrategy); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + /* are we freeing the page adjacent to wbuf? 
*/ if (rblkno == wblkno) { - /* yes, so release wbuf lock first */ - if (wbuf_dirty) - _hash_wrtbuf(rel, wbuf); + /* retain the pin on primary bucket page till end of bucket scan */ + if (wblkno == bucket_blkno) + LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, wbuf); - /* free this overflow page (releases rbuf) */ - _hash_freeovflpage(rel, rbuf, bstrategy); - /* done */ return; } - /* free this overflow page, then get the previous one */ - _hash_freeovflpage(rel, rbuf, bstrategy); - - rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); + rbuf = _hash_getbuf_with_strategy(rel, + rblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); rpage = BufferGetPage(rbuf); - ropaque = (HashPageOpaque)PageGetSpecialPointer(rpage); + ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } diff --git a/src/gausskernel/storage/access/hash/hashpage.cpp b/src/gausskernel/storage/access/hash/hashpage.cpp index cf4301a74..26e9ff224 100644 --- a/src/gausskernel/storage/access/hash/hashpage.cpp +++ b/src/gausskernel/storage/access/hash/hashpage.cpp @@ -3,8 +3,8 @@ * hashpage.cpp * Hash table page management code for the Postgres hash access method * - * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -31,14 +31,18 @@ #include "knl/knl_variable.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "access/xloginsert.h" #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/smgr.h" #include "utils/aiomem.h" static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks); -static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, BlockNumber start_oblkno, - BlockNumber start_nblkno, uint32 maxbucket, uint32 highmask, uint32 lowmask); +static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, + Bucket nbucket, Buffer obuf, Buffer nbuf, HTAB *htab, + uint32 maxbucket, uint32 highmask, uint32 lowmask); +static void log_split_page(Relation rel, Buffer buf); /* * We use high-concurrency locking on hash indexes (see README for an overview @@ -49,42 +53,6 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Buck */ #define USELOCKING(rel) (!RELATION_IS_LOCAL(rel)) -/* - * _hash_getlock() -- Acquire an lmgr lock. - * - * 'whichlock' should be zero to acquire the split-control lock, or the - * block number of a bucket's primary bucket page to acquire the per-bucket - * lock. (See README for details of the use of these locks.) - * - * 'access' must be HASH_SHARE or HASH_EXCLUSIVE. - */ -void _hash_getlock(Relation rel, BlockNumber whichlock, int access) -{ - if (USELOCKING(rel)) - LockPage(rel, whichlock, access); -} - -/* - * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free. - * - * Same as above except we return FALSE without blocking if lock isn't free. - */ -bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access) -{ - if (USELOCKING(rel)) - return ConditionalLockPage(rel, whichlock, access); - else - return true; -} - -/* - * _hash_droplock() -- Release an lmgr lock. 
- */ -void _hash_droplock(Relation rel, BlockNumber whichlock, int access) -{ - if (USELOCKING(rel)) - UnlockPage(rel, whichlock, access); -} /* * _hash_getbuf() -- Get a buffer by block number for read or write. @@ -121,18 +89,44 @@ Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags) return buf; } +/* + * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup. + * + * We read the page and try to acquire a cleanup lock. If we get it, + * we return the buffer; otherwise, we return InvalidBuffer. + */ +Buffer _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags) +{ + Buffer buf; + + if (blkno == P_NEW) + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("hash AM does not use P_NEW"))); + + buf = ReadBuffer(rel, blkno); + + if (!ConditionalLockBufferForCleanup(buf)) { + ReleaseBuffer(buf); + return InvalidBuffer; + } + + /* ref count and lock type are correct */ + _hash_checkpage(rel, buf, flags); + + return buf; +} + /* * _hash_getinitbuf() -- Get and initialize a buffer by block number. * * This must be used only to fetch pages that are known to be before * the index's filesystem EOF, but are to be filled from scratch. - * _hash_pageinit() is applied automatically. Otherwise it has + * _hash_pageinit() is applied automatically. Otherwise it has * effects similar to _hash_getbuf() with access = HASH_WRITE. - * + * * When this routine returns, a write lock is set on the * requested buffer and its reference count has been incremented * (ie, the buffer is "locked and pinned"). - * + * * P_NEW is disallowed because this routine can only be used * to access pages that are known to be before the filesystem EOF. * Extending the index should be done with _hash_getnewbuf. @@ -155,6 +149,34 @@ Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno) return buf; } +/* + * _hash_initbuf() -- Get and initialize a buffer by bucket number. + */ +void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, bool initpage) +{ + HashPageOpaque pageopaque; + Page page; + + page = BufferGetPage(buf); + + /* initialize the page */ + if (initpage) + _hash_pageinit(page, BufferGetPageSize(buf)); + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* + * Set hasho_prevblkno with current hashm_maxbucket. This value will be + * used to validate cached HashMetaPageData. See + * _hash_getbucketbuf_from_hashkey(). + */ + pageopaque->hasho_prevblkno = max_bucket; + pageopaque->hasho_nextblkno = InvalidBlockNumber; + pageopaque->hasho_bucket = num_bucket; + pageopaque->hasho_flag = flag; + pageopaque->hasho_page_id = HASHO_PAGE_ID; +} + /* * _hash_getnewbuf() -- Get a new page at the end of the index. * @@ -165,7 +187,9 @@ Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno) * EOF but before updating the metapage to reflect the added page.) * * It is caller's responsibility to ensure that only one process can - * extend the index at a time. + * extend the index at a time. In practice, this function is called + * only while holding write lock on the metapage, because adding a page + * is always associated with an update of metapage data. */ Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum) { @@ -245,48 +269,37 @@ void _hash_dropbuf(Relation rel, Buffer buf) } /* - * _hash_wrtbuf() -- write a hash page to disk. + * _hash_dropscanbuf() -- release buffers used in scan. * - * This routine releases the lock held on the buffer and our refcount - * for it. 
It is an error to call _hash_wrtbuf() without a write lock - * and a pin on the buffer. - * - * NOTE: this routine should go away when/if hash indexes are WAL-ified. - * The correct sequence of operations is to mark the buffer dirty, then - * write the WAL record, then release the lock and pin; so marking dirty - * can't be combined with releasing. + * This routine unpins the buffers used during scan on which we + * hold no lock. */ -void _hash_wrtbuf(Relation rel, Buffer buf) +void _hash_dropscanbuf(Relation rel, HashScanOpaque so) { - MarkBufferDirty(buf); - UnlockReleaseBuffer(buf); + /* release pin we hold on primary bucket page */ + if (BufferIsValid(so->hashso_bucket_buf) && so->hashso_bucket_buf != so->hashso_curbuf) + _hash_dropbuf(rel, so->hashso_bucket_buf); + so->hashso_bucket_buf = InvalidBuffer; + + /* release pin we hold on primary bucket page of bucket being split */ + if (BufferIsValid(so->hashso_split_bucket_buf) && so->hashso_split_bucket_buf != so->hashso_curbuf) + _hash_dropbuf(rel, so->hashso_split_bucket_buf); + so->hashso_split_bucket_buf = InvalidBuffer; + + /* release any pin we still hold */ + if (BufferIsValid(so->hashso_curbuf)) + _hash_dropbuf(rel, so->hashso_curbuf); + so->hashso_curbuf = InvalidBuffer; + + /* reset split scan */ + so->hashso_buc_populated = false; + so->hashso_buc_split = false; } -/* - * _hash_chgbufaccess() -- Change the lock type on a buffer, without - * dropping our pin on it. - * - * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK, - * the last indicating that no buffer-level lock is held or wanted. - * - * When from_access == HASH_WRITE, we assume the buffer is dirty and tell - * bufmgr it must be written out. If the caller wants to release a write - * lock on a page that's not been modified, it's okay to pass from_access - * as HASH_READ (a bit ugly, but handy in some places). - */ -void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access) -{ - if (from_access == HASH_WRITE) - MarkBufferDirty(buf); - if (from_access != HASH_NOLOCK) - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - if (to_access != HASH_NOLOCK) - LockBuffer(buf, to_access); -} /* - * _hash_metapinit() -- Initialize the metadata page of a hash index, - * the initial buckets, and the initial bitmap page. + * _hash_init() -- Initialize the metadata page of a hash index, + * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The @@ -296,30 +309,37 @@ void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ -uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) +uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) { - HashMetaPage metap; - HashPageOpaque pageopaque; Buffer metabuf; Buffer buf; + Buffer bitmapbuf; Page pg; + HashMetaPage metap; + RegProcedure procid; uint32 data_width; uint32 item_width; uint32 ffactor; - double dnumbuckets; uint32 num_buckets; - uint32 log2_num_buckets; uint32 i; + bool use_wal = false; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)))); + /* + * WAL log creation of pages if the relation is persistent, or this is the + * init fork. 
Init forks for unlogged relations always need to be WAL + * logged. + */ + use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM; + /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full - * as the user-settable fillfactor parameter says. We can compute it + * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); @@ -330,55 +350,189 @@ uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) if (ffactor < 10) ffactor = 10; - /* - * Choose the number of initial bucket pages to match the fill factor - * given the estimated number of tuples. We round up the result to the - * next power of 2, however, and always force at least 2 bucket pages. The - * upper limit is determined by considerations explained in _hash_expandtable(). - */ - dnumbuckets = num_tuples / ffactor; - if (dnumbuckets <= 2.0) - num_buckets = 2; - else if (dnumbuckets >= (double)0x40000000) - num_buckets = 0x40000000; - else - num_buckets = ((uint32)1) << _hash_log2((uint32)dnumbuckets); - - log2_num_buckets = _hash_log2(num_buckets); - Assert(num_buckets == (((uint32)1) << log2_num_buckets)); - Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS); + procid = index_getprocid(rel, 1, HASHPROC); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() - * calls to occur. This ensures that the smgr level has the right idea of + * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. + * + * Critical section not required, because on error the creation of the + * whole relation will be rolled back. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); - pg = BufferGetPage(metabuf); + _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); + MarkBufferDirty(metabuf); - pageopaque = (HashPageOpaque)PageGetSpecialPointer(pg); + pg = BufferGetPage(metabuf); + metap = HashPageGetMeta(pg); + + /* XLOG stuff */ + if (use_wal) { + xl_hash_init_meta_page xlrec; + XLogRecPtr recptr; + + xlrec.num_tuples = num_tuples; + xlrec.procid = metap->hashm_procid; + xlrec.ffactor = metap->hashm_ffactor; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); + + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + num_buckets = metap->hashm_maxbucket + 1; + + /* + * Release buffer lock on the metapage while we initialize buckets. + * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS + * won't accomplish anything. It's a bad idea to hold buffer locks for + * long intervals in any case, since that can block the bgwriter. 
+ */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* + * Initialize and WAL Log the first N buckets + */ + for (i = 0; i < num_buckets; i++) { + BlockNumber blkno; + + /* Allow interrupts, in case N is huge */ + CHECK_FOR_INTERRUPTS(); + + blkno = BUCKET_TO_BLKNO(metap, i); + buf = _hash_getnewbuf(rel, blkno, forkNum); + _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); + MarkBufferDirty(buf); + + if (use_wal) + log_newpage(&rel->rd_node, + forkNum, + blkno, + BufferGetPage(buf), + true); + _hash_relbuf(rel, buf); + } + + /* Now reacquire buffer lock on metapage */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Initialize bitmap page + */ + bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); + _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); + MarkBufferDirty(bitmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); + + metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; + + metap->hashm_nmaps++; + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (use_wal) { + xl_hash_init_bitmap_page xlrec; + XLogRecPtr recptr; + + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); + XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); + + /* + * This is safe only because nobody else can be modifying the index at + * this stage; it's only visible to the transaction that is creating + * it. + */ + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); + + PageSetLSN(BufferGetPage(bitmapbuf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + /* all done */ + _hash_relbuf(rel, bitmapbuf); + _hash_relbuf(rel, metabuf); + + return num_buckets; +} + +/* + * _hash_init_metabuffer() -- Initialize the metadata page of a hash index. + */ +void _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, + uint16 ffactor, bool initpage) +{ + HashMetaPage metap; + HashPageOpaque pageopaque; + Page page; + double dnumbuckets; + uint32 num_buckets; + uint32 spare_index; + uint32 i; + + /* + * Choose the number of initial bucket pages to match the fill factor + * given the estimated number of tuples. We round up the result to the + * total number of buckets which has to be allocated before using its + * _hashm_spare element. However always force at least 2 bucket pages. The + * upper limit is determined by considerations explained in + * _hash_expandtable(). 
+ */ + Assert(ffactor != 0); + dnumbuckets = num_tuples / ffactor; + if (dnumbuckets <= 2.0) + num_buckets = 2; + else if (dnumbuckets >= (double) 0x40000000) + num_buckets = 0x40000000; + else + num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets)); + + spare_index = _hash_spareindex(num_buckets); + Assert(spare_index < HASH_MAX_SPLITPOINTS); + + page = BufferGetPage(buf); + if (initpage) + _hash_pageinit(page, BufferGetPageSize(buf)); + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; - pageopaque->hasho_bucket = INVALID_BUCKET_NUM; + pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; - metap = HashPageGetMeta(pg); + metap = HashPageGetMeta(page); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; - metap->hashm_ffactor = (uint16)ffactor; - metap->hashm_bsize = (uint16)HashGetMaxBitmapSize(pg); + metap->hashm_ffactor = ffactor; + metap->hashm_bsize = HashGetMaxBitmapSize(page); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { - if (((uint32)1 << i) <= metap->hashm_bsize) + if ((1 << i) <= metap->hashm_bsize) break; } Assert(i > 0); - metap->hashm_bmsize = (uint32)1 << i; + metap->hashm_bmsize = 1 << i; metap->hashm_bmshift = i + BYTE_TO_BIT; Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); @@ -387,15 +541,20 @@ uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) * pretty useless for normal operation (in fact, hashm_procid is not used * anywhere), but it might be handy for forensic purposes so we keep it. */ - metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); + metap->hashm_procid = procid; /* * We initialize the index with N buckets, 0 .. N-1, occupying physical - * blocks 1 to N. The first freespace bitmap page is in block N+1. Since - * N is a power of 2, we can set the masks this way: + * blocks 1 to N. The first freespace bitmap page is in block N+1. */ - metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1; - metap->hashm_highmask = (num_buckets << 1) - 1; + metap->hashm_maxbucket = num_buckets - 1; + + /* + * Set highmask as next immediate ((2 ^ x) - 1), which should be + * sufficient to cover num_buckets. + */ + metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1; + metap->hashm_lowmask = (metap->hashm_highmask >> 1); errno_t ret = memset_s(metap->hashm_spares, sizeof(metap->hashm_spares), 0, sizeof(metap->hashm_spares)); securec_check(ret, "", ""); @@ -403,65 +562,34 @@ uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) securec_check(ret, "", ""); /* Set up mapping for one spare page after the initial splitpoints */ - metap->hashm_spares[log2_num_buckets] = 1; - metap->hashm_ovflpoint = log2_num_buckets; + metap->hashm_spares[spare_index] = 1; + metap->hashm_ovflpoint = spare_index; metap->hashm_firstfree = 0; /* - * Release buffer lock on the metapage while we initialize buckets. - * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS - * won't accomplish anything. It's a bad idea to hold buffer locks for - * long intervals in any case, since that can block the bgwriter. + * Set pd_lower just past the end of the metadata. This is to log full + * page image of metapage in xloginsert.c. 
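/*
 * Self-contained sketch of how the masks initialized above steer hash codes
 * to buckets, mirroring the existing helper _hash_hashkey2bucket and the new
 * highmask/lowmask setup; ceil_log2() plays the role of _hash_log2.
 */
#include <stdint.h>

/* Exponent of the smallest power of two >= n (the role of _hash_log2). */
static uint32_t
ceil_log2(uint32_t n)
{
    uint32_t i = 0;
    uint32_t limit = 1;

    while (limit < n) {
        limit <<= 1;
        i++;
    }
    return i;
}

/* Mask setup as in the patched _hash_init_metabuffer(), for num_buckets >= 2. */
static void
init_masks(uint32_t num_buckets, uint32_t *maxbucket,
           uint32_t *highmask, uint32_t *lowmask)
{
    *maxbucket = num_buckets - 1;
    *highmask = (1u << ceil_log2(num_buckets + 1)) - 1; /* next (2^x)-1 covering num_buckets */
    *lowmask = *highmask >> 1;
}

/*
 * Map a hash code to a bucket the way _hash_hashkey2bucket does: mask with
 * highmask, and if that lands beyond the last bucket that exists so far,
 * fall back to lowmask (that bucket has not been split yet).
 */
static uint32_t
hashkey_to_bucket(uint32_t hashkey, uint32_t maxbucket,
                  uint32_t highmask, uint32_t lowmask)
{
    uint32_t bucket = hashkey & highmask;

    if (bucket > maxbucket)
        bucket &= lowmask;
    return bucket;
}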
*/ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); - - /* - * Initialize the first N buckets - */ - for (i = 0; i < num_buckets; i++) { - /* Allow interrupts, in case N is huge */ - CHECK_FOR_INTERRUPTS(); - - buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); - pg = BufferGetPage(buf); - pageopaque = (HashPageOpaque)PageGetSpecialPointer(pg); - pageopaque->hasho_prevblkno = InvalidBlockNumber; - pageopaque->hasho_nextblkno = InvalidBlockNumber; - pageopaque->hasho_bucket = i; - pageopaque->hasho_flag = LH_BUCKET_PAGE; - pageopaque->hasho_page_id = HASHO_PAGE_ID; - _hash_wrtbuf(rel, buf); - } - - /* Now reacquire buffer lock on metapage */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); - - /* - * Initialize first bitmap page - */ - _hash_initbitmap(rel, metap, num_buckets + 1, forkNum); - - /* all done */ - _hash_wrtbuf(rel, metabuf); - - return num_buckets; + ((PageHeader) page)->pd_lower = + ((char *) metap + sizeof(HashMetaPageData)) - (char *) page; } /* - * _hash_pageinit() -- Initialize a new hash index page. + * _hash_pageinit() -- Initialize a new hash index page. */ void _hash_pageinit(Page page, Size size) { - Assert(PageIsNew(page)); PageInit(page, size, sizeof(HashPageOpaqueData)); } /* * Attempt to expand the hash table by creating one new bucket. * - * This will silently do nothing if it cannot get the needed locks. + * This will silently do nothing if we don't get cleanup lock on old or + * new bucket. * - * The caller should hold no locks on the hash index. + * Complete the pending splits and remove the tuples from old bucket, + * if there are any left over from the previous split. * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. @@ -474,37 +602,36 @@ void _hash_expandtable(Relation rel, Buffer metabuf) uint32 spare_ndx; BlockNumber start_oblkno; BlockNumber start_nblkno; + Buffer buf_nblkno; + Buffer buf_oblkno; + Page opage; + Page npage; + HashPageOpaque oopaque; + HashPageOpaque nopaque; uint32 maxbucket; uint32 highmask; uint32 lowmask; + bool metap_update_masks = false; + bool metap_update_splitpoint = false; + +restart_expand: /* - * Obtain the page-zero lock to assert the right to begin a split (see - * README). - * - * Note: deadlock should be impossible here. Our own backend could only be - * holding bucket sharelocks due to stopped indexscans; those will not - * block other holders of the page-zero lock, who are only interested in - * acquiring bucket sharelocks themselves. Exclusive bucket locks are - * only taken here and in hashbulkdelete, and neither of these operations - * needs any additional locks to complete. (If, due to some flaw in this - * reasoning, we manage to deadlock anyway, it's okay to error out; the - * index will be left in a consistent state.) + * Write-lock the meta page. It used to be necessary to acquire a + * heavyweight lock to begin a split, but that is no longer required. */ - _hash_getlock(rel, 0, HASH_EXCLUSIVE); - - /* Write-lock the meta page */ - _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); + /* * Check to see if split is still needed; someone else might have already * done one while we waited for the lock. 
* * Make sure this stays in sync with _hash_doinsert() */ - if (metap->hashm_ntuples <= (double)metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) + if (metap->hashm_ntuples <= (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) goto fail; /* @@ -519,17 +646,23 @@ void _hash_expandtable(Relation rel, Buffer metabuf) * _hash_alloc_buckets() would fail, but if we supported buckets smaller * than a disk block then this would be an independent constraint. * - * If you change this, see also the maximum initial number of buckets in _hash_metapinit(). + * If you change this, see also the maximum initial number of buckets in + * _hash_init(). */ - if (metap->hashm_maxbucket >= (uint32)0x7FFFFFFE) + if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) goto fail; /* - * Determine which bucket is to be split, and attempt to lock the old - * bucket. If we can't get the lock, give up. + * Determine which bucket is to be split, and attempt to take cleanup lock + * on the old bucket. If we can't get the lock, give up. * - * The lock protects us against other backends, but not against our own - * backend. Must check for active scans separately. + * The cleanup lock protects us not only against other backends, but + * against our own backend as well. + * + * The cleanup lock is mainly to protect the split from concurrent + * inserts. See src/backend/access/hash/README, Lock Definitions for + * further details. Due to this locking restriction, if there is any + * pending scan, the split will give up which is not good, but harmless. */ new_bucket = metap->hashm_maxbucket + 1; @@ -537,14 +670,84 @@ void _hash_expandtable(Relation rel, Buffer metabuf) start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); - if (_hash_has_active_scan(rel, old_bucket)) + buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE); + if (!buf_oblkno) goto fail; - if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE)) - goto fail; + opage = BufferGetPage(buf_oblkno); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* - * Likewise lock the new bucket (should never fail). + * We want to finish the split from a bucket as there is no apparent + * benefit by not doing so and it will make the code complicated to finish + * the split that involves multiple buckets considering the case where new + * split also fails. We don't need to consider the new bucket for + * completing the split here as it is not possible that a re-split of new + * bucket starts when there is still a pending split from old bucket. + */ + if (H_BUCKET_BEING_SPLIT(oopaque)) { + /* + * Copy bucket mapping info now; refer the comment in code below where + * we copy this information before calling _hash_splitbucket to see + * why this is okay. + */ + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + /* + * Release the lock on metapage and old_bucket, before completing the + * split. + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK); + + _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket, + highmask, lowmask); + + /* release the pin on old buffer and retry for expand. */ + _hash_dropbuf(rel, buf_oblkno); + + goto restart_expand; + } + + /* + * Clean the tuples remained from the previous split. This operation + * requires cleanup lock and we already have one on the old bucket, so + * let's do it. We also don't want to allow further splits from the bucket + * till the garbage of previous split is cleaned. 
This has two + * advantages; first, it helps in avoiding the bloat due to garbage and + * second is, during cleanup of bucket, we are always sure that the + * garbage tuples belong to most recently split bucket. On the contrary, + * if we allow cleanup of bucket after meta page is updated to indicate + * the new split and before the actual split, the cleanup operation won't + * be able to decide whether the tuple has been moved to the newly created + * bucket and ended up deleting such tuples. + */ + if (H_NEEDS_SPLIT_CLEANUP(oopaque)) { + /* + * Copy bucket mapping info now; refer to the comment in code below + * where we copy this information before calling _hash_splitbucket to + * see why this is okay. + */ + maxbucket = metap->hashm_maxbucket; + highmask = metap->hashm_highmask; + lowmask = metap->hashm_lowmask; + + /* Release the metapage lock. */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL, + maxbucket, highmask, lowmask, NULL, NULL, true, + NULL, NULL); + + _hash_dropbuf(rel, buf_oblkno); + + goto restart_expand; + } + + /* + * There shouldn't be any active scan on new bucket. * * Note: it is safe to compute the new bucket's blkno here, even though we * may still need to update the BUCKET_TO_BLKNO mapping. This is because @@ -553,89 +756,168 @@ void _hash_expandtable(Relation rel, Buffer metabuf) */ start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); - if (_hash_has_active_scan(rel, new_bucket)) - ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("scan in progress on supposedly new bucket"))); - - if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE)) - ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not get lock on supposedly new bucket"))); /* - * If the split point is increasing (hashm_maxbucket's log base 2 - * increases), we need to allocate a new batch of bucket pages. + * If the split point is increasing we need to allocate a new batch of + * bucket pages. */ - spare_ndx = _hash_log2(new_bucket + 1); + spare_ndx = _hash_spareindex(new_bucket + 1); if (spare_ndx > metap->hashm_ovflpoint) { + uint32 buckets_to_add; + Assert(spare_ndx == metap->hashm_ovflpoint + 1); /* - * The number of buckets in the new splitpoint is equal to the total - * number already in existence, i.e. new_bucket. Currently this maps - * one-to-one to blocks required, but someday we may need a more - * complicated calculation here. + * We treat allocation of buckets as a separate WAL-logged action. + * Even if we fail after this operation, won't leak bucket pages; + * rather, the next split will consume this space. In any case, even + * without failure we don't use all the space in one split operation. */ - if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) { + buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket; + if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add)) { /* can't split due to BlockNumber overflow */ - _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); - _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); + _hash_relbuf(rel, buf_oblkno); goto fail; } } /* - * Okay to proceed with split. Update the metapage bucket mapping info. - * - * Since we are scribbling on the metapage data right in the shared - * buffer, any failure in this next little bit leaves us with a big + * Physically allocate the new bucket's primary page. We want to do this + * before changing the metapage's mapping info, in case we can't get the + * disk space. 
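+     * (If that allocation fails, the metapage mapping has not been changed
+     * yet, so the index simply stays at its previous size.)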
Ideally, we don't need to check for cleanup lock on new + * bucket as no other backend could find this bucket unless meta page is + * updated. However, it is good to be consistent with old bucket locking. + */ + buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM); + if (!IsBufferCleanupOK(buf_nblkno)) { + _hash_relbuf(rel, buf_oblkno); + _hash_relbuf(rel, buf_nblkno); + goto fail; + } + + /* + * Since we are scribbling on the pages in the shared buffers, establish a + * critical section. Any failure in this next code leaves us with a big * problem: the metapage is effectively corrupt but could get written back - * to disk. We don't really expect any failure, but just to be sure, - * establish a critical section. + * to disk. */ START_CRIT_SECTION(); + /* + * Okay to proceed with split. Update the metapage bucket mapping info. + */ metap->hashm_maxbucket = new_bucket; if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ metap->hashm_lowmask = metap->hashm_highmask; metap->hashm_highmask = new_bucket | metap->hashm_lowmask; + metap_update_masks = true; } /* - * If the split point is increasing (hashm_maxbucket's log base 2 - * increases), we need to adjust the hashm_spares[] array and - * hashm_ovflpoint so that future overflow pages will be created beyond - * this new batch of bucket pages. + * If the split point is increasing we need to adjust the hashm_spares[] + * array and hashm_ovflpoint so that future overflow pages will be created + * beyond this new batch of bucket pages. */ if (spare_ndx > metap->hashm_ovflpoint) { metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; + metap_update_splitpoint = true; } - /* Done mucking with metapage */ - END_CRIT_SECTION(); + MarkBufferDirty(metabuf); /* * Copy bucket mapping info now; this saves re-accessing the meta page * inside _hash_splitbucket's inner loop. Note that once we drop the * split lock, other splits could begin, so these values might be out of - * date before _hash_splitbucket finishes. That's okay, since all it + * date before _hash_splitbucket finishes. That's okay, since all it * needs is to tell which of these two buckets to map hashkeys into. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; - /* Write out the metapage and drop lock, but keep pin */ - _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); + opage = BufferGetPage(buf_oblkno); + oopaque = (HashPageOpaque)PageGetSpecialPointer(opage); - /* Release split lock; okay for other splits to occur now */ - _hash_droplock(rel, 0, HASH_EXCLUSIVE); + /* + * Mark the old bucket to indicate that split is in progress. (At + * operation end, we will clear the split-in-progress flag.) Also, for a + * primary bucket page, hasho_prevblkno stores the number of buckets that + * existed as of the last split, so we must update that value here. + */ + oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; + oopaque->hasho_prevblkno = maxbucket; + + MarkBufferDirty(buf_oblkno); + + npage = BufferGetPage(buf_nblkno); + + /* + * initialize the new bucket's primary page and mark it to indicate that + * split is in progress. 
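+     * (LH_BUCKET_BEING_POPULATED is cleared again in _hash_splitbucket once
+     * all qualifying tuples have been relocated.)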
+ */ + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nopaque->hasho_prevblkno = maxbucket; + nopaque->hasho_nextblkno = InvalidBlockNumber; + nopaque->hasho_bucket = new_bucket; + nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; + nopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(buf_nblkno); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { + xl_hash_split_allocate_page xlrec; + XLogRecPtr recptr; + + xlrec.new_bucket = maxbucket; + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + xlrec.flags = 0; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD); + XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT); + XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); + + if (metap_update_masks) { + xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS; + XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32)); + XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32)); + } + + if (metap_update_splitpoint) { + xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT; + XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint, sizeof(uint32)); + XLogRegisterBufData(2, (char *) &metap->hashm_spares[metap->hashm_ovflpoint], sizeof(uint32)); + } + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE); + + PageSetLSN(BufferGetPage(buf_oblkno), recptr); + PageSetLSN(BufferGetPage(buf_nblkno), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* drop lock, but keep pin */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* Relocate records to the new bucket */ - _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, start_oblkno, start_nblkno, maxbucket, highmask, lowmask); + _hash_splitbucket(rel, metabuf, + old_bucket, new_bucket, + buf_oblkno, buf_nblkno, NULL, + maxbucket, highmask, lowmask); - /* Release bucket locks, allowing others to access them */ - _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); - _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); + /* all done, now release the pins on primary buckets. */ + _hash_dropbuf(rel, buf_oblkno); + _hash_dropbuf(rel, buf_nblkno); return; @@ -643,12 +925,10 @@ void _hash_expandtable(Relation rel, Buffer metabuf) fail: /* We didn't write the metapage, so just drop lock */ - _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); - - /* Release split lock */ - _hash_droplock(rel, 0, HASH_EXCLUSIVE); + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); } + /* * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages * @@ -665,7 +945,7 @@ fail: * hash indexes sequentially anyway, that probably doesn't matter. * * XXX It's annoying that this code is executed with the metapage lock held. - * We need to interlock against _hash_getovflpage() adding a new overflow page + * We need to interlock against _hash_addovflpage() adding a new overflow page * concurrently, but it'd likely be better to use LockRelationForExtension * for the purpose. OTOH, adding a splitpoint is a very infrequent operation, * so it may not be worth worrying about. 
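+ * (In this WAL-logged version, only the batch's last block is physically
+ * written, and logged with log_newpage(); the earlier blocks are initialized
+ * when they are first used.)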
@@ -676,7 +956,9 @@ fail: static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; - char *zerobuf = NULL; + char zerobuf[BLCKSZ]; + Page page; + HashPageOpaque ovflopaque; lastblock = firstblock + nblocks - 1; /* @@ -686,68 +968,92 @@ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nbl if (lastblock < firstblock || lastblock == InvalidBlockNumber) return false; - zerobuf = (char *)adio_align_alloc(BLCKSZ); - if (zerobuf != NULL) { - MemSet(zerobuf, 0, BLCKSZ); - } + page = (Page)zerobuf; + + /* + * Initialize the page. Just zeroing the page won't work; see + * _hash_freeovflpage for similar usage. We take care to make the special + * space valid for the benefit of tools such as pageinspect. + */ + _hash_pageinit(page, BLCKSZ); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = -1; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + if (RelationNeedsWAL(rel)) + log_newpage(&rel->rd_node, + MAIN_FORKNUM, + lastblock, + zerobuf, + true); RelationOpenSmgr(rel); PageSetChecksumInplace(zerobuf, lastblock); smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false); - adio_align_free(zerobuf); - return true; } + /* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * - * We are splitting a bucket that consists of a base bucket page and zero - * or more overflow (bucket chain) pages. We must relocate tuples that - * belong in the new bucket, and compress out any free space in the old + * This routine is used to partition the tuples between old and new bucket and + * is used to finish the incomplete split operations. To finish the previously + * interrupted split operation, the caller needs to fill htab. If htab is set, + * then we skip the movement of tuples that exists in htab, otherwise NULL + * value of htab indicates movement of all the tuples that belong to the new * bucket. * - * The caller must hold exclusive locks on both buckets to ensure that + * We are splitting a bucket that consists of a base bucket page and zero + * or more overflow (bucket chain) pages. We must relocate tuples that + * belong in the new bucket. + * + * The caller must hold cleanup locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) + * + * Split needs to retain pin on primary bucket pages of both old and new + * buckets till end of operation. This is to prevent vacuum from starting + * while a split is in progress. + * + * In addition, the caller must have created the new bucket's base page, + * which is passed in buffer nbuf, pinned and write-locked. The lock will be + * released here and pin must be released by the caller. (The API is set up + * this way because we must do _hash_getnewbuf() before releasing the metapage + * write lock. So instead of passing the new bucket's start block number, we + * pass an actual buffer.) 
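+ *
+ * (Each tuple's destination is recomputed with _hash_hashkey2bucket(hashkey,
+ * maxbucket, highmask, lowmask); with the mask values captured by the caller,
+ * the answer is always either obucket or nbucket.)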
*/ -static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, BlockNumber start_oblkno, - BlockNumber start_nblkno, uint32 maxbucket, uint32 highmask, uint32 lowmask) +static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, Buffer obuf, + Buffer nbuf, HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask) { - BlockNumber oblkno; - BlockNumber nblkno; - Buffer obuf; - Buffer nbuf; + Buffer bucket_obuf; + Buffer bucket_nbuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; + OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; + IndexTuple itups[MaxIndexTuplesPerPage]; + Size all_tups_size = 0; + int i; + uint16 nitups = 0; - /* - * It should be okay to simultaneously write-lock pages from each bucket, - * since no one else can be trying to acquire buffer lock on pages of - * either bucket. - */ - oblkno = start_oblkno; - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE); + bucket_obuf = obuf; opage = BufferGetPage(obuf); - oopaque = (HashPageOpaque)PageGetSpecialPointer(opage); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - nblkno = start_nblkno; - nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM); + bucket_nbuf = nbuf; npage = BufferGetPage(nbuf); - - /* initialize the new bucket's primary page */ - nopaque = (HashPageOpaque)PageGetSpecialPointer(npage); - nopaque->hasho_prevblkno = InvalidBlockNumber; - nopaque->hasho_nextblkno = InvalidBlockNumber; - nopaque->hasho_bucket = nbucket; - nopaque->hasho_flag = LH_BUCKET_PAGE; - nopaque->hasho_page_id = HASHO_PAGE_ID; + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); /* * Partition the tuples in the old bucket between the old bucket and the @@ -756,10 +1062,9 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Buck * once per page in old bucket. */ for (;;) { + BlockNumber oblkno; OffsetNumber ooffnum; OffsetNumber omaxoffnum; - OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable = 0; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); @@ -767,43 +1072,83 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Buck IndexTuple itup; Size itemsz; Bucket bucket; + bool found = false; + + /* skip dead tuples */ + if (ItemIdIsDead(PageGetItemId(opage, ooffnum))) + continue; /* - * Fetch the item's hash key (conveniently stored in the item) and - * determine which bucket it now belongs in. + * Before inserting a tuple, probe the hash table containing TIDs + * of tuples belonging to new bucket, if we find a match, then + * skip that tuple, else fetch the item's hash key (conveniently + * stored in the item) and determine which bucket it now belongs + * in. */ - itup = (IndexTuple)PageGetItem(opage, PageGetItemId(opage, ooffnum)); - bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); + itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); + + if (htab) + (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); + + if (found) + continue; + + bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), + maxbucket, highmask, lowmask); + if (bucket == nbucket) { + IndexTuple new_itup; + + /* + * make a copy of index tuple as we have to scribble on it. + */ + new_itup = CopyIndexTuple(itup); + + /* + * mark the index tuple as moved by split, such tuples are + * skipped by scan if there is split in progress for a bucket. 
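+                 * (INDEX_MOVED_BY_SPLIT_MASK occupies the t_info bit reserved
+                 * for index AMs, so setting it changes neither the tuple's
+                 * size nor its hash key.)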
+ */ + new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; + /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ - itemsz = IndexTupleDSize(*itup); + itemsz = IndexTupleDSize(*new_itup); itemsz = MAXALIGN(itemsz); - if (PageGetFreeSpace(npage) < itemsz) { - /* write out nbuf and drop lock, but keep pin */ - _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); + + if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) { + /* + * Change the shared buffer state in critical section, + * otherwise any error could make it unrecoverable. + */ + START_CRIT_SECTION(); + + _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); + MarkBufferDirty(nbuf); + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + END_CRIT_SECTION(); + + /* drop lock, but keep pin */ + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + nitups = 0; + all_tups_size = 0; + /* chain to a new overflow page */ - nbuf = _hash_addovflpage(rel, metabuf, nbuf); + nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false); npage = BufferGetPage(nbuf); - /* we don't need nblkno or nopaque within the loop */ + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); } - /* - * Insert tuple on new page, using _hash_pgaddtup to ensure - * correct ordering by hashkey. This is a tad inefficient - * since we may have to shuffle itempointers repeatedly. - * Possible future improvement: accumulate all the items for - * the new page and qsort them before insertion. - */ - (void)_hash_pgaddtup(rel, nbuf, itemsz, itup); - - /* - * Mark tuple for deletion from old page. - */ - deletable[ndeletable++] = ooffnum; + itups[nitups++] = new_itup; + all_tups_size += itemsz; } else { /* * the tuple stays on this page, so nothing to do. @@ -814,35 +1159,382 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Buck oblkno = oopaque->hasho_nextblkno; - /* - * Done scanning this old page. If we moved any tuples, delete them - * from the old page. - */ - if (ndeletable > 0) { - PageIndexMultiDelete(opage, deletable, ndeletable); - _hash_wrtbuf(rel, obuf); - } else { + /* retain the pin on the old primary bucket */ + if (obuf == bucket_obuf) + LockBuffer(obuf, BUFFER_LOCK_UNLOCK); + else _hash_relbuf(rel, obuf); - } /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) { + /* + * Change the shared buffer state in critical section, otherwise + * any error could make it unrecoverable. + */ + START_CRIT_SECTION(); + + _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); + MarkBufferDirty(nbuf); + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + END_CRIT_SECTION(); + + if (nbuf == bucket_nbuf) + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, nbuf); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); break; } /* Else, advance to next old page */ - obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE); + obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); - oopaque = (HashPageOpaque)PageGetSpecialPointer(opage); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning - * the tuples. 
Before quitting, call _hash_squeezebucket to ensure the - * tuples remaining in the old bucket (including the overflow pages) are - * packed as tightly as possible. The new bucket is already tight. + * the tuples. Mark the old and new buckets to indicate split is + * finished. + * + * To avoid deadlocks due to locking order of buckets, first lock the old + * bucket and then the new bucket. */ - _hash_wrtbuf(rel, nbuf); + LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); + opage = BufferGetPage(bucket_obuf); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - _hash_squeezebucket(rel, obucket, start_oblkno, NULL); + LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE); + npage = BufferGetPage(bucket_nbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + + START_CRIT_SECTION(); + + oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; + nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; + + /* + * After the split is finished, mark the old bucket to indicate that it + * contains deletable tuples. We will clear split-cleanup flag after + * deleting such tuples either at the end of split or at the next split + * from old bucket or at the time of vacuum. + */ + oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP; + + /* + * now write the buffers, here we don't release the locks as caller is + * responsible to release locks. + */ + MarkBufferDirty(bucket_obuf); + MarkBufferDirty(bucket_nbuf); + + if (RelationNeedsWAL(rel)) { + XLogRecPtr recptr; + xl_hash_split_complete xlrec; + + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + + XLogBeginInsert(); + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); + + XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); + XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); + + PageSetLSN(BufferGetPage(bucket_obuf), recptr); + PageSetLSN(BufferGetPage(bucket_nbuf), recptr); + } + + END_CRIT_SECTION(); + + /* + * If possible, clean up the old bucket. We might not be able to do this + * if someone else has a pin on it, but if not then we can go ahead. This + * isn't absolutely necessary, but it reduces bloat; if we don't do it + * now, VACUUM will do it eventually, but maybe not until new overflow + * pages have been allocated. Note that there's no need to clean up the + * new bucket. + */ + if (IsBufferCleanupOK(bucket_obuf)) { + LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); + hashbucketcleanup(rel, obucket, bucket_obuf, + BufferGetBlockNumber(bucket_obuf), NULL, + maxbucket, highmask, lowmask, NULL, NULL, true, + NULL, NULL); + } else { + LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK); + } +} + +/* + * _hash_finish_split() -- Finish the previously interrupted split operation + * + * To complete the split operation, we form the hash table of TIDs in new + * bucket which is then used by split operation to skip tuples that are + * already moved before the split operation was previously interrupted. + * + * The caller must hold a pin, but no lock, on the metapage and old bucket's + * primary page buffer. The buffers are returned in the same state. (The + * metapage is only touched if it becomes necessary to add or remove overflow + * pages.) 
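+ *
+ * (Flow in brief: walk every page of the new bucket, remember each TID in a
+ * local hash table, then rerun _hash_splitbucket with that table so tuples
+ * that were already copied before the interruption are not copied twice.)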
+ */ +void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, + uint32 maxbucket, uint32 highmask, uint32 lowmask) +{ + HASHCTL hash_ctl; + HTAB *tidhtab; + Buffer bucket_nbuf = InvalidBuffer; + Buffer nbuf; + Page npage; + BlockNumber nblkno; + BlockNumber bucket_nblkno; + HashPageOpaque npageopaque; + Bucket nbucket; + bool found; + errno_t rc = EOK; + + /* Initialize hash tables used to track TIDs */ + rc = memset_s(&hash_ctl, sizeof(hash_ctl), 0, sizeof(hash_ctl)); + securec_check_c(rc, "", ""); + hash_ctl.keysize = sizeof(ItemPointerData); + hash_ctl.entrysize = sizeof(ItemPointerData); + hash_ctl.hcxt = CurrentMemoryContext; + + tidhtab = hash_create("bucket ctids", + 256, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket); + + /* + * Scan the new bucket and build hash table of TIDs + */ + for (;;) { + OffsetNumber noffnum; + OffsetNumber nmaxoffnum; + + nbuf = _hash_getbuf(rel, nblkno, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + + /* remember the primary bucket buffer to acquire cleanup lock on it. */ + if (nblkno == bucket_nblkno) + bucket_nbuf = nbuf; + + npage = BufferGetPage(nbuf); + npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + + /* Scan each tuple in new page */ + nmaxoffnum = PageGetMaxOffsetNumber(npage); + for (noffnum = FirstOffsetNumber; noffnum <= nmaxoffnum; noffnum = OffsetNumberNext(noffnum)) { + IndexTuple itup; + + /* Fetch the item's TID and insert it in hash table. */ + itup = (IndexTuple) PageGetItem(npage, PageGetItemId(npage, noffnum)); + + (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found); + + Assert(!found); + } + + nblkno = npageopaque->hasho_nextblkno; + + /* + * release our write lock without modifying buffer and ensure to + * retain the pin on primary bucket. + */ + if (nbuf == bucket_nbuf) + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, nbuf); + + /* Exit loop if no more overflow pages in new bucket */ + if (!BlockNumberIsValid(nblkno)) + break; + } + + /* + * Conditionally get the cleanup lock on old and new buckets to perform + * the split operation. If we don't get the cleanup locks, silently give + * up and next insertion on old bucket will try again to complete the + * split. + */ + if (!ConditionalLockBufferForCleanup(obuf)) { + hash_destroy(tidhtab); + return; + } + if (!ConditionalLockBufferForCleanup(bucket_nbuf)) { + LockBuffer(obuf, BUFFER_LOCK_UNLOCK); + hash_destroy(tidhtab); + return; + } + + npage = BufferGetPage(bucket_nbuf); + npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nbucket = npageopaque->hasho_bucket; + + _hash_splitbucket(rel, metabuf, obucket, + nbucket, obuf, bucket_nbuf, tidhtab, + maxbucket, highmask, lowmask); + + _hash_dropbuf(rel, bucket_nbuf); + hash_destroy(tidhtab); +} + +/* + * log_split_page() -- Log the split operation + * + * We log the split operation when the new page in new bucket gets full, + * so we log the entire page. + * + * 'buf' must be locked by the caller which is also responsible for unlocking + * it. + */ +static void log_split_page(Relation rel, Buffer buf) +{ + if (RelationNeedsWAL(rel)) { + XLogRecPtr recptr; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE); + + PageSetLSN(BufferGetPage(buf), recptr); + } +} + +/* + * _hash_getcachedmetap() -- Returns cached metapage data. 
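+ *
+ * (The cache is a palloc'd copy of HashMetaPageData kept in rel->rd_amcache
+ * and allocated in the index's rd_indexcxt.)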
+ *
+ * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on
+ * the metapage. If not set, we'll set it before returning if we have to
+ * refresh the cache, and return with a pin but no lock on it; caller is
+ * responsible for releasing the pin.
+ *
+ * We refresh the cache if it's not initialized yet or force_refresh is true.
+ */
+HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh)
+{
+    Page page;
+
+    Assert(metabuf);
+    if (force_refresh || rel->rd_amcache == NULL) {
+        char *cache = NULL;
+        errno_t rc = EOK;
+
+        /*
+         * It's important that we don't set rd_amcache to an invalid value.
+         * Either MemoryContextAlloc or _hash_getbuf could fail, so don't
+         * install a pointer to the newly-allocated storage in the actual
+         * relcache entry until both have succeeded.
+         */
+        if (rel->rd_amcache == NULL)
+            cache = (char*)MemoryContextAlloc(rel->rd_indexcxt, sizeof(HashMetaPageData));
+
+        /* Read the metapage. */
+        if (BufferIsValid(*metabuf))
+            LockBuffer(*metabuf, BUFFER_LOCK_SHARE);
+        else
+            *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ,
+                                    LH_META_PAGE);
+        page = BufferGetPage(*metabuf);
+
+        /* Populate the cache. */
+        if (rel->rd_amcache == NULL)
+            rel->rd_amcache = cache;
+        rc = memcpy_s(rel->rd_amcache, sizeof(HashMetaPageData), HashPageGetMeta(page), sizeof(HashMetaPageData));
+        securec_check_c(rc, "", "");
+
+        /* Release metapage lock, but keep the pin. */
+        LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK);
+    }
+
+    return (HashMetaPage) rel->rd_amcache;
+}
+
+/*
+ * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given
+ * hashkey.
+ *
+ * Bucket pages do not move or get removed once they are allocated. This gives
+ * us an opportunity to use the previously saved metapage contents to reach
+ * the target bucket buffer, instead of reading from the metapage every time.
+ * This saves one buffer access every time we want to reach the target bucket
+ * buffer, which is a very helpful saving in bufmgr traffic and contention.
+ *
+ * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the
+ * bucket buffer has to be locked for reading or writing.
+ *
+ * The out parameter cachedmetap is set with metapage contents used for
+ * hashkey to bucket buffer mapping. Some callers need this info to reach the
+ * old bucket in case of bucket split, see _hash_doinsert().
+ */
+Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access,
+                                       HashMetaPage *cachedmetap)
+{
+    HashMetaPage metap;
+    Buffer buf;
+    Buffer metabuf = InvalidBuffer;
+    Page page;
+    Bucket bucket;
+    BlockNumber blkno;
+    HashPageOpaque opaque;
+
+    /* We read from target bucket buffer, hence locking is a must. */
+    Assert(access == HASH_READ || access == HASH_WRITE);
+
+    metap = _hash_getcachedmetap(rel, &metabuf, false);
+    Assert(metap != NULL);
+
+    /*
+     * Loop until we get a lock on the correct target bucket.
+     */
+    for (;;) {
+        /*
+         * Compute the target bucket number, and convert to block number.
+         */
+        bucket = _hash_hashkey2bucket(hashkey,
+                                      metap->hashm_maxbucket,
+                                      metap->hashm_highmask,
+                                      metap->hashm_lowmask);
+
+        blkno = BUCKET_TO_BLKNO(metap, bucket);
+
+        /* Fetch the primary bucket page for the bucket */
+        buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE);
+        page = BufferGetPage(buf);
+        opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+        Assert(opaque->hasho_bucket == bucket);
+        Assert(opaque->hasho_prevblkno != InvalidBlockNumber);
+
+        /*
+         * If this bucket hasn't been split, we're done.
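+         *
+         * (Example: the cached metapage still says hashm_maxbucket = 7 while
+         * the index has since grown to 12 buckets. A key that now belongs in
+         * bucket 11 is masked down to bucket 3 by the stale masks; bucket 3
+         * recorded hasho_prevblkno = 11 when it was split, which exceeds the
+         * cached maxbucket, so the check below fails and we refresh the cache
+         * and retry.)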
+ */ + if (opaque->hasho_prevblkno <= metap->hashm_maxbucket) + break; + + /* Drop lock on this buffer, update cached metapage, and retry. */ + _hash_relbuf(rel, buf); + metap = _hash_getcachedmetap(rel, &metabuf, true); + Assert(metap != NULL); + } + + if (BufferIsValid(metabuf)) + _hash_dropbuf(rel, metabuf); + + if (cachedmetap) + *cachedmetap = metap; + + return buf; } diff --git a/src/gausskernel/storage/access/hash/hashscan.cpp b/src/gausskernel/storage/access/hash/hashscan.cpp deleted file mode 100644 index 5f012abf4..000000000 --- a/src/gausskernel/storage/access/hash/hashscan.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* ------------------------------------------------------------------------- - * - * hashscan.cpp - * manage scans on hash tables - * - * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/gausskernel/storage/access/hash/hashscan.cpp - * - * ------------------------------------------------------------------------- - */ -#include "postgres.h" -#include "knl/knl_variable.h" - -#include "access/hash.h" -#include "access/relscan.h" -#include "utils/memutils.h" -#include "utils/rel.h" -#include "utils/rel_gs.h" -#include "utils/resowner.h" - -/* - * We track all of a backend's active scans on hash indexes using a list - * of HashScanListData structs, which are allocated in t_thrd.top_mem_cxt. - * It's okay to use a long-lived context because we rely on the ResourceOwner - * mechanism to clean up unused entries after transaction or subtransaction - * abort. We can't safely keep the entries in the executor's per-query - * context, because that might be already freed before we get a chance to - * clean up the list. (XXX seems like there should be a better way to - * manage this...) - */ -typedef struct HashScanListData { - IndexScanDesc hashsl_scan; - ResourceOwner hashsl_owner; - struct HashScanListData *hashsl_next; -} HashScanListData; - -typedef HashScanListData *HashScanList; - -/* - * ReleaseResources_hash() --- clean up hash subsystem resources. - * - * This is here because it needs to touch this module's static var HashScans. - */ -void ReleaseResources_hash(void) -{ - HashScanList l = NULL; - HashScanList prev = NULL; - HashScanList next = NULL; - - /* - * Release all HashScanList items belonging to the current ResourceOwner. - * Note that we do not release the underlying IndexScanDesc; that's in - * executor memory and will go away on its own (in fact quite possibly has - * gone away already, so we mustn't try to touch it here). - * - * Note: this should be a no-op during normal query shutdown. However, in - * an abort situation ExecutorEnd is not called and so there may be open - * index scans to clean up. - */ - prev = NULL; - - for (l = u_sess->exec_cxt.HashScans; l != NULL; l = next) { - next = l->hashsl_next; - if (l->hashsl_owner == t_thrd.utils_cxt.CurrentResourceOwner) { - if (prev == NULL) - u_sess->exec_cxt.HashScans = next; - else - prev->hashsl_next = next; - - pfree(l); - /* prev does not change */ - } else - prev = l; - } -} - -/* - * _hash_regscan() -- register a new scan. 
- */ -void _hash_regscan(IndexScanDesc scan) -{ - HashScanList new_el; - - new_el = (HashScanList)MemoryContextAlloc( - SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE), sizeof(HashScanListData)); - new_el->hashsl_scan = scan; - new_el->hashsl_owner = t_thrd.utils_cxt.CurrentResourceOwner; - new_el->hashsl_next = u_sess->exec_cxt.HashScans; - u_sess->exec_cxt.HashScans = new_el; -} - -/* - * _hash_dropscan() -- drop a scan from the scan list - */ -void _hash_dropscan(IndexScanDesc scan) -{ - HashScanList chk = NULL; - HashScanList last = NULL; - - last = NULL; - for (chk = u_sess->exec_cxt.HashScans; chk != NULL && chk->hashsl_scan != scan; chk = chk->hashsl_next) - last = chk; - - if (chk == NULL) - ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("hash scan list trashed"))); - - if (last == NULL) - u_sess->exec_cxt.HashScans = chk->hashsl_next; - else - last->hashsl_next = chk->hashsl_next; - - pfree(chk); -} - -/* - * Is there an active scan in this bucket? - */ -bool _hash_has_active_scan(Relation rel, Bucket bucket) -{ - Oid relid = RelationGetRelid(rel); - HashScanList l = NULL; - - for (l = u_sess->exec_cxt.HashScans; l != NULL; l = l->hashsl_next) { - if (relid == l->hashsl_scan->indexRelation->rd_id) { - HashScanOpaque so = (HashScanOpaque)l->hashsl_scan->opaque; - - if (so->hashso_bucket_valid && so->hashso_bucket == bucket) - return true; - } - } - - return false; -} diff --git a/src/gausskernel/storage/access/hash/hashsearch.cpp b/src/gausskernel/storage/access/hash/hashsearch.cpp index 3088cd78c..b5339bbfe 100644 --- a/src/gausskernel/storage/access/hash/hashsearch.cpp +++ b/src/gausskernel/storage/access/hash/hashsearch.cpp @@ -3,8 +3,8 @@ * hashsearch.cpp * search code for postgres hash tables * - * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -64,40 +64,131 @@ bool _hash_next(IndexScanDesc scan, ScanDirection dir) } /* - * Advance to next page in a bucket, if any. + * Advance to next page in a bucket, if any. If we are scanning the bucket + * being populated during split operation then this function advances to the + * bucket being split after the last bucket page of bucket being populated. */ -static void _hash_readnext(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep) +static void _hash_readnext(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep) { BlockNumber blkno; + Relation rel = scan->indexRelation; + HashScanOpaque so = (HashScanOpaque)scan->opaque; + bool block_found = false; blkno = (*opaquep)->hasho_nextblkno; - _hash_relbuf(rel, *bufp); + + /* + * Retain the pin on primary bucket page till the end of scan. Refer the + * comments in _hash_first to know the reason of retaining pin. 
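+     * (Dropping only the lock while keeping the pin is what prevents vacuum's
+     * split-cleanup, which needs a cleanup lock, from removing tuples this
+     * scan still has to return.)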
+ */ + if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) + LockBuffer(*bufp, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, *bufp); + *bufp = InvalidBuffer; /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); if (BlockNumberIsValid(blkno)) { *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); + block_found = true; + } else if (so->hashso_buc_populated && !so->hashso_buc_split) { + /* + * end of bucket, scan bucket being split if there was a split in + * progress at the start of scan. + */ + *bufp = so->hashso_split_bucket_buf; + + /* + * buffer for bucket being split must be valid as we acquire the pin + * on it before the start of scan and retain it till end of scan. + */ + Assert(BufferIsValid(*bufp)); + + LockBuffer(*bufp, BUFFER_LOCK_SHARE); + + /* + * setting hashso_buc_split to true indicates that we are scanning + * bucket being split. + */ + so->hashso_buc_split = true; + + block_found = true; + } + + if (block_found) { *pagep = BufferGetPage(*bufp); - *opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep); + *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); } } /* - * Advance to previous page in a bucket, if any. + * Advance to previous page in a bucket, if any. If the current scan has + * started during split operation then this function advances to bucket + * being populated after the first bucket page of bucket being split. */ -static void _hash_readprev(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep) +static void _hash_readprev(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep) { BlockNumber blkno; + Relation rel = scan->indexRelation; + HashScanOpaque so = (HashScanOpaque) scan->opaque; + bool haveprevblk; + blkno = (*opaquep)->hasho_prevblkno; - _hash_relbuf(rel, *bufp); + /* + * Retain the pin on primary bucket page till the end of scan. Refer the + * comments in _hash_first to know the reason of retaining pin. + */ + if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) { + LockBuffer(*bufp, BUFFER_LOCK_UNLOCK); + haveprevblk = false; + } else { + _hash_relbuf(rel, *bufp); + haveprevblk = true; + } *bufp = InvalidBuffer; /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); - if (BlockNumberIsValid(blkno)) { - *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + if (haveprevblk) { + Assert(BlockNumberIsValid(blkno)); + *bufp = _hash_getbuf(rel, blkno, HASH_READ, + LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); *pagep = BufferGetPage(*bufp); - *opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep); + *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); + + /* + * We always maintain the pin on bucket page for whole scan operation, + * so releasing the additional pin we have acquired here. + */ + if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) + _hash_dropbuf(rel, *bufp); + } else if (so->hashso_buc_populated && so->hashso_buc_split) { + /* + * end of bucket, scan bucket being populated if there was a split in + * progress at the start of scan. + */ + *bufp = so->hashso_bucket_buf; + + /* + * buffer for bucket being populated must be valid as we acquire the + * pin on it before the start of scan and retain it till end of scan. 
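+         * (That pin is taken in _hash_first and given back through
+         * _hash_dropscanbuf once the scan is finished.)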
+ */ + Assert(BufferIsValid(*bufp)); + + LockBuffer(*bufp, BUFFER_LOCK_SHARE); + *pagep = BufferGetPage(*bufp); + *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); + + /* move to the end of bucket chain */ + while (BlockNumberIsValid((*opaquep)->hasho_nextblkno)) + _hash_readnext(scan, bufp, pagep, opaquep); + + /* + * setting hashso_buc_split to false indicates that we are scanning + * bucket being populated. + */ + so->hashso_buc_split = false; } } @@ -117,12 +208,9 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir) ScanKey cur; uint32 hashkey; Bucket bucket; - BlockNumber blkno; Buffer buf; - Buffer metabuf; Page page; HashPageOpaque opaque; - HashMetaPage metap; IndexTuple itup; ItemPointer current; OffsetNumber offnum; @@ -174,48 +262,71 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir) so->hashso_sk_hash = hashkey; - /* - * Acquire shared split lock so we can compute the target bucket safely - * (see README). - */ - _hash_getlock(rel, 0, HASH_SHARE); - - /* Read the metapage */ - metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); - metap = HashPageGetMeta(BufferGetPage(metabuf)); - - /* - * Compute the target bucket number, and convert to block number. - */ - bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); - - blkno = BUCKET_TO_BLKNO(metap, bucket); - - /* done with the metapage */ - _hash_relbuf(rel, metabuf); - - /* - * Acquire share lock on target bucket; then we can release split lock. - */ - _hash_getlock(rel, blkno, HASH_SHARE); - - _hash_droplock(rel, 0, HASH_SHARE); - - /* Update scan opaque state to show we have lock on the bucket */ - so->hashso_bucket = bucket; - so->hashso_bucket_valid = true; - so->hashso_bucket_blkno = blkno; - - /* Fetch the primary bucket page for the bucket */ - buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); + buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL); page = BufferGetPage(buf); - opaque = (HashPageOpaque)PageGetSpecialPointer(page); - Assert(opaque->hasho_bucket == bucket); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket = opaque->hasho_bucket; + + so->hashso_bucket_buf = buf; + /* + * If a bucket split is in progress, then while scanning the bucket being + * populated, we need to skip tuples that were copied from bucket being + * split. We also need to maintain a pin on the bucket being split to + * ensure that split-cleanup work done by vacuum doesn't remove tuples + * from it till this scan is done. We need to maintain a pin on the + * bucket being populated to ensure that vacuum doesn't squeeze that + * bucket till this scan is complete; otherwise, the ordering of tuples + * can't be maintained during forward and backward scans. Here, we have + * to be cautious about locking order: first, acquire the lock on bucket + * being split; then, release the lock on it but not the pin; then, + * acquire a lock on bucket being populated and again re-verify whether + * the bucket split is still in progress. Acquiring the lock on bucket + * being split first ensures that the vacuum waits for this scan to + * finish. + */ + if (H_BUCKET_BEING_POPULATED(opaque)) { + BlockNumber old_blkno; + Buffer old_buf; + + old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket); + + /* + * release the lock on new bucket and re-acquire it after acquiring + * the lock on old bucket. 
+ */ + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE); + + /* + * remember the split bucket buffer so as to use it later for + * scanning. + */ + so->hashso_split_bucket_buf = old_buf; + LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); + + LockBuffer(buf, BUFFER_LOCK_SHARE); + page = BufferGetPage(buf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + Assert(opaque->hasho_bucket == bucket); + + if (H_BUCKET_BEING_POPULATED(opaque)) { + so->hashso_buc_populated = true; + } else { + _hash_dropbuf(rel, so->hashso_split_bucket_buf); + so->hashso_split_bucket_buf = InvalidBuffer; + } + } /* If a backwards scan is requested, move to the end of the chain */ if (ScanDirectionIsBackward(dir)) { - while (BlockNumberIsValid(opaque->hasho_nextblkno)) - _hash_readnext(rel, &buf, &page, &opaque); + /* + * Backward scans that start during split needs to start from end of + * bucket being split. + */ + while (BlockNumberIsValid(opaque->hasho_nextblkno) || + (so->hashso_buc_populated && !so->hashso_buc_split)) + _hash_readnext(scan, &buf, &page, &opaque); } /* Now find the first tuple satisfying the qualification */ @@ -239,6 +350,12 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir) * false. Else, return true and set the hashso_curpos for the * scan to the right thing. * + * Here we need to ensure that if the scan has started during split, then + * skip the tuples that are moved by split while scanning bucket being + * populated and then scan the bucket being split to cover all such + * tuples. This is done to ensure that we don't miss tuples in the scans + * that are started during split. + * * 'bufP' points to the current buffer, which is pinned and read-locked. * On success exit, we have pin and read-lock on whichever page * contains the right item; on failure, we have released all buffers. 
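+ *
+ * (Tuples flagged with INDEX_MOVED_BY_SPLIT_MASK are skipped only while
+ * hashso_buc_populated is true and hashso_buc_split is false, i.e. while we
+ * are still reading the bucket being populated.)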
@@ -283,9 +400,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) do { switch (dir) { case ForwardScanDirection: - if (offnum != InvalidOffsetNumber) + if (offnum != InvalidOffsetNumber) { offnum = OffsetNumberNext(offnum); /* move forward */ - else { + } else { /* new page, locate starting position by binary search */ offnum = _hash_binsearch(page, so->hashso_sk_hash); } @@ -298,14 +415,27 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) if (offnum <= maxoff) { Assert(offnum >= FirstOffsetNumber); itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum)); + /* + * skip the tuples that are moved by split operation + * for the scan that has started when split was in + * progress + */ + if (so->hashso_buc_populated && !so->hashso_buc_split && + (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) { + offnum = OffsetNumberNext(offnum); /* move forward */ + continue; + } + if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } - + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _hash_kill_items(scan); /* * ran off the end of this page, try the next */ - _hash_readnext(rel, &buf, &page, &opaque); + _hash_readnext(scan, &buf, &page, &opaque); if (BufferIsValid(buf)) { maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch(page, so->hashso_sk_hash); @@ -318,9 +448,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) break; case BackwardScanDirection: - if (offnum != InvalidOffsetNumber) + if (offnum != InvalidOffsetNumber) { offnum = OffsetNumberPrev(offnum); /* move back */ - else { + } else { /* new page, locate starting position by binary search */ offnum = _hash_binsearch_last(page, so->hashso_sk_hash); } @@ -333,14 +463,26 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) if (offnum >= FirstOffsetNumber) { Assert(offnum <= maxoff); itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum)); + /* + * skip the tuples that are moved by split operation + * for the scan that has started when split was in + * progress + */ + if (so->hashso_buc_populated && !so->hashso_buc_split && + (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) { + offnum = OffsetNumberPrev(offnum); /* move back */ + continue; + } if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } - + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _hash_kill_items(scan); /* * ran off the end of this page, try the next */ - _hash_readprev(rel, &buf, &page, &opaque); + _hash_readprev(scan, &buf, &page, &opaque); if (BufferIsValid(buf)) { maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch_last(page, so->hashso_sk_hash); @@ -360,9 +502,16 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) } if (itup == NULL) { - /* we ran off the end of the bucket without finding a match */ + /* + * We ran off the end of the bucket without finding a match. + * Release the pin on bucket buffers. Normally, such pins are + * released at end of scan, however scrolling cursors can + * reacquire the bucket lock and pin in the same scan multiple + * times. 
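+         * (_hash_dropscanbuf below releases whatever bucket pins the scan
+         * still holds so that a later restart begins from a clean state.)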
+ */ *bufP = so->hashso_curbuf = InvalidBuffer; ItemPointerSetInvalid(current); + _hash_dropscanbuf(rel, so); return false; } diff --git a/src/gausskernel/storage/access/hash/hashsort.cpp b/src/gausskernel/storage/access/hash/hashsort.cpp index 1c64ceceb..f5a9aab4e 100644 --- a/src/gausskernel/storage/access/hash/hashsort.cpp +++ b/src/gausskernel/storage/access/hash/hashsort.cpp @@ -14,8 +14,8 @@ * plenty of locality of access. * * - * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION @@ -37,15 +37,23 @@ struct HSpool { Tuplesortstate *sortstate; /* state data for tuplesort.c */ Relation index; + /* + * We sort the hash keys based on the buckets they belong to. Below masks + * are used in _hash_hashkey2bucket to determine the bucket of given hash + * key. + */ + uint32 high_mask; + uint32 low_mask; + uint32 max_buckets; }; + /* * create and initialize a spool structure */ -HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo) +HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void *meminfo) { HSpool *hspool = (HSpool *)palloc0(sizeof(HSpool)); - uint32 hash_mask; UtilityDesc *desc = (UtilityDesc *)meminfo; int work_mem = (desc->query_mem[0] > 0) ? desc->query_mem[0] : u_sess->attr.attr_memory.maintenance_work_mem; int max_mem = (desc->query_mem[1] > 0) ? desc->query_mem[1] : 0; @@ -57,18 +65,26 @@ HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo) * num_buckets buckets in the index, the appropriate mask can be computed * as follows. * - * Note: at present, the passed-in num_buckets is always a power of 2, so - * we could just compute num_buckets - 1. We prefer not to assume that - * here, though. + * NOTE : This hash mask calculation should be in sync with similar + * calculation in _hash_init_metabuffer. */ - hash_mask = (((uint32)1) << _hash_log2(num_buckets)) - 1; + hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1; + hspool->low_mask = (hspool->high_mask >> 1); + hspool->max_buckets = num_buckets - 1; /* * We size the sort area as maintenance_work_mem rather than work_mem to * speed index creation. This should be OK since a single backend can't * run multiple index creations in parallel. */ - hspool->sortstate = tuplesort_begin_index_hash(index, hash_mask, work_mem, false, max_mem); + hspool->sortstate = tuplesort_begin_index_hash(heap, + index, + hspool->high_mask, + hspool->low_mask, + hspool->max_buckets, + work_mem, + false, + max_mem); return hspool; } @@ -94,7 +110,7 @@ void _h_spool(HSpool *hspool, ItemPointer self, Datum *values, const bool *isnul * given a spool loaded by successive calls to _h_spool, * create an entire index. 
*/ -void _h_indexbuild(HSpool *hspool) +void _h_indexbuild(HSpool *hspool, Relation heapRel) { IndexTuple itup; bool should_free = false; @@ -102,7 +118,7 @@ void _h_indexbuild(HSpool *hspool) tuplesort_performsort(hspool->sortstate); while ((itup = tuplesort_getindextuple(hspool->sortstate, true, &should_free)) != NULL) { - _hash_doinsert(hspool->index, itup); + _hash_doinsert(hspool->index, itup, heapRel); if (should_free) pfree(itup); } diff --git a/src/gausskernel/storage/access/hash/hashutil.cpp b/src/gausskernel/storage/access/hash/hashutil.cpp index 859585973..1f33f295f 100644 --- a/src/gausskernel/storage/access/hash/hashutil.cpp +++ b/src/gausskernel/storage/access/hash/hashutil.cpp @@ -3,8 +3,8 @@ * hashutil.cpp * Utility code for Postgres hash implementation. * - * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * @@ -22,7 +22,9 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/rel_gs.h" +#include "storage/buf/buf_internals.h" +#define CALC_NEW_BUCKET(old_bucket, lowmask) ((old_bucket) | ((lowmask) + 1)) /* * _hash_checkqual -- does the index tuple satisfy the scan conditions? */ @@ -133,6 +135,70 @@ uint32 _hash_log2(uint32 num) return i; } +/* + * _hash_spareindex -- returns spare index / global splitpoint phase of the bucket + */ +uint32 _hash_spareindex(uint32 num_bucket) +{ + uint32 splitpoint_group; + uint32 splitpoint_phases; + + splitpoint_group = _hash_log2(num_bucket); + + if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) + return splitpoint_group; + + /* account for single-phase groups */ + splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE; + + /* account for multi-phase groups before splitpoint_group */ + splitpoint_phases += + ((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) << + HASH_SPLITPOINT_PHASE_BITS); + + /* account for phases within current group */ + splitpoint_phases += + (((num_bucket - 1) >> + (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) & + HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */ + + return splitpoint_phases; +} + +/* + * _hash_get_totalbuckets -- returns total number of buckets allocated till + * the given splitpoint phase. 
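+ *
+ * (Illustrative values, assuming the upstream PostgreSQL constants
+ * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10 and HASH_SPLITPOINT_PHASE_BITS = 2:
+ * phases 0..9 yield 1, 2, 4, ... 512 buckets; after that each doubling is
+ * spread over four phases, so phases 10..13 yield 640, 768, 896 and 1024
+ * buckets, phase 14 yields 1280, and so on.)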
+ */ +uint32 _hash_get_totalbuckets(uint32 splitpoint_phase) +{ + uint32 splitpoint_group; + uint32 total_buckets; + uint32 phases_within_splitpoint_group; + + if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) + return (1 << splitpoint_phase); + + /* get splitpoint's group */ + splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE; + splitpoint_group += + ((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >> + HASH_SPLITPOINT_PHASE_BITS); + + /* account for buckets before splitpoint_group */ + total_buckets = (1 << (splitpoint_group - 1)); + + /* account for buckets within splitpoint_group */ + phases_within_splitpoint_group = + (((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) & + HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */ + total_buckets += + (((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) * + phases_within_splitpoint_group); + + return total_buckets; +} + + /* * _hash_checkpage -- sanity checks on the format of all hash pages * @@ -216,25 +282,36 @@ uint32 _hash_get_indextuple_hashkey(IndexTuple itup) } /* - * _hash_form_tuple - form an index tuple containing hash code only + * _hash_convert_tuple - convert raw index data to hash key + * + * Inputs: values and isnull arrays for the user data column(s) + * Outputs: values and isnull arrays for the index tuple, suitable for + * passing to index_form_tuple(). + * + * Returns true if successful, false if not (because there are null values). + * On a false result, the given data need not be indexed. + * + * Note: callers know that the index-column arrays are always of length 1. + * In principle, there could be more than one input column, though we do not + * currently support that. */ -IndexTuple _hash_form_tuple(Relation index, Datum *values, const bool *isnull) +bool _hash_convert_tuple(Relation index, + Datum *user_values, const bool *user_isnull, + Datum *index_values, bool *index_isnull) { - IndexTuple itup; uint32 hashkey; - Datum hashkeydatum; - TupleDesc hashdesc; - if (isnull[0]) { - hashkeydatum = (Datum)0; - } else { - hashkey = _hash_datum2hashkey(index, values[0]); - hashkeydatum = UInt32GetDatum(hashkey); - } - hashdesc = RelationGetDescr(index); - Assert(hashdesc->natts == 1); - itup = index_form_tuple(hashdesc, &hashkeydatum, isnull); - return itup; + /* + * We do not insert null values into hash indexes. This is okay because + * the only supported search operator is '=', and we assume it is strict. + */ + if (user_isnull[0]) + return false; + + hashkey = _hash_datum2hashkey(index, user_values[0]); + index_values[0] = UInt32GetDatum(hashkey); + index_isnull[0] = false; + return true; } /* @@ -312,3 +389,154 @@ OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value) return lower; } + +/* + * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket + * from which current (new) bucket is being split. + */ +BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket) +{ + Bucket old_bucket; + uint32 mask; + Buffer metabuf; + HashMetaPage metap; + BlockNumber blkno; + + /* + * To get the old bucket from the current bucket, we need a mask to modulo + * into lower half of table. This mask is stored in meta page as + * hashm_lowmask, but here we can't rely on the same, because we need a + * value of lowmask that was prevalent at the time when bucket split was + * started. Masking the most significant bit of new bucket would give us + * old bucket. 
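+ * For example, new bucket 13 (binary 1101) must have been split from old
+ * bucket 5 (binary 0101): fls(13) - 1 = 3, so the mask is 0111 and
+ * 13 & 0111 = 5.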
+ */ + mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1; + old_bucket = new_bucket & mask; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + blkno = BUCKET_TO_BLKNO(metap, old_bucket); + + _hash_relbuf(rel, metabuf); + + return blkno; +} + +/* + * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket + * that will be generated after split from old bucket. + * + * This is used to find the new bucket from old bucket based on current table + * half. It is mainly required to finish the incomplete splits where we are + * sure that not more than one bucket could have split in progress from old + * bucket. + */ +BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket) +{ + Bucket new_bucket; + Buffer metabuf; + HashMetaPage metap; + BlockNumber blkno; + + metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); + metap = HashPageGetMeta(BufferGetPage(metabuf)); + + new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket, + metap->hashm_lowmask, + metap->hashm_maxbucket); + blkno = BUCKET_TO_BLKNO(metap, new_bucket); + + _hash_relbuf(rel, metabuf); + + return blkno; +} + +/* + * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be + * generated after split from current (old) bucket. + * + * This is used to find the new bucket from old bucket. New bucket can be + * obtained by OR'ing old bucket with most significant bit of current table + * half (lowmask passed in this function can be used to identify msb of + * current table half). There could be multiple buckets that could have + * been split from current bucket. We need the first such bucket that exists. + * Caller must ensure that no more than one split has happened from old + * bucket. + */ +Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, + uint32 lowmask, uint32 maxbucket) +{ + Bucket new_bucket; + + new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); + if (new_bucket > maxbucket) { + lowmask = lowmask >> 1; + new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask); + } + + return new_bucket; +} + +/* + * _hash_kill_items - set LP_DEAD state for items an indexscan caller has + * told us were killed. + * + * scan->opaque, referenced locally through so, contains information about the + * current page and killed tuples thereon (generally, this should only be + * called if so->numKilled > 0). + * + * We match items by heap TID before assuming they are the right ones to + * delete. + */ +void _hash_kill_items(IndexScanDesc scan) +{ + HashScanOpaque so = (HashScanOpaque) scan->opaque; + Page page; + HashPageOpaque opaque; + OffsetNumber offnum; + OffsetNumber maxoff; + int numKilled = so->numKilled; + int i; + bool killedsomething = false; + + Assert(so->numKilled > 0); + Assert(so->killedItems != NULL); + + /* + * Always reset the scan state, so we don't look for same items on other + * pages. 
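+ * (numKilled was copied into the local variable above, so the loop below
+ * still visits every remembered item after this reset.)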
+ */ + so->numKilled = 0; + + page = BufferGetPage(so->hashso_curbuf); + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + maxoff = PageGetMaxOffsetNumber(page); + + for (i = 0; i < numKilled; i++) { + offnum = so->killedItems[i].indexOffset; + + while (offnum <= maxoff) { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple ituple = (IndexTuple)PageGetItem(page, iid); + + if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid)) { + /* found the item */ + ItemIdMarkDead(iid); + killedsomething = true; + break; /* out of inner search loop */ + } + offnum = OffsetNumberNext(offnum); + } + } + + /* + * Since this can be redone later if needed, mark as dirty hint. Whenever + * we mark anything LP_DEAD, we also set the page's + * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint. + */ + if (killedsomething) { + opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES; + MarkBufferDirtyHint(so->hashso_curbuf, true); + } +} diff --git a/src/gausskernel/storage/access/redo/redo_hash.cpp b/src/gausskernel/storage/access/redo/redo_hash.cpp index ad7f378f8..19435ec38 100644 --- a/src/gausskernel/storage/access/redo/redo_hash.cpp +++ b/src/gausskernel/storage/access/redo/redo_hash.cpp @@ -26,7 +26,9 @@ #include "knl/knl_variable.h" #include "access/hash.h" +#include "access/hash_xlog.h" #include "access/relscan.h" +#include "access/xlogutils.h" #include "access/xlogproc.h" #include "catalog/index.h" @@ -37,9 +39,1312 @@ #include "utils/rel.h" #include "utils/rel_gs.h" +static XLogRecParseState *HashXlogInitMetaPageParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + + *blocknum = 1; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_INIT_META_PAGE_NUM, recordstatehead); + + return recordstatehead; +} + +static XLogRecParseState *HashXlogInitBitmapPageParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_INIT_BITMAP_PAGE_BITMAP_NUM, recordstatehead); + + XLogRecParseState *blockstate = NULL; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_INIT_BITMAP_PAGE_META_NUM, blockstate); + + *blocknum = 2; + return recordstatehead; +} + +static XLogRecParseState *HashXlogInsertParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_INSERT_PAGE_NUM, recordstatehead); + + XLogRecParseState *blockstate = NULL; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_INSERT_META_NUM, blockstate); + + *blocknum = 2; + return recordstatehead; +} + +static XLogRecParseState *HashXlogAddOvflPageParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + BlockNumber leftblk; + BlockNumber rightblk; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); + XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); + + XLogRecParseState *recordstatehead = NULL; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + 
if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_ADD_OVFL_PAGE_OVFL_NUM, recordstatehead); + XLogRecSetAuxiBlkNumState(&recordstatehead->blockparse.extra_rec.blockdatarec, leftblk, InvalidForkNumber); + + XLogRecParseState *blockstate = NULL; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_ADD_OVFL_PAGE_LEFT_NUM, blockstate); + XLogRecSetAuxiBlkNumState(&blockstate->blockparse.extra_rec.blockdatarec, rightblk, InvalidForkNumber); + + *blocknum = 2; + + if (XLogRecHasBlockRef(record, XLOG_HASH_ADD_OVFL_PAGE_MAP_NUM)) { + (*blocknum)++; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_ADD_OVFL_PAGE_MAP_NUM, blockstate); + } + + if (XLogRecHasBlockRef(record, XLOG_HASH_ADD_OVFL_PAGE_NEWMAP_NUM)) { + (*blocknum)++; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_ADD_OVFL_PAGE_NEWMAP_NUM, blockstate); + } + + (*blocknum)++; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_ADD_OVFL_PAGE_META_NUM, blockstate); + + return recordstatehead; +} + +static XLogRecParseState *HashXlogSplitAllocatePageParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_SPLIT_ALLOCATE_PAGE_OBUK_NUM, recordstatehead); + + XLogRecParseState *blockstate = NULL; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_SPLIT_ALLOCATE_PAGE_NBUK_NUM, blockstate); + + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_SPLIT_ALLOCATE_PAGE_META_NUM, blockstate); + + *blocknum = 3; + return recordstatehead; +} + +static XLogRecParseState *HashXlogSplitPageParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_SPLIT_PAGE_NUM, recordstatehead); + + *blocknum = 1; + return recordstatehead; +} + +static XLogRecParseState *HashXlogSplitCompleteParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_SPLIT_COMPLETE_OBUK_NUM, recordstatehead); + + XLogRecParseState *blockstate = NULL; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, XLOG_HASH_SPLIT_COMPLETE_NBUK_NUM, blockstate); + + *blocknum = 2; + return recordstatehead; +} + +static XLogRecParseState *HashXlogMovePageContentsParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + XLogRecParseState *blockstate = NULL; + + 
xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record); + + *blocknum = 1; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + + if (xldata->is_prim_bucket_same_wrt) { + XLogRecSetBlockDataState(record, HASH_MOVE_ADD_BLOCK_NUM, recordstatehead); + } else { + XLogRecParseState *blockstate = NULL; + XLogRecSetBlockDataState(record, HASH_MOVE_BUK_BLOCK_NUM, recordstatehead); + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + + XLogRecSetBlockDataState(record, HASH_MOVE_ADD_BLOCK_NUM, blockstate); + (*blocknum)++; + } + + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_MOVE_DELETE_OVFL_BLOCK_NUM, blockstate); + (*blocknum)++; + + return recordstatehead; +} + +static XLogRecParseState *HashXlogSqueezePageParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + XLogRecParseState *blockstate = NULL; + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); + + *blocknum = 1; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + + if (xldata->is_prim_bucket_same_wrt) { + XLogRecSetBlockDataState(record, HASH_SQUEEZE_ADD_BLOCK_NUM, recordstatehead); + } else { + XLogRecSetBlockDataState(record, HASH_SQUEEZE_BUK_BLOCK_NUM, recordstatehead); + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_SQUEEZE_ADD_BLOCK_NUM, blockstate); + (*blocknum)++; + } + + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_SQUEEZE_INIT_OVFLBUF_BLOCK_NUM, blockstate); + + if (!xldata->is_prev_bucket_same_wrt) { + (*blocknum)++; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_SQUEEZE_UPDATE_PREV_BLOCK_NUM, blockstate); + } + + if (XLogRecHasBlockRef(record, HASH_SQUEEZE_UPDATE_NEXT_BLOCK_NUM)) { + (*blocknum)++; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_SQUEEZE_UPDATE_NEXT_BLOCK_NUM, blockstate); + } + + (*blocknum)++; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_SQUEEZE_UPDATE_BITMAP_BLOCK_NUM, blockstate); + + if (XLogRecHasBlockRef(record, HASH_SQUEEZE_UPDATE_META_BLOCK_NUM)) { + (*blocknum)++; + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_SQUEEZE_UPDATE_META_BLOCK_NUM, blockstate); + } + + return recordstatehead; +} + +static XLogRecParseState *HashXlogDeleteParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + xl_hash_delete *xldata = (xl_hash_delete *)XLogRecGetData(record); + + *blocknum = 1; + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + + if (xldata->is_primary_bucket_page) { + XLogRecSetBlockDataState(record, HASH_DELETE_OVFL_BLOCK_NUM, 
recordstatehead); + } else { + XLogRecParseState *blockstate = NULL; + XLogRecSetBlockDataState(record, HASH_DELETE_BUK_BLOCK_NUM, recordstatehead); + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + + XLogRecSetBlockDataState(record, HASH_DELETE_OVFL_BLOCK_NUM, blockstate); + (*blocknum)++; + } + + return recordstatehead; +} + +static XLogRecParseState *HashXlogSplitCleanupParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_SPLIT_CLEANUP_BLOCK_NUM, recordstatehead); + + *blocknum = 1; + return recordstatehead; +} + +static XLogRecParseState *HashXlogUpdateMetaPageParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_UPDATE_META_BLOCK_NUM, recordstatehead); + + *blocknum = 1; + return recordstatehead; +} + +static XLogRecParseState *HashXlogVacuumOnePageParseBlock(XLogReaderState *record, uint32 *blocknum) +{ + XLogRecParseState *recordstatehead = NULL; + XLogRecParseState *blockstate = NULL; + + XLogParseBufferAllocListFunc(record, &recordstatehead, NULL); + if (recordstatehead == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_VACUUM_PAGE_BLOCK_NUM, recordstatehead); + + XLogParseBufferAllocListFunc(record, &blockstate, recordstatehead); + if (blockstate == NULL) { + return NULL; + } + XLogRecSetBlockDataState(record, HASH_VACUUM_META_BLOCK_NUM, blockstate); + + *blocknum = 2; + + return recordstatehead; +} + XLogRecParseState *HashRedoParseToBlock(XLogReaderState *record, uint32 *blocknum) { + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + XLogRecParseState *recordblockstate = NULL; + *blocknum = 0; - ereport(PANIC, (errmsg("HashRedoParseToBlock: unimplemented"))); - return NULL; + switch (info) { + case XLOG_HASH_INIT_META_PAGE: + recordblockstate = HashXlogInitMetaPageParseBlock(record, blocknum); + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + recordblockstate = HashXlogInitBitmapPageParseBlock(record, blocknum); + break; + case XLOG_HASH_INSERT: + recordblockstate = HashXlogInsertParseBlock(record, blocknum); + break; + case XLOG_HASH_ADD_OVFL_PAGE: + recordblockstate = HashXlogAddOvflPageParseBlock(record, blocknum); + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + recordblockstate = HashXlogSplitAllocatePageParseBlock(record, blocknum); + break; + case XLOG_HASH_SPLIT_PAGE: + recordblockstate = HashXlogSplitPageParseBlock(record, blocknum); + break; + case XLOG_HASH_SPLIT_COMPLETE: + recordblockstate = HashXlogSplitCompleteParseBlock(record, blocknum); + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + recordblockstate = HashXlogMovePageContentsParseBlock(record, blocknum); + break; + case XLOG_HASH_SQUEEZE_PAGE: + recordblockstate = HashXlogSqueezePageParseBlock(record, blocknum); + break; + case XLOG_HASH_DELETE: + recordblockstate = HashXlogDeleteParseBlock(record, blocknum); + break; + case XLOG_HASH_SPLIT_CLEANUP: + recordblockstate = HashXlogSplitCleanupParseBlock(record, blocknum); + break; + case XLOG_HASH_UPDATE_META_PAGE: + recordblockstate = HashXlogUpdateMetaPageParseBlock(record, blocknum); + break; + case XLOG_HASH_VACUUM_ONE_PAGE: + recordblockstate = 
HashXlogVacuumOnePageParseBlock(record, blocknum); + break; + default: + ereport(PANIC, (errmsg("hash_redo_block: unknown op code %u", info))); + } + + return recordblockstate; } + +void HashRedoInitMetaPageOperatorPage(RedoBufferInfo *metabuf, void *recorddata) +{ + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *)recorddata; + _hash_init_metabuffer(metabuf->buf, xlrec->num_tuples, xlrec->procid, xlrec->ffactor, true); + PageSetLSN(metabuf->pageinfo.page, metabuf->lsn); +} + +void HashRedoInitBitmapPageOperatorBitmapPage(RedoBufferInfo *bitmapbuf, void *recorddata) +{ + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *)recorddata; + _hash_initbitmapbuffer(bitmapbuf->buf, xlrec->bmsize, true); + PageSetLSN(bitmapbuf->pageinfo.page, bitmapbuf->lsn); +} + +void HashRedoInitBitmapPageOperatorMetaPage(RedoBufferInfo *metabuf) +{ + uint32 num_buckets; + HashMetaPage metap; + + metap = HashPageGetMeta(metabuf->pageinfo.page); + num_buckets = metap->hashm_maxbucket + 1; + metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; + metap->hashm_nmaps++; + + PageSetLSN(metabuf->pageinfo.page, metabuf->lsn); +} + +void HashRedoInsertOperatorPage(RedoBufferInfo *buffer, void *recorddata, void *data, Size datalen) +{ + xl_hash_insert *xlrec = (xl_hash_insert *)recorddata; + Page page = buffer->pageinfo.page; + char *datapos = (char *)data; + + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber) { + ereport(PANIC, (errmsg("hash_xlog_insert: failed to add item"))); + } + + PageSetLSN(page, buffer->lsn); +} + +void HashRedoInsertOperatorMetaPage(RedoBufferInfo *metabuf) +{ + HashMetaPage metap; + + metap = HashPageGetMeta(metabuf->pageinfo.page); + metap->hashm_ntuples += 1; + + PageSetLSN(metabuf->pageinfo.page, metabuf->lsn); +} + +void HashRedoAddOvflPageOperatorOvflPage(RedoBufferInfo *ovflbuf, BlockNumber leftblk, void *data, Size datalen) +{ + Page ovflpage; + HashPageOpaque ovflopaque; + uint32 *num_bucket; + + num_bucket = (uint32 *)data; + Assert(datalen == sizeof(uint32)); + _hash_initbuf(ovflbuf->buf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE, true); + /* update backlink */ + ovflpage = ovflbuf->pageinfo.page; + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = leftblk; + + PageSetLSN(ovflpage, ovflbuf->lsn); +} + +void HashRedoAddOvflPageOperatorLeftPage(RedoBufferInfo *leftbuf, BlockNumber rightblk) +{ + Page leftpage; + HashPageOpaque leftopaque; + + leftpage = leftbuf->pageinfo.page; + leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); + leftopaque->hasho_nextblkno = rightblk; + + PageSetLSN(leftpage, leftbuf->lsn); +} + +void HashRedoAddOvflPageOperatorMapPage(RedoBufferInfo *mapbuf, void *data) +{ + uint32 *bitmap_page_bit = (uint32 *)data; + Page mappage = mapbuf->pageinfo.page; + uint32 *freep = NULL; + + freep = HashPageGetBitmap(mappage); + SETBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, mapbuf->lsn); +} + +void HashRedoAddOvflPageOperatorNewmapPage(RedoBufferInfo *newmapbuf, void *recorddata) +{ + xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *)recorddata; + + _hash_initbitmapbuffer(newmapbuf->buf, xlrec->bmsize, true); + + PageSetLSN(newmapbuf->pageinfo.page, newmapbuf->lsn); +} + +void HashRedoAddOvflPageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *data, Size datalen) +{ + HashMetaPage metap; + uint32 firstfree_ovflpage; + BlockNumber *newmapblk = NULL; + xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page 
*)recorddata; + errno_t rc = EOK; + + rc = memcpy_s(&firstfree_ovflpage, sizeof(uint32), data, sizeof(uint32)); + securec_check(rc, "", ""); + metap = HashPageGetMeta(metabuf->pageinfo.page); + metap->hashm_firstfree = firstfree_ovflpage; + + if (!xlrec->bmpage_found) { + metap->hashm_spares[metap->hashm_ovflpoint]++; + + if (datalen > sizeof(uint32)) { + Assert(datalen == sizeof(uint32) + sizeof(BlockNumber)); + + newmapblk = (BlockNumber *)((char *)data + sizeof(uint32)); + Assert(BlockNumberIsValid(*newmapblk)); + + metap->hashm_mapp[metap->hashm_nmaps] = *newmapblk; + metap->hashm_nmaps++; + metap->hashm_spares[metap->hashm_ovflpoint]++; + } + } + + PageSetLSN(metabuf->pageinfo.page, metabuf->lsn); +} + +void HashRedoSplitAllocatePageOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata) +{ + Page oldpage; + HashPageOpaque oldopaque; + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *)recorddata; + + oldpage = oldbukbuf->pageinfo.page; + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + + oldopaque->hasho_flag = xlrec->old_bucket_flag; + oldopaque->hasho_prevblkno = xlrec->new_bucket; + + PageSetLSN(oldpage, oldbukbuf->lsn); +} + +void HashRedoSplitAllocatePageOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata) +{ + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *)recorddata; + + _hash_initbuf(newbukbuf->buf, xlrec->new_bucket, xlrec->new_bucket, xlrec->new_bucket_flag, true); + + PageSetLSN(newbukbuf->pageinfo.page, newbukbuf->lsn); +} + +void HashRedoSplitAllocatePageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *blkdata) +{ + HashMetaPage metap; + char *data = (char *)blkdata; + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *)recorddata; + + metap = HashPageGetMeta(metabuf->pageinfo.page); + metap->hashm_maxbucket = xlrec->new_bucket; + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) { + uint32 lowmask; + uint32 *highmask = NULL; + errno_t rc = EOK; + + /* extract low and high masks. */ + rc = memcpy_s(&lowmask, sizeof(uint32), data, sizeof(uint32)); + securec_check(rc, "", ""); + highmask = (uint32 *)((char *)data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_lowmask = lowmask; + metap->hashm_highmask = *highmask; + + data += sizeof(uint32) * 2; + } + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) { + uint32 ovflpoint; + uint32 *ovflpages = NULL; + errno_t rc = EOK; + + /* extract information of overflow pages. 
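+         * The block data at this point holds the new ovflpoint followed by
+         * the spares count for that splitpoint; both fields are restored
+         * from it below.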
*/ + rc = memcpy_s(&ovflpoint, sizeof(uint32), data, sizeof(uint32)); + securec_check(rc, "", ""); + ovflpages = (uint32 *)((char *)data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_spares[ovflpoint] = *ovflpages; + metap->hashm_ovflpoint = ovflpoint; + } + + PageSetLSN(metabuf->pageinfo.page, metabuf->lsn); +} + +void HashRedoSplitCompleteOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata) +{ + Page oldpage; + HashPageOpaque oldopaque; + xl_hash_split_complete *xlrec = (xl_hash_split_complete *)recorddata; + + oldpage = oldbukbuf->pageinfo.page; + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + oldopaque->hasho_flag = xlrec->old_bucket_flag; + + PageSetLSN(oldpage, oldbukbuf->lsn); +} + +void HashRedoSplitCompleteOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata) +{ + Page newpage; + HashPageOpaque newopaque; + xl_hash_split_complete *xlrec = (xl_hash_split_complete *)recorddata; + + newpage = newbukbuf->pageinfo.page; + newopaque = (HashPageOpaque) PageGetSpecialPointer(newpage); + newopaque->hasho_flag = xlrec->new_bucket_flag; + + PageSetLSN(newpage, newbukbuf->lsn); +} + +void HashXlogMoveAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len) +{ + Page writepage = redobuffer->pageinfo.page;; + char *begin = (char *)blkdata; + char *data = (char *)blkdata; + Size datalen = len; + uint16 ninserted = 0; + + xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) (recorddata); + + if (xldata->ntups > 0) { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while ((Size)(data - begin) < datalen) { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. 
+ */ + Assert(ninserted == xldata->ntups); + + PageSetLSN(writepage, redobuffer->lsn); +} + +void HashXlogMoveDeleteOvflPageOperatorPage(RedoBufferInfo *redobuffer, void *blkdata, Size len) +{ + Page page = redobuffer->pageinfo.page;; + char *data = (char *)blkdata; + Size datalen = len; + + if (datalen > 0) { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) data; + unend = (OffsetNumber *) ((char *) data + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + PageSetLSN(page, redobuffer->lsn); +} + +/* adding item to overflow buffer(writepage) from free overflowpage */ +void HashXlogSqueezeAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len) +{ + Page writepage = redobuffer->pageinfo.page; + char *begin = (char *)blkdata; + char *data = (char *)blkdata; + Size datalen = len; + uint16 ninserted = 0; + + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) (recorddata); + + if (xldata->ntups > 0) { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while ((Size)(data - begin) < datalen) { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. + */ + Assert(ninserted == xldata->ntups); + + /* + * if the page on which are adding tuples is a page previous to freed + * overflow page, then update its nextblkno. 
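+     * (Pointing the write page's hasho_nextblkno at xldata->nextblkno is
+     * what unlinks the freed overflow page from the bucket chain here.)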
+ */ + if (xldata->is_prev_bucket_same_wrt) { + HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage); + + writeopaque->hasho_nextblkno = xldata->nextblkno; + } + + PageSetLSN(writepage, redobuffer->lsn); +} + +/* initializing free overflow page */ +void HashXlogSqueezeInitOvflbufOperatorPage(RedoBufferInfo *redobuffer, void *recorddata) +{ + Page ovflpage; + HashPageOpaque ovflopaque; + + ovflpage = redobuffer->pageinfo.page; + + _hash_pageinit(ovflpage, BufferGetPageSize(redobuffer->buf)); + + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + + ovflopaque->hasho_prevblkno = InvalidBlockNumber; + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = InvalidBucket; + ovflopaque->hasho_flag = LH_UNUSED_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + PageSetLSN(ovflpage, redobuffer->lsn); +} + +void HashXlogSqueezeUpdatePrevPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata) +{ + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) (recorddata); + + Page prevpage = redobuffer->pageinfo.page; + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + prevopaque->hasho_nextblkno = xldata->nextblkno; + + PageSetLSN(prevpage, redobuffer->lsn); +} + +void HashXlogSqueezeUpdateNextPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata) +{ + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) (recorddata); + + Page nextpage = redobuffer->pageinfo.page; + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + nextopaque->hasho_prevblkno = xldata->prevblkno; + + PageSetLSN(nextpage, redobuffer->lsn); +} + +void HashXlogSqueezeUpdateBitmapOperatorPage(RedoBufferInfo *redobuffer, void *blkdata) +{ + Page mappage = redobuffer->pageinfo.page; + uint32 *freep = NULL; + char *data = (char *)blkdata; + uint32 *bitmap_page_bit; + + freep = HashPageGetBitmap(mappage); + + bitmap_page_bit = (uint32 *) data; + + CLRBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, redobuffer->lsn); +} + +void HashXlogSqueezeUpdateMateOperatorPage(RedoBufferInfo *redobuffer, void *blkdata) +{ + HashMetaPage metap; + Page page = redobuffer->pageinfo.page; + char *data = (char *)blkdata; + uint32 *firstfree_ovflpage; + + firstfree_ovflpage = (uint32 *) data; + + metap = HashPageGetMeta(page); + metap->hashm_firstfree = *firstfree_ovflpage; + + PageSetLSN(page, redobuffer->lsn); +} + +void HashXlogDeleteBlockOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len) +{ + xl_hash_delete *xldata = (xl_hash_delete *)(recorddata); + + Page page = redobuffer->pageinfo.page; + char *datapos = (char *)blkdata; + + if (len > 0) { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) datapos; + unend = (OffsetNumber *) ((char *) datapos + len); + + if ((unend - unused) > 0) { + PageIndexMultiDelete(page, unused, unend - unused); + } + } + + /* + * Mark the page as not containing any LP_DEAD items only if + * clear_dead_marking flag is set to true. See comments in + * hashbucketcleanup() for details. 
+ */ + if (xldata->clear_dead_marking) { + HashPageOpaque pageopaque; + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + } + + PageSetLSN(page, redobuffer->lsn); +} + +void HashXlogSplitCleanupOperatorPage(RedoBufferInfo *redobuffer) +{ + Page page; + HashPageOpaque bucket_opaque; + + page = redobuffer->pageinfo.page; + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + + /* cleanup flag for finished split */ + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; + + PageSetLSN(page, redobuffer->lsn); +} + +void HashXlogUpdateMetaOperatorPage(RedoBufferInfo *redobuffer, void *recorddata) +{ + Page page; + HashMetaPage metap; + xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) (recorddata); + + page = redobuffer->pageinfo.page; + metap = HashPageGetMeta(page); + + metap->hashm_ntuples = xldata->ntuples; + + PageSetLSN(page, redobuffer->lsn); + +} + +void HashXlogVacuumOnePageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, Size len) +{ + Page page = redobuffer->pageinfo.page; + xl_hash_vacuum_one_page *xldata; + HashPageOpaque pageopaque; + + xldata = (xl_hash_vacuum_one_page *) (recorddata); + + if (len > SizeOfHashVacuumOnePage) { + OffsetNumber *unused; + + unused = (OffsetNumber *) ((char *) xldata + SizeOfHashVacuumOnePage); + + PageIndexMultiDelete(page, unused, xldata->ntuples); + } + + /* + * Mark the page as not containing any LP_DEAD items. See comments in + * _hash_vacuum_one_page() for details. + */ + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; + + PageSetLSN(page, redobuffer->lsn); +} + +void HashXlogVacuumMateOperatorPage(RedoBufferInfo *redobuffer, void *recorddata) +{ + Page metapage; + HashMetaPage metap; + xl_hash_vacuum_one_page *xldata; + xldata = (xl_hash_vacuum_one_page *) (recorddata); + + metapage = redobuffer->pageinfo.page; + metap = HashPageGetMeta(metapage); + + metap->hashm_ntuples -= xldata->ntuples; + + PageSetLSN(metapage, redobuffer->lsn); +} + +static void HashXlogInitMetaPageBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_INIT_META_PAGE_NUM) { + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + HashRedoInitMetaPageOperatorPage(bufferinfo, maindata); + MakeRedoBufferDirty(bufferinfo); + if (blockhead->forknum == INIT_FORKNUM) { + FlushOneBuffer(bufferinfo->buf); + } + } +} + +static void HashXlogInitBitmapPageBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + bool modifypage = false; + if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_INIT_BITMAP_PAGE_BITMAP_NUM) { + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + HashRedoInitBitmapPageOperatorBitmapPage(bufferinfo, maindata); + MakeRedoBufferDirty(bufferinfo); + modifypage = true; + } else { + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashRedoInitBitmapPageOperatorMetaPage(bufferinfo); + MakeRedoBufferDirty(bufferinfo); + modifypage = true; + } + } + + if (blockhead->forknum == INIT_FORKNUM && modifypage) { + FlushOneBuffer(bufferinfo->buf); + } +} + +static void HashXlogInsertBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + 
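+    /*
+     * XLOG_HASH_INSERT carries two block references: the data page that
+     * received the tuple (XLOG_HASH_INSERT_PAGE_NUM) and the metapage whose
+     * tuple count is bumped; replay whichever block this call was handed.
+     */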
XLogBlockDataParse *datadecode = blockdatarec; + + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action != BLK_NEEDS_REDO) { + return; + } + + if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_INSERT_PAGE_NUM) { + Size blkdatalen; + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + char *blkdata = XLogBlockDataGetBlockData(datadecode, &blkdatalen); + + HashRedoInsertOperatorPage(bufferinfo, (void *)maindata, (void *)blkdata, blkdatalen); + } else { + HashRedoInsertOperatorMetaPage(bufferinfo); + } + MakeRedoBufferDirty(bufferinfo); +} + +static void HashXlogAddOvflPageBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_ADD_OVFL_PAGE_OVFL_NUM) { + Size blkdatalen; + char *blkdata = NULL; + BlockNumber leftblk; + blkdata = XLogBlockDataGetBlockData(datadecode, &blkdatalen); + leftblk = XLogBlockDataGetAuxiBlock1(datadecode); + + HashRedoAddOvflPageOperatorOvflPage(bufferinfo, leftblk, blkdata, blkdatalen); + MakeRedoBufferDirty(bufferinfo); + } else if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_ADD_OVFL_PAGE_LEFT_NUM) { + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + BlockNumber rightblk = XLogBlockDataGetAuxiBlock1(datadecode); + HashRedoAddOvflPageOperatorLeftPage(bufferinfo, rightblk); + MakeRedoBufferDirty(bufferinfo); + } + } else if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_ADD_OVFL_PAGE_MAP_NUM) { + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + char *blkdata = XLogBlockDataGetBlockData(datadecode, NULL); + HashRedoAddOvflPageOperatorMapPage(bufferinfo, blkdata); + MakeRedoBufferDirty(bufferinfo); + } + } else if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_ADD_OVFL_PAGE_NEWMAP_NUM) { + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + HashRedoAddOvflPageOperatorNewmapPage(bufferinfo, maindata); + MakeRedoBufferDirty(bufferinfo); + + } else { + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + Size blkdatalen; + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + char *blkdata = XLogBlockDataGetBlockData(datadecode, &blkdatalen); + + HashRedoAddOvflPageOperatorMetaPage(bufferinfo, maindata, blkdata, blkdatalen); + MakeRedoBufferDirty(bufferinfo); + } + } +} + +static void HashXlogSplitAllocatePageBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_SPLIT_ALLOCATE_PAGE_OBUK_NUM) { + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) { + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + HashRedoSplitAllocatePageOperatorObukPage(bufferinfo, maindata); + MakeRedoBufferDirty(bufferinfo); + } + } else if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_SPLIT_ALLOCATE_PAGE_NBUK_NUM) { + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + HashRedoSplitAllocatePageOperatorNbukPage(bufferinfo, maindata); + MakeRedoBufferDirty(bufferinfo); + } else { + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + char *maindata = 
XLogBlockDataGetMainData(datadecode, NULL); + char *blkdata = XLogBlockDataGetBlockData(datadecode, NULL); + HashRedoSplitAllocatePageOperatorMetaPage(bufferinfo, maindata, blkdata); + MakeRedoBufferDirty(bufferinfo); + } + } +} + +static void HashXlogSplitPageBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action != BLK_RESTORED) { + ereport(ERROR, (errmsg("Hash split record did not contain a full-page image"))); + } + MakeRedoBufferDirty(bufferinfo); +} + +static void HashXlogSplitCompleteBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action != BLK_NEEDS_REDO && action != BLK_RESTORED) { + return; + } + + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + if (XLogBlockDataGetBlockId(datadecode) == XLOG_HASH_SPLIT_COMPLETE_OBUK_NUM) { + HashRedoSplitCompleteOperatorObukPage(bufferinfo, maindata); + } else { + HashRedoSplitCompleteOperatorNbukPage(bufferinfo, maindata); + } + MakeRedoBufferDirty(bufferinfo); +} + +static void HashXlogMovePageContentsBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + Size blkdatalen; + char *blkdata = NULL; + blkdata = XLogBlockDataGetBlockData(datadecode, &blkdatalen); + uint8 block_id = XLogBlockDataGetBlockId(datadecode); + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + + if (block_id == HASH_MOVE_BUK_BLOCK_NUM) { + PageSetLSN(bufferinfo->pageinfo.page, bufferinfo->lsn); + } + + if (block_id == HASH_MOVE_ADD_BLOCK_NUM) { + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogMoveAddPageOperatorPage(bufferinfo, maindata, blkdata, blkdatalen); + MakeRedoBufferDirty(bufferinfo); + } + } + + if (block_id == HASH_MOVE_DELETE_OVFL_BLOCK_NUM) { + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogMoveDeleteOvflPageOperatorPage(bufferinfo, blkdata, blkdatalen); + MakeRedoBufferDirty(bufferinfo); + } + } +} + +static void HashXlogSqueezePageBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + Size blkdatalen; + char *blkdata = NULL; + blkdata = XLogBlockDataGetBlockData(datadecode, &blkdatalen); + uint8 block_id = XLogBlockDataGetBlockId(datadecode); + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + + if (block_id == HASH_SQUEEZE_BUK_BLOCK_NUM) { + PageSetLSN(bufferinfo->pageinfo.page, bufferinfo->lsn); + } + + if (block_id == HASH_SQUEEZE_ADD_BLOCK_NUM) { + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogSqueezeAddPageOperatorPage(bufferinfo, maindata, blkdata, blkdatalen); + MakeRedoBufferDirty(bufferinfo); + } + } + + if (block_id == HASH_SQUEEZE_INIT_OVFLBUF_BLOCK_NUM) { + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogSqueezeInitOvflbufOperatorPage(bufferinfo, maindata); + MakeRedoBufferDirty(bufferinfo); + } + } + + if (block_id 
== HASH_SQUEEZE_UPDATE_PREV_BLOCK_NUM) { + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) (maindata); + if (!xldata->is_prev_bucket_same_wrt && action == BLK_NEEDS_REDO) { + HashXlogSqueezeUpdatePrevPageOperatorPage(bufferinfo, maindata); + MakeRedoBufferDirty(bufferinfo); + } + } + + if (block_id == HASH_SQUEEZE_UPDATE_NEXT_BLOCK_NUM) { + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogSqueezeUpdateNextPageOperatorPage(bufferinfo, maindata); + MakeRedoBufferDirty(bufferinfo); + } + } + + if (block_id == HASH_SQUEEZE_UPDATE_BITMAP_BLOCK_NUM) { + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogSqueezeUpdateBitmapOperatorPage(bufferinfo, blkdata); + MakeRedoBufferDirty(bufferinfo); + } + } + + if (block_id == HASH_SQUEEZE_UPDATE_META_BLOCK_NUM) { + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogSqueezeUpdateMateOperatorPage(bufferinfo, blkdata); + MakeRedoBufferDirty(bufferinfo); + } + } +} + +static void HashXlogDeleteBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + uint8 block_id = XLogBlockDataGetBlockId(datadecode); + Size blkdatalen; + char *blkdata = NULL; + blkdata = XLogBlockDataGetBlockData(datadecode, &blkdatalen); + XLogRedoAction action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + + if (block_id == HASH_DELETE_OVFL_BLOCK_NUM) { + if (action == BLK_NEEDS_REDO) { + HashXlogDeleteBlockOperatorPage(bufferinfo, maindata, blkdata, blkdatalen); + MakeRedoBufferDirty(bufferinfo); + } + } else { + PageSetLSN(bufferinfo->pageinfo.page, bufferinfo->lsn); + } +} + +static void HashXlogSplitCleanupBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogSplitCleanupOperatorPage(bufferinfo); + MakeRedoBufferDirty(bufferinfo); + } +} + +static void HashXlogUpdateMetaPageBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + char *maindata = XLogBlockDataGetMainData(datadecode, NULL); + + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogUpdateMetaOperatorPage(bufferinfo, (void *)maindata); + MakeRedoBufferDirty(bufferinfo); + } +} + +static void HashXlogVacuumOnePageBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, + RedoBufferInfo *bufferinfo) +{ + XLogBlockDataParse *datadecode = blockdatarec; + uint8 block_id = XLogBlockDataGetBlockId(datadecode); + Size maindatalen; + char *maindata = XLogBlockDataGetMainData(datadecode, &maindatalen); + + if (block_id == HASH_VACUUM_PAGE_BLOCK_NUM) { + XLogRedoAction action; + action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogVacuumOnePageOperatorPage(bufferinfo, (void *)maindata, maindatalen); + MakeRedoBufferDirty(bufferinfo); + } + } else { + XLogRedoAction action; + 
action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo); + if (action == BLK_NEEDS_REDO) { + HashXlogVacuumMateOperatorPage(bufferinfo, (void *)maindata); + MakeRedoBufferDirty(bufferinfo); + } + } +} + +void HashRedoDataBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, RedoBufferInfo *bufferinfo) +{ + uint8 info = XLogBlockHeadGetInfo(blockhead) & ~XLR_INFO_MASK; + + switch (info) { + case XLOG_HASH_INIT_META_PAGE: + HashXlogInitMetaPageBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + HashXlogInitBitmapPageBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_INSERT: + HashXlogInsertBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_ADD_OVFL_PAGE: + HashXlogAddOvflPageBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + HashXlogSplitAllocatePageBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_SPLIT_PAGE: + HashXlogSplitPageBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_SPLIT_COMPLETE: + HashXlogSplitCompleteBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + HashXlogMovePageContentsBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_SQUEEZE_PAGE: + HashXlogSqueezePageBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_DELETE: + HashXlogDeleteBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_SPLIT_CLEANUP: + HashXlogSplitCleanupBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_UPDATE_META_PAGE: + HashXlogUpdateMetaPageBlock(blockhead, blockdatarec, bufferinfo); + break; + case XLOG_HASH_VACUUM_ONE_PAGE: + HashXlogVacuumOnePageBlock(blockhead, blockdatarec, bufferinfo); + break; + default: + ereport(PANIC, (errmsg("hash_redo_block: unknown op code %u", info))); + } +} \ No newline at end of file diff --git a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp index 8cebaac0d..d1fb04820 100644 --- a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp +++ b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp @@ -843,6 +843,9 @@ void XLogBlockDataCommonRedo(XLogBlockHead *blockhead, void *blockrecbody, RedoB case RM_BTREE_ID: BtreeRedoDataBlock(blockhead, blockdatarec, bufferinfo); break; + case RM_HASH_ID: + HashRedoDataBlock(blockhead, blockdatarec, bufferinfo); + break; case RM_XLOG_ID: xlog_redo_data_block(blockhead, blockdatarec, bufferinfo); break; diff --git a/src/gausskernel/storage/access/rmgrdesc/hashdesc.cpp b/src/gausskernel/storage/access/rmgrdesc/hashdesc.cpp index 04caba655..5533f09fe 100644 --- a/src/gausskernel/storage/access/rmgrdesc/hashdesc.cpp +++ b/src/gausskernel/storage/access/rmgrdesc/hashdesc.cpp @@ -16,9 +16,155 @@ #include "postgres.h" #include "knl/knl_variable.h" -#include "access/hash.h" +#include "access/rmgr.h" +#include "access/hash_xlog.h" void hash_desc(StringInfo buf, XLogReaderState *record) { - /* nothing to do */ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) { + case XLOG_HASH_INIT_META_PAGE: + { + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec; + + appendStringInfo(buf, "num_tuples %g, fillfactor %d", + xlrec->num_tuples, xlrec->ffactor); + break; + } + case XLOG_HASH_INIT_BITMAP_PAGE: + { + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec; + + appendStringInfo(buf, "bmsize %d", xlrec->bmsize); + 
break; + } + case XLOG_HASH_INSERT: + { + xl_hash_insert *xlrec = (xl_hash_insert *) rec; + + appendStringInfo(buf, "off %u", xlrec->offnum); + break; + } + case XLOG_HASH_ADD_OVFL_PAGE: + { + xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) rec; + + appendStringInfo(buf, "bmsize %d, bmpage_found %c", + xlrec->bmsize, (xlrec->bmpage_found) ? 'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + { + xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec; + + appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c", + xlrec->new_bucket, + (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F', + (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_COMPLETE: + { + xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec; + + appendStringInfo(buf, "old_bucket_flag %u, new_bucket_flag %u", + xlrec->old_bucket_flag, xlrec->new_bucket_flag); + break; + } + case XLOG_HASH_MOVE_PAGE_CONTENTS: + { + xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec; + + appendStringInfo(buf, "ntups %d, is_primary %c", + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_SQUEEZE_PAGE: + { + xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec; + + appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c", + xlrec->prevblkno, + xlrec->nextblkno, + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_DELETE: + { + xl_hash_delete *xlrec = (xl_hash_delete *) rec; + + appendStringInfo(buf, "clear_dead_marking %c, is_primary %c", + xlrec->clear_dead_marking ? 'T' : 'F', + xlrec->is_primary_bucket_page ? 'T' : 'F'); + break; + } + case XLOG_HASH_UPDATE_META_PAGE: + { + xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec; + + appendStringInfo(buf, "ntuples %g", + xlrec->ntuples); + break; + } + case XLOG_HASH_VACUUM_ONE_PAGE: + { + xl_hash_vacuum_one_page *xlrec = (xl_hash_vacuum_one_page *) rec; + + appendStringInfo(buf, "ntuples %d", + xlrec->ntuples); + break; + } + } +} + +const char *hash_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) { + case XLOG_HASH_INIT_META_PAGE: + id = "INIT_META_PAGE"; + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + id = "INIT_BITMAP_PAGE"; + break; + case XLOG_HASH_INSERT: + id = "INSERT"; + break; + case XLOG_HASH_ADD_OVFL_PAGE: + id = "ADD_OVFL_PAGE"; + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + id = "SPLIT_ALLOCATE_PAGE"; + break; + case XLOG_HASH_SPLIT_PAGE: + id = "SPLIT_PAGE"; + break; + case XLOG_HASH_SPLIT_COMPLETE: + id = "SPLIT_COMPLETE"; + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + id = "MOVE_PAGE_CONTENTS"; + break; + case XLOG_HASH_SQUEEZE_PAGE: + id = "SQUEEZE_PAGE"; + break; + case XLOG_HASH_DELETE: + id = "DELETE"; + break; + case XLOG_HASH_SPLIT_CLEANUP: + id = "SPLIT_CLEANUP"; + break; + case XLOG_HASH_UPDATE_META_PAGE: + id = "UPDATE_META_PAGE"; + break; + case XLOG_HASH_VACUUM_ONE_PAGE: + id = "VACUUM_ONE_PAGE"; + } + + return id; } \ No newline at end of file diff --git a/src/gausskernel/storage/access/transam/extreme_rto/dispatcher.cpp b/src/gausskernel/storage/access/transam/extreme_rto/dispatcher.cpp index d7f21bc44..15ecca232 100644 --- a/src/gausskernel/storage/access/transam/extreme_rto/dispatcher.cpp +++ b/src/gausskernel/storage/access/transam/extreme_rto/dispatcher.cpp @@ -32,6 +32,7 @@ #include "access/xact.h" #include "access/xlog_internal.h" #include 
"access/nbtree.h" +#include "access/hash_xlog.h" #include "access/xlogreader.h" #include "access/gist_private.h" #include "access/multixact.h" @@ -165,7 +166,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = { { DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE }, { DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE }, { DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE }, - { DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 }, + { DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE }, { DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE }, /* XLOG_GIST_PAGE_DELETE is not used and info isn't continus */ { DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 }, @@ -1031,8 +1032,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time /* Run from the dispatcher thread. */ static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) { - DispatchTxnRecord(record, expectedTLIs, recordXTime, false, true); - return true; + bool isNeedFullSync = false; + + /* index not support mvcc, so we need to sync with trx thread when the record is vacuum */ + if (IsHashVacuumPages(record) && g_supportHotStandby) { + GetSlotIds(record, ANY_WORKER, true); + /* sync with trxn thread */ + /* only need to process in pageworker thread, wait trxn sync */ + /* pageworker exe, trxn don't need exe */ + DispatchToSpecPageWorker(record, expectedTLIs, true); + } else { + DispatchRecordWithPages(record, expectedTLIs, true); + } + + return isNeedFullSync; } /* Run from the dispatcher thread. */ diff --git a/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp b/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp index 284ba4109..3a76002b9 100644 --- a/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp +++ b/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp @@ -31,6 +31,7 @@ #include "access/xact.h" #include "access/xlog_internal.h" #include "access/nbtree.h" +#include "access/hash_xlog.h" #include "access/xlogreader.h" #include "access/gist_private.h" #include "access/multixact.h" @@ -165,7 +166,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = { { DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE }, { DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE }, { DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE }, - { DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 }, + { DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE }, { DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE }, /* XLOG_GIST_PAGE_DELETE is not used and info isn't continus */ { DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 }, @@ -912,8 +913,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time /* Run from the dispatcher thread. 
*/ static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) { - DispatchTxnRecord(record, expectedTLIs, recordXTime, false); - return true; + bool isNeedFullSync = false; + + /* hash indexes do not support MVCC, so vacuum records must be synchronized with the trxn thread */ + if (IsHashVacuumPages(record) && g_supportHotStandby) { + GetWorkerIds(record, ANY_WORKER, true); + /* Dispatch only to the page workers and make them wait for the */ + /* trxn thread to catch up before replaying; the trxn thread */ + /* itself does not need to replay this record. */ + DispatchToSpecPageWorker(record, expectedTLIs, true); + } else { + DispatchRecordWithPages(record, expectedTLIs, true); + } + + return isNeedFullSync; } static bool DispatchBtreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) diff --git a/src/gausskernel/storage/access/transam/rmgr.cpp b/src/gausskernel/storage/access/transam/rmgr.cpp index 3afe4880d..8c448a7c3 100644 --- a/src/gausskernel/storage/access/transam/rmgr.cpp +++ b/src/gausskernel/storage/access/transam/rmgr.cpp @@ -29,6 +29,7 @@ #include "access/gin.h" #include "access/gist_private.h" #include "access/hash.h" +#include "access/hash_xlog.h" #include "access/heapam.h" #include "access/multixact.h" #include "access/nbtree.h" diff --git a/src/gausskernel/storage/buffer/bufmgr.cpp b/src/gausskernel/storage/buffer/bufmgr.cpp index 4f0bc1c4a..7d3dfa706 100644 --- a/src/gausskernel/storage/buffer/bufmgr.cpp +++ b/src/gausskernel/storage/buffer/bufmgr.cpp @@ -5264,6 +5264,51 @@ bool ConditionalLockBufferForCleanup(Buffer buffer) return false; } +/* + * IsBufferCleanupOK - as above, but we already have the lock + * + * Check whether it's OK to perform cleanup on a buffer we've already + * locked. If we observe that the pin count is 1, our exclusive lock + * happens to be a cleanup lock, and we can proceed with anything that + * would have been allowable had we sought a cleanup lock originally. + */ +bool IsBufferCleanupOK(Buffer buffer) +{ + BufferDesc *bufHdr; + uint32 buf_state; + + Assert(BufferIsValid(buffer)); + + if (BufferIsLocal(buffer)) { + /* There should be exactly one pin */ + if (u_sess->storage_cxt.LocalRefCount[-buffer - 1] != 1) + return false; + /* Nobody else to wait for */ + return true; + } + + /* There should be exactly one local pin */ + if (GetPrivateRefCount(buffer) != 1) + return false; + + bufHdr = GetBufferDescriptor(buffer - 1); + + /* caller must hold exclusive lock on buffer */ + Assert(LWLockHeldByMeInMode(bufHdr->content_lock, LW_EXCLUSIVE)); + + buf_state = LockBufHdr(bufHdr); + + Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) { + /* pincount is OK. */ + UnlockBufHdr(bufHdr, buf_state); + return true; + } + + UnlockBufHdr(bufHdr, buf_state); + return false; +} + /* * Functions for buffer I/O handling * diff --git a/src/gausskernel/storage/page/bufpage.cpp b/src/gausskernel/storage/page/bufpage.cpp index 8f047ae53..6e8593ae9 100644 --- a/src/gausskernel/storage/page/bufpage.cpp +++ b/src/gausskernel/storage/page/bufpage.cpp @@ -400,3 +400,28 @@ void PageSetChecksumInplace(Page page, BlockNumber blkno) ((PageHeader)page)->pd_checksum = pg_checksum_page((char*)page, blkno); } + +/* + * PageGetFreeSpaceForMultipleTuples + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for multiple new line pointers. + * + * Note: this should usually only be used on index pages. Use + * PageGetHeapFreeSpace on heap pages.
+ */ +Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups) +{ + int space; + + /* + * Use signed arithmetic here so that we behave sensibly if pd_lower > + * pd_upper. + */ + space = (int)((PageHeader)page)->pd_upper - (int)((PageHeader)page)->pd_lower; + + if (space < (int)(ntups * sizeof(ItemIdData))) + return 0; + space -= ntups * sizeof(ItemIdData); + + return (Size) space; +} diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 7a48f5645..95c74d17c 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -4,7 +4,7 @@ * header file for postgres hash access method implementation * * - * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/include/access/hash.h @@ -33,36 +33,59 @@ */ typedef uint32 Bucket; -#define INVALID_BUCKET_NUM (0xFFFFFFFF) -#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B) + 1) - 1] : 0)) + 1) +#define InvalidBucket ((Bucket) 0xFFFFFFFF) +#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B) + 1) - 1] : 0)) + 1) /* * Special space for hash index pages. * - * hasho_flag tells us which type of page we're looking at. For - * example, knowing overflow pages from bucket pages is necessary - * information when you're deleting tuples from a page. If all the - * tuples are deleted from an overflow page, the overflow is made - * available to other buckets by calling _hash_freeovflpage(). If all - * the tuples are deleted from a bucket page, no additional action is - * necessary. + * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at. + * Additional bits in the flag word are used for more transient purposes. + * + * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE. + * However, we ensure that each used page type has a distinct bit so that + * we can OR together page types for uses such as the allowable-page-types + * argument of _hash_checkpage(). */ #define LH_UNUSED_PAGE (0) #define LH_OVERFLOW_PAGE (1 << 0) #define LH_BUCKET_PAGE (1 << 1) #define LH_BITMAP_PAGE (1 << 2) #define LH_META_PAGE (1 << 3) +#define LH_BUCKET_BEING_POPULATED (1 << 4) +#define LH_BUCKET_BEING_SPLIT (1 << 5) +#define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6) +#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7) +#define LH_PAGE_TYPE \ + (LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE) + +/* + * In an overflow page, hasho_prevblkno stores the block number of the previous + * page in the bucket chain; in a bucket page, hasho_prevblkno stores the + * hashm_maxbucket value as of the last time the bucket was last split, or + * else as of the time the bucket was created. The latter convention is used + * to determine whether a cached copy of the metapage is too stale to be used + * without needing to lock or pin the metapage. + * + * hasho_nextblkno is always the block number of the next page in the + * bucket chain, or InvalidBlockNumber if there are no more such pages. 
+ */ typedef struct HashPageOpaqueData { - BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */ - BlockNumber hasho_nextblkno; /* next ovfl blkno */ - Bucket hasho_bucket; /* bucket number this pg belongs to */ - uint16 hasho_flag; /* page type code, see above */ - uint16 hasho_page_id; /* for identification of hash indexes */ + BlockNumber hasho_prevblkno; /* see above */ + BlockNumber hasho_nextblkno; /* see above */ + Bucket hasho_bucket; /* bucket number this pg belongs to */ + uint16 hasho_flag; /* page type code + flag bits, see above */ + uint16 hasho_page_id; /* for identification of hash indexes */ } HashPageOpaqueData; typedef HashPageOpaqueData* HashPageOpaque; +#define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0) +#define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0) +#define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0) +#define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0) + /* * The page ID is for the convenience of pg_filedump and similar utilities, * which otherwise would have a hard time telling pages of different index @@ -71,26 +94,19 @@ typedef HashPageOpaqueData* HashPageOpaque; */ #define HASHO_PAGE_ID 0xFF80 +typedef struct HashScanPosItem { + ItemPointerData heapTid; /* TID of referenced heap item */ + OffsetNumber indexOffset; /* index item's location within page */ +} HashScanPosItem; + + /* - * HashScanOpaqueData is private state for a hash index scan. + * HashScanOpaqueData is private state for a hash index scan. */ typedef struct HashScanOpaqueData { /* Hash value of the scan key, ie, the hash key we seek */ uint32 hashso_sk_hash; - /* - * By definition, a hash scan should be examining only one bucket. We - * record the bucket number here as soon as it is known. - */ - Bucket hashso_bucket; - bool hashso_bucket_valid; - - /* - * If we have a share lock on the bucket, we record it here. When - * hashso_bucket_blkno is zero, we have no such lock. - */ - BlockNumber hashso_bucket_blkno; - /* * We also want to remember which buffer we're currently examining in the * scan. We keep the buffer pinned (but not locked) across hashgettuple @@ -99,11 +115,33 @@ typedef struct HashScanOpaqueData { */ Buffer hashso_curbuf; + /* remember the buffer associated with primary bucket */ + Buffer hashso_bucket_buf; + + /* + * remember the buffer associated with primary bucket page of bucket being + * split. it is required during the scan of the bucket which is being + * populated during split operation. + */ + Buffer hashso_split_bucket_buf; + /* Current position of the scan, as an index TID */ ItemPointerData hashso_curpos; /* Current position of the scan, as a heap TID */ ItemPointerData hashso_heappos; + + /* Whether scan starts on bucket being populated due to split */ + bool hashso_buc_populated; + + /* + * Whether scanning bucket being split? The value of this parameter is + * referred only when hashso_buc_populated is true. 
+ */ + bool hashso_buc_split; + /* info about killed items if any (killedItems is NULL if never used) */ + HashScanPosItem *killedItems; /* tids and offset numbers of killed items */ + int numKilled; /* number of currently stored items */ } HashScanOpaqueData; typedef HashScanOpaqueData* HashScanOpaque; @@ -115,7 +153,7 @@ typedef HashScanOpaqueData* HashScanOpaque; #define HASH_METAPAGE 0 /* metapage is always block 0 */ #define HASH_MAGIC 0x6440640 -#define HASH_VERSION 2 /* 2 signifies only hash key value is stored */ +#define HASH_VERSION 4 /* * Spares[] holds the number of overflow pages currently allocated at or @@ -128,17 +166,32 @@ typedef HashScanOpaqueData* HashScanOpaque; * * ovflpages that have been recycled for reuse can be found by looking at * bitmaps that are stored within ovflpages dedicated for the purpose. - * The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the + * The blknos of these bitmap pages are kept in mapp[]; nmaps is the * number of currently existing bitmaps. * * The limitation on the size of spares[] comes from the fact that there's * no point in having more than 2^32 buckets with only uint32 hashcodes. + * (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is + * adjusted in such a way to accommodate multi phased allocation of buckets + * after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE). + * * There is no particular upper limit on the size of mapp[], other than - * needing to fit into the metapage. (With 8K block size, 128 bitmaps - * limit us to 64 Gb of overflow space...) + * needing to fit into the metapage. (With 8K block size, 1024 bitmaps + * limit us to 256 GB of overflow space...) */ -#define HASH_MAX_SPLITPOINTS 32 -#define HASH_MAX_BITMAPS 128 +#define HASH_MAX_BITMAPS 1024 + +#define HASH_SPLITPOINT_PHASE_BITS 2 +#define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS) +#define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1) +#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10 + +/* defines max number of splitpoit phases a hash index can have */ +#define HASH_MAX_SPLITPOINT_GROUP 32 +#define HASH_MAX_SPLITPOINTS \ + (((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \ + HASH_SPLITPOINT_PHASES_PER_GRP) + \ + HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) typedef struct HashMetaPageData { uint32 hashm_magic; /* magic no. 
for hash tables */ @@ -280,37 +333,40 @@ extern Datum hash_new_uint32(uint32 k); /* private routines */ /* hashinsert.c */ -extern void _hash_doinsert(Relation rel, IndexTuple itup); +extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel); extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup); +extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, + OffsetNumber *itup_offsets, uint16 nitups); /* hashovfl.c */ -extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf); -extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy); -extern void _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, ForkNumber forkNum); -extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy); +extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin); +extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, + Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, + Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy); +extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage); +extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy); /* hashpage.c */ -extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access); -extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access); -extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access); extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags); +extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel, + BlockNumber blkno, int flags); +extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh); +extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, + int access, HashMetaPage *cachedmetap); extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno); +extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, bool initpage); extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum); extern Buffer _hash_getbuf_with_strategy( Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy); extern void _hash_relbuf(Relation rel, Buffer buf); extern void _hash_dropbuf(Relation rel, Buffer buf); -extern void _hash_wrtbuf(Relation rel, Buffer buf); -extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access); -extern uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum); +extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so); +extern uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum); +extern void _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, uint16 ffactor, bool initpage); extern void _hash_pageinit(Page page, Size size); extern void _hash_expandtable(Relation rel, Buffer metabuf); - -/* hashscan.c */ -extern void _hash_regscan(IndexScanDesc scan); -extern void _hash_dropscan(IndexScanDesc scan); -extern bool _hash_has_active_scan(Relation rel, Bucket bucket); -extern void ReleaseResources_hash(void); +extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, + uint32 maxbucket, uint32 highmask, uint32 lowmask); /* hashsearch.c 
*/ extern bool _hash_next(IndexScanDesc scan, ScanDirection dir); @@ -320,10 +376,10 @@ extern bool _hash_step(IndexScanDesc scan, Buffer* bufP, ScanDirection dir); /* hashsort.c */ typedef struct HSpool HSpool; /* opaque struct in hashsort.c */ -extern HSpool* _h_spoolinit(Relation index, uint32 num_buckets, void* meminfo); +extern HSpool* _h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void* meminfo); extern void _h_spooldestroy(HSpool* hspool); extern void _h_spool(HSpool* hspool, ItemPointer self, Datum* values, const bool* isnull); -extern void _h_indexbuild(HSpool* hspool); +extern void _h_indexbuild(HSpool* hspool, Relation heapRel); /* hashutil.c */ extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup); @@ -331,15 +387,28 @@ extern uint32 _hash_datum2hashkey(Relation rel, Datum key); extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype); extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask); extern uint32 _hash_log2(uint32 num); +extern uint32 _hash_spareindex(uint32 num_bucket); +extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase); extern void _hash_checkpage(Relation rel, Buffer buf, int flags); extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup); -extern IndexTuple _hash_form_tuple(Relation index, Datum* values, const bool* isnull); +extern bool _hash_convert_tuple(Relation index, Datum *user_values, const bool *user_isnull, + Datum *index_values, bool *index_isnull); extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value); extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value); +extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket); +extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket); +extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, + uint32 lowmask, uint32 maxbucket); +extern void _hash_kill_items(IndexScanDesc scan); /* hash.c */ -extern void hash_redo(XLogReaderState* record); -extern void hash_desc(StringInfo buf, XLogReaderState* record); +extern void hashbucketcleanup(Relation rel, Bucket cur_bucket, + Buffer bucket_buf, BlockNumber bucket_blkno, + BufferAccessStrategy bstrategy, + uint32 maxbucket, uint32 highmask, uint32 lowmask, + double *tuples_removed, double *num_index_tuples, + bool bucket_has_garbage, + IndexBulkDeleteCallback callback, void *callback_state); #ifdef PGXC extern Datum compute_hash(Oid type, Datum value, char locator); diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h new file mode 100644 index 000000000..9d10c9ae4 --- /dev/null +++ b/src/include/access/hash_xlog.h @@ -0,0 +1,352 @@ +/*------------------------------------------------------------------------- + * + * hash_xlog.h + * header file for Postgres hash AM implementation + * + * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd. 
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/hash_xlog.h + * + *------------------------------------------------------------------------- + */ + +#ifndef HASH_XLOG_H +#define HASH_XLOG_H + +#include "access/xlogreader.h" +#include "lib/stringinfo.h" +#include "storage/off.h" + +/* Number of buffers required for XLOG_HASH_SQUEEZE_PAGE operation */ +#define HASH_XLOG_FREE_OVFL_BUFS 6 + +/* + * XLOG records for hash operations + */ +#define XLOG_HASH_INIT_META_PAGE 0x00 /* initialize the meta page */ +#define XLOG_HASH_INIT_BITMAP_PAGE 0x10 /* initialize the bitmap page */ +#define XLOG_HASH_INSERT 0x20 /* add index tuple without split */ +#define XLOG_HASH_ADD_OVFL_PAGE 0x30 /* add overflow page */ +#define XLOG_HASH_SPLIT_ALLOCATE_PAGE 0x40 /* allocate new page for split */ +#define XLOG_HASH_SPLIT_PAGE 0x50 /* split page */ +#define XLOG_HASH_SPLIT_COMPLETE 0x60 /* completion of split operation */ +#define XLOG_HASH_MOVE_PAGE_CONTENTS 0x70 /* remove tuples from one page + * and add to another page */ +#define XLOG_HASH_SQUEEZE_PAGE 0x80 /* add tuples to one of the previous + * pages in chain and free the ovfl + * page */ +#define XLOG_HASH_DELETE 0x90 /* delete index tuples from a page */ +#define XLOG_HASH_SPLIT_CLEANUP 0xA0 /* clear split-cleanup flag in primary + * bucket page after deleting tuples + * that are moved due to split */ +#define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after vacuum */ +#define XLOG_HASH_VACUUM_ONE_PAGE 0xC0 /* remove dead tuples from index page */ + +typedef enum { + XLOG_HASH_INIT_META_PAGE_NUM = 0, +}XLogHashInitMetaPageEnum; + +typedef enum { + XLOG_HASH_INIT_BITMAP_PAGE_BITMAP_NUM = 0, + XLOG_HASH_INIT_BITMAP_PAGE_META_NUM, +}XLogHashInitBitmapPageEnum; + +typedef enum { + XLOG_HASH_INSERT_PAGE_NUM = 0, + XLOG_HASH_INSERT_META_NUM, +}XLogHashInsertEnum; + +typedef enum { + XLOG_HASH_ADD_OVFL_PAGE_OVFL_NUM = 0, + XLOG_HASH_ADD_OVFL_PAGE_LEFT_NUM, + XLOG_HASH_ADD_OVFL_PAGE_MAP_NUM, + XLOG_HASH_ADD_OVFL_PAGE_NEWMAP_NUM, + XLOG_HASH_ADD_OVFL_PAGE_META_NUM, +}XLogHashAddOvflPageEnum; + +typedef enum { + XLOG_HASH_SPLIT_ALLOCATE_PAGE_OBUK_NUM = 0, + XLOG_HASH_SPLIT_ALLOCATE_PAGE_NBUK_NUM, + XLOG_HASH_SPLIT_ALLOCATE_PAGE_META_NUM, +}XLogHashSplitAllocatePageEnum; + +typedef enum { + XLOG_HASH_SPLIT_PAGE_NUM = 0, +}XLogHashSplitPageEnum; + +typedef enum { + XLOG_HASH_SPLIT_COMPLETE_OBUK_NUM = 0, + XLOG_HASH_SPLIT_COMPLETE_NBUK_NUM, +}XLogHashSplitCompleteEnum; + +typedef enum { + HASH_MOVE_BUK_BLOCK_NUM = 0, + HASH_MOVE_ADD_BLOCK_NUM, + HASH_MOVE_DELETE_OVFL_BLOCK_NUM, +}XLogHashMovePageEnum; + +typedef enum { + HASH_SQUEEZE_BUK_BLOCK_NUM = 0, + HASH_SQUEEZE_ADD_BLOCK_NUM, + HASH_SQUEEZE_INIT_OVFLBUF_BLOCK_NUM, + HASH_SQUEEZE_UPDATE_PREV_BLOCK_NUM, + HASH_SQUEEZE_UPDATE_NEXT_BLOCK_NUM, + HASH_SQUEEZE_UPDATE_BITMAP_BLOCK_NUM, + HASH_SQUEEZE_UPDATE_META_BLOCK_NUM, +}XLogHashSqueezePageEnum; + +typedef enum { + HASH_DELETE_BUK_BLOCK_NUM = 0, + HASH_DELETE_OVFL_BLOCK_NUM, +}XLogHashDeleteEnum; + +typedef enum { + HASH_SPLIT_CLEANUP_BLOCK_NUM, +}XLogHashSplitCleanupEnum; + +typedef enum { + HASH_UPDATE_META_BLOCK_NUM, +} XLogHashUpdateMateEnum; + +typedef enum { + HASH_VACUUM_PAGE_BLOCK_NUM = 0, + HASH_VACUUM_META_BLOCK_NUM, +} XLogHashVacuumPageEnum; + +/* + * xl_hash_split_allocate_page flag values, 8 bits are available. 
+ */ +#define XLH_SPLIT_META_UPDATE_MASKS (1<<0) +#define XLH_SPLIT_META_UPDATE_SPLITPOINT (1<<1) + +/* + * This is what we need to know about a HASH index create. + * + * Backup block 0: metapage + */ +typedef struct xl_hash_createidx +{ + double num_tuples; + RegProcedure procid; + uint16 ffactor; +} xl_hash_createidx; + +#define SizeOfHashCreateIdx (offsetof(xl_hash_createidx, ffactor) + sizeof(uint16)) + +/* + * This is what we need to know about simple (without split) insert. + * + * This data record is used for XLOG_HASH_INSERT + * + * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 1: metapage (HashMetaPageData) + */ +typedef struct xl_hash_insert +{ + OffsetNumber offnum; +} xl_hash_insert; + +#define SizeOfHashInsert (offsetof(xl_hash_insert, offnum) + sizeof(OffsetNumber)) + +/* + * This is what we need to know about addition of overflow page. + * + * This data record is used for XLOG_HASH_ADD_OVFL_PAGE + * + * Backup Blk 0: newly allocated overflow page + * Backup Blk 1: page before new overflow page in the bucket chain + * Backup Blk 2: bitmap page + * Backup Blk 3: new bitmap page + * Backup Blk 4: metapage + */ +typedef struct xl_hash_add_ovfl_page +{ + uint16 bmsize; + bool bmpage_found; +} xl_hash_add_ovfl_page; + +#define SizeOfHashAddOvflPage \ + (offsetof(xl_hash_add_ovfl_page, bmpage_found) + sizeof(bool)) + +/* + * This is what we need to know about allocating a page for split. + * + * This data record is used for XLOG_HASH_SPLIT_ALLOCATE_PAGE + * + * Backup Blk 0: page for old bucket + * Backup Blk 1: page for new bucket + * Backup Blk 2: metapage + */ +typedef struct xl_hash_split_allocate_page +{ + uint32 new_bucket; + uint16 old_bucket_flag; + uint16 new_bucket_flag; + uint8 flags; +} xl_hash_split_allocate_page; + +#define SizeOfHashSplitAllocPage \ + (offsetof(xl_hash_split_allocate_page, flags) + sizeof(uint8)) + +/* + * This is what we need to know about completing the split operation. + * + * This data record is used for XLOG_HASH_SPLIT_COMPLETE + * + * Backup Blk 0: page for old bucket + * Backup Blk 1: page for new bucket + */ +typedef struct xl_hash_split_complete +{ + uint16 old_bucket_flag; + uint16 new_bucket_flag; +} xl_hash_split_complete; + +#define SizeOfHashSplitComplete \ + (offsetof(xl_hash_split_complete, new_bucket_flag) + sizeof(uint16)) + +/* + * This is what we need to know about move page contents required during + * squeeze operation. + * + * This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS + * + * Backup Blk 0: bucket page + * Backup Blk 1: page containing moved tuples + * Backup Blk 2: page from which tuples will be removed + */ +typedef struct xl_hash_move_page_contents +{ + uint16 ntups; + bool is_prim_bucket_same_wrt; /* true if the page to which + * tuples are moved is same as + * primary bucket page */ +} xl_hash_move_page_contents; + +#define SizeOfHashMovePageContents \ + (offsetof(xl_hash_move_page_contents, is_prim_bucket_same_wrt) + sizeof(bool)) + +/* + * This is what we need to know about the squeeze page operation. 
+ * + * This data record is used for XLOG_HASH_SQUEEZE_PAGE + * + * Backup Blk 0: page containing tuples moved from freed overflow page + * Backup Blk 1: freed overflow page + * Backup Blk 2: page previous to the freed overflow page + * Backup Blk 3: page next to the freed overflow page + * Backup Blk 4: bitmap page containing info of freed overflow page + * Backup Blk 5: meta page + */ +typedef struct xl_hash_squeeze_page +{ + BlockNumber prevblkno; + BlockNumber nextblkno; + uint16 ntups; + bool is_prim_bucket_same_wrt; /* true if the page to which + * tuples are moved is same as + * primary bucket page */ + bool is_prev_bucket_same_wrt; /* true if the page to which + * tuples are moved is the page + * previous to the freed overflow + * page */ +} xl_hash_squeeze_page; + +#define SizeOfHashSqueezePage \ + (offsetof(xl_hash_squeeze_page, is_prev_bucket_same_wrt) + sizeof(bool)) + +/* + * This is what we need to know about the deletion of index tuples from a page. + * + * This data record is used for XLOG_HASH_DELETE + * + * Backup Blk 0: primary bucket page + * Backup Blk 1: page from which tuples are deleted + */ +typedef struct xl_hash_delete +{ + bool clear_dead_marking; /* true if this operation clears + * LH_PAGE_HAS_DEAD_TUPLES flag */ + bool is_primary_bucket_page; /* true if the operation is for + * primary bucket page */ +} xl_hash_delete; + +#define SizeOfHashDelete \ + (offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool)) + +/* + * This is what we need for metapage update operation. + * + * This data record is used for XLOG_HASH_UPDATE_META_PAGE + * + * Backup Blk 0: meta page + */ +typedef struct xl_hash_update_meta_page +{ + double ntuples; +} xl_hash_update_meta_page; + +#define SizeOfHashUpdateMetaPage \ + (offsetof(xl_hash_update_meta_page, ntuples) + sizeof(double)) + +/* + * This is what we need to initialize metapage. + * + * This data record is used for XLOG_HASH_INIT_META_PAGE + * + * Backup Blk 0: meta page + */ +typedef struct xl_hash_init_meta_page +{ + double num_tuples; + RegProcedure procid; + uint16 ffactor; +} xl_hash_init_meta_page; + +#define SizeOfHashInitMetaPage \ + (offsetof(xl_hash_init_meta_page, ffactor) + sizeof(uint16)) + +/* + * This is what we need to initialize bitmap page. + * + * This data record is used for XLOG_HASH_INIT_BITMAP_PAGE + * + * Backup Blk 0: bitmap page + * Backup Blk 1: meta page + */ +typedef struct xl_hash_init_bitmap_page +{ + uint16 bmsize; +} xl_hash_init_bitmap_page; + +#define SizeOfHashInitBitmapPage \ + (offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16)) + +/* + * This is what we need for index tuple deletion and to + * update the meta page. 
+ * + * This data record is used for XLOG_HASH_VACUUM_ONE_PAGE + * + * Backup Blk 0: bucket page + * Backup Blk 1: meta page + */ +typedef struct xl_hash_vacuum_one_page +{ + RelFileNode hnode; + int ntuples; + + /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ +} xl_hash_vacuum_one_page; + +#define SizeOfHashVacuumOnePage \ + (offsetof(xl_hash_vacuum_one_page, ntuples) + sizeof(int)) + +extern void hash_redo(XLogReaderState *record); +extern void hash_desc(StringInfo buf, XLogReaderState *record); +extern const char *hash_identify(uint8 info); +extern bool IsHashVacuumPages(XLogReaderState *record); + +#endif /* HASH_XLOG_H */ diff --git a/src/include/access/xlogproc.h b/src/include/access/xlogproc.h index 438809c0d..dfa731ce2 100644 --- a/src/include/access/xlogproc.h +++ b/src/include/access/xlogproc.h @@ -754,6 +754,47 @@ void BtreeXlogUnlinkPageOperatorChildpage(RedoBufferInfo* cbuf, void* recorddata void BtreeXlogClearIncompleteSplit(RedoBufferInfo* buffer); +void HashRedoInitMetaPageOperatorPage(RedoBufferInfo *metabuf, void *recorddata); + +void HashRedoInitBitmapPageOperatorBitmapPage(RedoBufferInfo *bitmapbuf, void *recorddata); +void HashRedoInitBitmapPageOperatorMetaPage(RedoBufferInfo *metabuf); + +void HashRedoInsertOperatorPage(RedoBufferInfo *buffer, void *recorddata, void *data, Size datalen); +void HashRedoInsertOperatorMetaPage(RedoBufferInfo *metabuf); + +void HashRedoAddOvflPageOperatorOvflPage(RedoBufferInfo *ovflbuf, BlockNumber leftblk, void *data, Size datalen); +void HashRedoAddOvflPageOperatorLeftPage(RedoBufferInfo *ovflbuf, BlockNumber rightblk); +void HashRedoAddOvflPageOperatorMapPage(RedoBufferInfo *mapbuf, void *data); +void HashRedoAddOvflPageOperatorNewmapPage(RedoBufferInfo *newmapbuf, void *recorddata); +void HashRedoAddOvflPageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *data, Size datalen); + +void HashRedoSplitAllocatePageOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata); +void HashRedoSplitAllocatePageOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata); +void HashRedoSplitAllocatePageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *blkdata); + +void HashRedoSplitCompleteOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata); +void HashRedoSplitCompleteOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata); + +void HashXlogMoveAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len); +void HashXlogMoveDeleteOvflPageOperatorPage(RedoBufferInfo *redobuffer, void *blkdata, Size len); + +void HashXlogSqueezeAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len); +void HashXlogSqueezeInitOvflbufOperatorPage(RedoBufferInfo *redobuffer, void *recorddata); +void HashXlogSqueezeUpdatePrevPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata); +void HashXlogSqueezeUpdateNextPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata); +void HashXlogSqueezeUpdateBitmapOperatorPage(RedoBufferInfo *redobuffer, void *blkdata); +void HashXlogSqueezeUpdateMateOperatorPage(RedoBufferInfo *redobuffer, void *blkdata); + +void HashXlogDeleteBlockOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len); + +void HashXlogSplitCleanupOperatorPage(RedoBufferInfo *redobuffer); + +void HashXlogUpdateMetaOperatorPage(RedoBufferInfo *redobuffer, void *recorddata); + +void HashXlogVacuumOnePageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, Size len); + +void 
HashXlogVacuumMateOperatorPage(RedoBufferInfo *redobuffer, void *recorddata); + void XLogRecSetBlockCommonState(XLogReaderState* record, XLogBlockParseEnum blockvalid, ForkNumber forknum, BlockNumber blockknum, RelFileNode* relnode, XLogRecParseState* recordblockstate); @@ -787,6 +828,7 @@ extern void XLogRecSetBlockDdlState(XLogBlockDdlParse* blockddlstate, uint32 blo char *mainData, Oid ownerid = InvalidOid); XLogRedoAction XLogCheckBlockDataRedoAction(XLogBlockDataParse* datadecode, RedoBufferInfo* bufferinfo); void BtreeRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo); +extern void HashRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo); XLogRecParseState* XactXlogCsnlogParseToBlock(XLogReaderState* record, uint32* blocknum, TransactionId xid, int nsubxids, TransactionId* subxids, CommitSeqNo csn, XLogRecParseState* recordstatehead); extern void XLogRecSetVmBlockState(XLogReaderState* record, uint32 blockid, XLogRecParseState* recordblockstate); @@ -914,5 +956,4 @@ extern void XLogBlockDdlDoSmgrAction(XLogBlockHead* blockhead, void* blockrecbod extern void GinRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo); extern void GistRedoDataBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, RedoBufferInfo *bufferinfo); extern bool IsCheckPoint(const XLogRecParseState *parseState); - #endif diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index d31d8a220..6e86b9f90 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -19,6 +19,7 @@ #include "utils/tuplesort.h" #define DEFAULT_INDEX_TYPE "btree" +#define DEFAULT_HASH_INDEX_TYPE "hash" #define DEFAULT_CSTORE_INDEX_TYPE "psort" #define DEFAULT_GIST_INDEX_TYPE "gist" #define CSTORE_BTREE_INDEX_TYPE "cbtree" diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index a7576b957..500ac0efb 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -55,6 +55,7 @@ extern const uint32 RANGE_LIST_DISTRIBUTION_VERSION_NUM; extern const uint32 FIX_SQL_ADD_RELATION_REF_COUNT; extern const uint32 GENERATED_COL_VERSION_NUM; extern const uint32 ANALYZER_HOOK_VERSION_NUM; +extern const uint32 SUPPORT_HASH_XLOG_VERSION_NUM; #define INPLACE_UPGRADE_PRECOMMIT_VERSION 1 diff --git a/src/include/storage/buf/bufmgr.h b/src/include/storage/buf/bufmgr.h index bd1fad7ee..e0471ef2e 100644 --- a/src/include/storage/buf/bufmgr.h +++ b/src/include/storage/buf/bufmgr.h @@ -268,6 +268,7 @@ extern void LockBuffer(Buffer buffer, int mode); extern bool ConditionalLockBuffer(Buffer buffer); extern void LockBufferForCleanup(Buffer buffer); extern bool ConditionalLockBufferForCleanup(Buffer buffer); +extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); extern void AsyncUnpinBuffer(volatile void* bufHdr, bool forgetBuffer); extern void AsyncCompltrPinBuffer(volatile void* bufHdr); diff --git a/src/include/storage/buf/bufpage.h b/src/include/storage/buf/bufpage.h index cab832018..a57ba2328 100644 --- a/src/include/storage/buf/bufpage.h +++ b/src/include/storage/buf/bufpage.h @@ -472,6 +472,7 @@ extern Page PageGetTempPageCopySpecial(Page page, bool isbtree); extern void PageRestoreTempPage(Page tempPage, Page oldPage); extern void PageRepairFragmentation(Page page); extern Size PageGetFreeSpace(Page page); +extern Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups); extern Size 
PageGetExactFreeSpace(Page page); extern Size PageGetHeapFreeSpace(Page page); extern void PageIndexTupleDelete(Page page, OffsetNumber offset); diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 2f725e43b..235d5e6ef 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -66,7 +66,8 @@ extern Tuplesortstate* tuplesort_begin_cluster( extern Tuplesortstate* tuplesort_begin_index_btree( Relation indexRel, bool enforceUnique, int workMem, bool randomAccess, int maxMem); extern Tuplesortstate* tuplesort_begin_index_hash( - Relation indexRel, uint32 hash_mask, int workMem, bool randomAccess, int maxMem); + Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask, uint32 max_buckets, + int workMem, bool randomAccess, int maxMem); extern Tuplesortstate* tuplesort_begin_datum( Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, bool randomAccess); #ifdef PGXC diff --git a/src/test/regress/expected/gtt_function.out b/src/test/regress/expected/gtt_function.out index 560454fc5..3d685252a 100644 --- a/src/test/regress/expected/gtt_function.out +++ b/src/test/regress/expected/gtt_function.out @@ -354,7 +354,6 @@ NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "mytable_pkey" fo -- ok create index idx_gtt1_1 on gtt1 using btree (a); create index idx_gtt1_2 on gtt1 using hash (a); -ERROR: access method "hash" does not support row store create global temp table tmp_t0(c0 tsvector,c1 varchar(100)); create index idx_tmp_t0_1 on tmp_t0 using gin (c0); create index idx_tmp_t0_2 on tmp_t0 using gist (c0); diff --git a/src/test/regress/expected/hash_index_001.out b/src/test/regress/expected/hash_index_001.out new file mode 100644 index 000000000..cdbfab1e0 --- /dev/null +++ b/src/test/regress/expected/hash_index_001.out @@ -0,0 +1,213 @@ +-------------------------------- +---------- hash index ---------- +-------------------------------- +set enable_seqscan = off; +set enable_indexscan = off; +------------------ +-- hash_table_1 -- +------------------ +drop table if exists hash_table_1 cascade; +NOTICE: table "hash_table_1" does not exist, skipping +create table hash_table_1 (id int, name varchar, sex varchar default 'male'); +insert into hash_table_1 values (1, 'Smith'); +insert into hash_table_1 values (2, 'Jones'); +insert into hash_table_1 values (3, 'Williams', 'female'); +insert into hash_table_1 values (4, 'Taylor'); +insert into hash_table_1 values (5, 'Brown'); +insert into hash_table_1 values (6, 'Davies'); +drop index if exists hash_t1_id1; +NOTICE: index "hash_t1_id1" does not exist, skipping +create index hash_t1_id1 on hash_table_1 using hash (id); +-- error, does not support multicolumn indexes +drop index if exists hash_t1_id2; +NOTICE: index "hash_t1_id2" does not exist, skipping +create index hash_t1_id2 on hash_table_1 using hash (id, sex); +ERROR: access method "hash" does not support multicolumn indexes +-- compare with hash_t1_id1 and hash_t1_id3, hash index can be create in same column +drop index if exists hash_t1_id3; +NOTICE: index "hash_t1_id3" does not exist, skipping +drop index if exists hash_t1_id4; +NOTICE: index "hash_t1_id4" does not exist, skipping +create index hash_t1_id3 on hash_table_1 using btree (id); +create index hash_t1_id4 on hash_table_1 using hash (id); +-- drop superfluous index now +drop index hash_t1_id3, hash_t1_id4; +-- insert into large volumns of data into hash_table_1 +insert into hash_table_1 select 4, 'XXX', 'XXX' from 
generate_series(1,50000); +insert into hash_table_1 select 6, 'XXX', 'XXX' from generate_series(1,50000); +analyse hash_table_1; +-- after insert, hash_t1_id1 is still work +explain(costs off) select * from hash_table_1 where id = 4; + QUERY PLAN +---------------------------------------- + Bitmap Heap Scan on hash_table_1 + Recheck Cond: (id = 4) + -> Bitmap Index Scan on hash_t1_id1 + Index Cond: (id = 4) +(4 rows) + +select count(*) from hash_table_1 where id = 6; --50001 + count +------- + 50001 +(1 row) + +-- do other dml action, then check hash_t1_id1 again +insert into hash_table_1 select random()*100, 'XXX', 'XXX' from generate_series(1,50000); +update hash_table_1 set id = 101, sex = 'male' where id = 60; +delete from hash_table_1 where id = 80; +explain(costs off) select * from hash_table_1 where id = 101; + QUERY PLAN +---------------------------------------- + Bitmap Heap Scan on hash_table_1 + Recheck Cond: (id = 101) + -> Bitmap Index Scan on hash_t1_id1 + Index Cond: (id = 101) +(4 rows) + +-- cleanup env +drop table hash_table_1 cascade; +------------------ +-- hash_table_2 -- +------------------ +drop table if exists hash_table_2 cascade; +NOTICE: table "hash_table_2" does not exist, skipping +create table hash_table_2 (id int, name varchar, sex varchar default 'male'); +insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100000); +-- create index concurrently +-- In this fastcheck, we only check it can run properly. However, in a real +-- situation, you should run this sql in connection a first, then doing some DML( +-- insert, delete, update) operation about this table in connection b as soon +-- as possible. We expect the create index do not block DML operation. +-- connection a +create index concurrently hash_t2_id1 on hash_table_2 using hash (id); +-- connection b +insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100); +explain(costs off) select * from hash_table_2 where id = 40; + QUERY PLAN +---------------------------------------- + Bitmap Heap Scan on hash_table_2 + Recheck Cond: (id = 40) + -> Bitmap Index Scan on hash_t2_id1 + Index Cond: (id = 40) +(4 rows) + +-- error, does not support unique indexes +create unique index hash_t2_id2 on hash_table_2 using hash (sex); +ERROR: access method "hash" does not support unique indexes +-- hash_t2_id3 occupies more disk space than hash_t2_id2 +create index hash_t2_id2 on hash_table_2 using hash (id) with (fillfactor=25); +create index hash_t2_id3 on hash_table_2 using hash (id) with (fillfactor=75); +select count(*) from hash_table_2; --100100 + count +-------- + 100100 +(1 row) + +-- cleanup env +drop table hash_table_2 cascade; +------------------ +-- hash_table_3 -- +------------------ +drop schema if exists hash_sc_3 cascade; +NOTICE: schema "hash_sc_3" does not exist, skipping +drop tablespace if exists hash_sp_3; +NOTICE: Tablespace "hash_sp_3" does not exist, skipping. 
+create schema hash_sc_3; +create tablespace hash_sp_3 relative location 'tablespace/tablespace_1'; +create table hash_sc_3.hash_table_3 +( + id int, name varchar, + sex varchar default 'male' +) +tablespace hash_sp_3; +-- create index specify schema and tablespace +create index concurrently hash_sc_3.hash_t3_id1 on hash_sc_3.hash_table_3 using hash (id); +create index hash_sc_3.hash_t3_id2 on hash_sc_3.hash_table_3 using hash (id) tablespace hash_sp_3; +drop table hash_sc_3.hash_table_3 cascade; +drop schema hash_sc_3 cascade; +drop tablespace hash_sp_3; +------------------ +-- hash_table_4 -- +------------------ +drop table if exists hash_table_4 cascade; +NOTICE: table "hash_table_4" does not exist, skipping +create table hash_table_4 +( + id int, + name varchar, + sex varchar default 'male' +) +partition by range(id) +( + partition p1 values less than (1000), + partition p2 values less than (2000), + partition p3 values less than (3000), + partition p4 values less than (maxvalue) +); +-- hash index only support local index in partition table +drop index if exists hash_t4_id1; +NOTICE: index "hash_t4_id1" does not exist, skipping +drop index if exists hash_t4_id2; +NOTICE: index "hash_t4_id2" does not exist, skipping +drop index if exists hash_t4_id2_new; +NOTICE: index "hash_t4_id2_new" does not exist, skipping +create index hash_t4_id1 on hash_table_4 using hash(id) global; +ERROR: Global partition index only support btree. +create index hash_t4_id2 on hash_table_4 using hash(id) local +( + partition index_t4_p1, + partition index_t4_p2, + partition index_t4_p3, + partition index_t4_p4 +); +-- alter index rename, unusable +insert into hash_table_4 select random()*5000, 'XXX', 'XXX' from generate_series(1,1000); +alter index hash_t4_id2 rename to hash_t4_id2_new; +alter index hash_t4_id2_new modify partition index_t4_p2 unusable; +reindex index hash_t4_id2_new partition index_t4_p2; +drop table hash_table_4 cascade; +------------------ +-- hash_table_5 -- +------------------ +drop table if exists hash_table_5; +NOTICE: table "hash_table_5" does not exist, skipping +create temporary table hash_table_5(id int, name varchar, sex varchar default 'male'); +drop index if exists hash_t5_id1; +NOTICE: index "hash_t5_id1" does not exist, skipping +create index hash_t5_id1 on hash_table_5 using hash(id) with(fillfactor = 80); +insert into hash_table_5 select random()*100, 'XXX', 'XXX' from generate_series(1,100); +update hash_table_5 set name = 'aaa' where id = 80; +alter index hash_t5_id1 set (fillfactor = 60); +alter index hash_t5_id1 RESET (fillfactor); +explain (costs off) select * from hash_table_5 where id = 80; + QUERY PLAN +---------------------------------------- + Bitmap Heap Scan on hash_table_5 + Recheck Cond: (id = 80) + -> Bitmap Index Scan on hash_t5_id1 + Index Cond: (id = 80) +(4 rows) + +drop table hash_table_5 cascade; +------------------ +-- hash_table_6 -- +------------------ +drop table if exists hash_table_6; +NOTICE: table "hash_table_6" does not exist, skipping +create global temporary table hash_table_6(id int, name varchar, sex varchar default 'male'); +drop index if exists hash_t6_id1; +NOTICE: index "hash_t6_id1" does not exist, skipping +create index hash_t6_id1 on hash_table_6 using hash((id*10)) with (fillfactor = 30); +insert into hash_table_6 select random()*100, 'XXX', 'XXX' from generate_series(1,1000); +delete from hash_table_6 where id in (50, 60, 70); +explain (costs off) select * from hash_table_6 where id*10 = 80; + QUERY PLAN 
+---------------------------------------- + Bitmap Heap Scan on hash_table_6 + Recheck Cond: ((id * 10) = 80) + -> Bitmap Index Scan on hash_t6_id1 + Index Cond: ((id * 10) = 80) +(4 rows) + +drop table hash_table_6 cascade; diff --git a/src/test/regress/expected/hw_partition_interval_index.out b/src/test/regress/expected/hw_partition_interval_index.out index 3a32cb5a1..f5c8117dd 100644 --- a/src/test/regress/expected/hw_partition_interval_index.out +++ b/src/test/regress/expected/hw_partition_interval_index.out @@ -261,11 +261,8 @@ INTERVAL ('1 month') ); NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "interval_partition_table_003_pkey" for table "interval_partition_table_003" create index interval_partition_table_003_1 ON interval_partition_table_003 USING HASH (logdate) LOCAL; -ERROR: access method "hash" does not support row store create index interval_partition_table_003_2 ON interval_partition_table_003 USING HASH (c2) LOCAL; -ERROR: access method "hash" does not support row store create index interval_partition_table_003_3 ON interval_partition_table_003 USING HASH (c1) LOCAL; -ERROR: access method "hash" does not support row store select relname from pg_partition where INDEXTBLID=(select RELFILENODE from pg_partition where relname='interval_partition_table_003_1') order by 1; relname --------- diff --git a/src/test/regress/expected/macaddr.out b/src/test/regress/expected/macaddr.out index 1b1464be4..ad5498ca7 100644 --- a/src/test/regress/expected/macaddr.out +++ b/src/test/regress/expected/macaddr.out @@ -41,7 +41,6 @@ SELECT * FROM macaddr_data ORDER BY a; CREATE INDEX macaddr_data_btree ON macaddr_data USING btree (b); CREATE INDEX macaddr_data_hash ON macaddr_data USING hash (b); -ERROR: access method "hash" does not support row store SELECT a, b, trunc(b) FROM macaddr_data ORDER BY 2, 1; a | b | trunc ----+-------------------+------------------- diff --git a/src/test/regress/expected/single_node_enum.out b/src/test/regress/expected/single_node_enum.out index a7c80858d..e40ccd02a 100644 --- a/src/test/regress/expected/single_node_enum.out +++ b/src/test/regress/expected/single_node_enum.out @@ -362,7 +362,6 @@ DROP INDEX enumtest_btree; -- Hash index / opclass with the = operator -- CREATE INDEX enumtest_hash ON enumtest USING hash (col); -ERROR: access method "hash" does not support row store SELECT * FROM enumtest WHERE col = 'orange'; col -------- @@ -370,7 +369,6 @@ SELECT * FROM enumtest WHERE col = 'orange'; (1 row) DROP INDEX enumtest_hash; -ERROR: index "enumtest_hash" does not exist -- -- End index tests -- diff --git a/src/test/regress/expected/single_node_macaddr.out b/src/test/regress/expected/single_node_macaddr.out index 3b355edd0..28fc8f55d 100644 --- a/src/test/regress/expected/single_node_macaddr.out +++ b/src/test/regress/expected/single_node_macaddr.out @@ -41,7 +41,6 @@ SELECT * FROM macaddr_data; CREATE INDEX macaddr_data_btree ON macaddr_data USING btree (b); CREATE INDEX macaddr_data_hash ON macaddr_data USING hash (b); -ERROR: access method "hash" does not support row store SELECT a, b, trunc(b) FROM macaddr_data ORDER BY 2, 1; a | b | trunc ----+-------------------+------------------- diff --git a/src/test/regress/expected/single_node_uuid.out b/src/test/regress/expected/single_node_uuid.out index bcc0b6730..f87df8d08 100644 --- a/src/test/regress/expected/single_node_uuid.out +++ b/src/test/regress/expected/single_node_uuid.out @@ -120,7 +120,6 @@ SELECT COUNT(*) FROM guid1 WHERE guid_field >= '22222222-2222-2222-2222-22222222 -- btree 
and hash index creation test CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field); CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -ERROR: access method "hash" does not support row store -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); -- should fail @@ -131,7 +130,7 @@ DETAIL: Key (guid_field)=(11111111-1111-1111-1111-111111111111) already exists. SELECT count(*) FROM pg_class WHERE relkind='i' AND relname LIKE 'guid%'; count ------- - 2 + 3 (1 row) -- populating the test tables with additional records diff --git a/src/test/regress/expected/uuid_1.out b/src/test/regress/expected/uuid_1.out index 58aefb782..e24228f1e 100644 --- a/src/test/regress/expected/uuid_1.out +++ b/src/test/regress/expected/uuid_1.out @@ -120,7 +120,6 @@ SELECT COUNT(*) FROM guid1 WHERE guid_field >= '22222222-2222-2222-2222-22222222 -- btree and hash index creation test CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field); CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -ERROR: access method "hash" does not support row store -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); -- should fail @@ -131,7 +130,7 @@ DETAIL: Key (guid_field)=(11111111-1111-1111-1111-111111111111) already exists. SELECT count(*) FROM pg_class WHERE relkind='i' AND relname LIKE 'guid%'; count ------- - 2 + 3 (1 row) -- populating the test tables with additional records diff --git a/src/test/regress/parallel_schedule0 b/src/test/regress/parallel_schedule0 index 3c581c964..4d1de4ac0 100644 --- a/src/test/regress/parallel_schedule0 +++ b/src/test/regress/parallel_schedule0 @@ -110,6 +110,7 @@ test: single_node_random #test: single_node_portals #test: single_node_arrays #test: single_node_btree_index single_node_hash_index single_node_update +test: hash_index_001 test: single_node_update #test single_node_namespace #test: single_node_prepared_xacts diff --git a/src/test/regress/sql/hash_index_001.sql b/src/test/regress/sql/hash_index_001.sql new file mode 100644 index 000000000..3826c983c --- /dev/null +++ b/src/test/regress/sql/hash_index_001.sql @@ -0,0 +1,169 @@ +-------------------------------- +---------- hash index ---------- +-------------------------------- + +set enable_seqscan = off; +set enable_indexscan = off; +------------------ +-- hash_table_1 -- +------------------ +drop table if exists hash_table_1 cascade; +create table hash_table_1 (id int, name varchar, sex varchar default 'male'); + +insert into hash_table_1 values (1, 'Smith'); +insert into hash_table_1 values (2, 'Jones'); +insert into hash_table_1 values (3, 'Williams', 'female'); +insert into hash_table_1 values (4, 'Taylor'); +insert into hash_table_1 values (5, 'Brown'); +insert into hash_table_1 values (6, 'Davies'); + +drop index if exists hash_t1_id1; +create index hash_t1_id1 on hash_table_1 using hash (id); +-- error, does not support multicolumn indexes +drop index if exists hash_t1_id2; +create index hash_t1_id2 on hash_table_1 using hash (id, sex); + +-- compare with hash_t1_id1 and hash_t1_id3, hash index can be create in same column +drop index if exists hash_t1_id3; +drop index if exists hash_t1_id4; +create index hash_t1_id3 on hash_table_1 using btree (id); +create index hash_t1_id4 on hash_table_1 using hash (id); + +-- drop superfluous index now +drop index hash_t1_id3, hash_t1_id4; + +-- insert into large volumns of data into hash_table_1 +insert into hash_table_1 select 4, 'XXX', 'XXX' from generate_series(1,50000); +insert into 
hash_table_1 select 6, 'XXX', 'XXX' from generate_series(1,50000); +analyse hash_table_1; + +-- after insert, hash_t1_id1 is still work +explain(costs off) select * from hash_table_1 where id = 4; +select count(*) from hash_table_1 where id = 6; --50001 + +-- do other dml action, then check hash_t1_id1 again +insert into hash_table_1 select random()*100, 'XXX', 'XXX' from generate_series(1,50000); +update hash_table_1 set id = 101, sex = 'male' where id = 60; +delete from hash_table_1 where id = 80; +explain(costs off) select * from hash_table_1 where id = 101; + +-- cleanup env +drop table hash_table_1 cascade; + +------------------ +-- hash_table_2 -- +------------------ +drop table if exists hash_table_2 cascade; +create table hash_table_2 (id int, name varchar, sex varchar default 'male'); +insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100000); + +-- create index concurrently +-- In this fastcheck, we only check it can run properly. However, in a real +-- situation, you should run this sql in connection a first, then doing some DML( +-- insert, delete, update) operation about this table in connection b as soon +-- as possible. We expect the create index do not block DML operation. +-- connection a +create index concurrently hash_t2_id1 on hash_table_2 using hash (id); +-- connection b +insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100); +explain(costs off) select * from hash_table_2 where id = 40; + +-- error, does not support unique indexes +create unique index hash_t2_id2 on hash_table_2 using hash (sex); + +-- hash_t2_id3 occupies more disk space than hash_t2_id2 +create index hash_t2_id2 on hash_table_2 using hash (id) with (fillfactor=25); +create index hash_t2_id3 on hash_table_2 using hash (id) with (fillfactor=75); + +select count(*) from hash_table_2; --100100 + +-- cleanup env +drop table hash_table_2 cascade; + +------------------ +-- hash_table_3 -- +------------------ +drop schema if exists hash_sc_3 cascade; +drop tablespace if exists hash_sp_3; +create schema hash_sc_3; +create tablespace hash_sp_3 relative location 'tablespace/tablespace_1'; +create table hash_sc_3.hash_table_3 +( + id int, name varchar, + sex varchar default 'male' +) +tablespace hash_sp_3; +-- create index specify schema and tablespace +create index concurrently hash_sc_3.hash_t3_id1 on hash_sc_3.hash_table_3 using hash (id); +create index hash_sc_3.hash_t3_id2 on hash_sc_3.hash_table_3 using hash (id) tablespace hash_sp_3; + +drop table hash_sc_3.hash_table_3 cascade; +drop schema hash_sc_3 cascade; +drop tablespace hash_sp_3; + +------------------ +-- hash_table_4 -- +------------------ +drop table if exists hash_table_4 cascade; +create table hash_table_4 +( + id int, + name varchar, + sex varchar default 'male' +) +partition by range(id) +( + partition p1 values less than (1000), + partition p2 values less than (2000), + partition p3 values less than (3000), + partition p4 values less than (maxvalue) +); + +-- hash index only support local index in partition table +drop index if exists hash_t4_id1; +drop index if exists hash_t4_id2; +drop index if exists hash_t4_id2_new; +create index hash_t4_id1 on hash_table_4 using hash(id) global; +create index hash_t4_id2 on hash_table_4 using hash(id) local +( + partition index_t4_p1, + partition index_t4_p2, + partition index_t4_p3, + partition index_t4_p4 +); + +-- alter index rename, unusable +insert into hash_table_4 select random()*5000, 'XXX', 'XXX' from generate_series(1,1000); 
+alter index hash_t4_id2 rename to hash_t4_id2_new; +alter index hash_t4_id2_new modify partition index_t4_p2 unusable; +reindex index hash_t4_id2_new partition index_t4_p2; + +drop table hash_table_4 cascade; + +------------------ +-- hash_table_5 -- +------------------ +drop table if exists hash_table_5; +create temporary table hash_table_5(id int, name varchar, sex varchar default 'male'); + +drop index if exists hash_t5_id1; +create index hash_t5_id1 on hash_table_5 using hash(id) with(fillfactor = 80); + +insert into hash_table_5 select random()*100, 'XXX', 'XXX' from generate_series(1,100); +update hash_table_5 set name = 'aaa' where id = 80; +alter index hash_t5_id1 set (fillfactor = 60); +alter index hash_t5_id1 RESET (fillfactor); +explain (costs off) select * from hash_table_5 where id = 80; +drop table hash_table_5 cascade; + +------------------ +-- hash_table_6 -- +------------------ +drop table if exists hash_table_6; +create global temporary table hash_table_6(id int, name varchar, sex varchar default 'male'); +drop index if exists hash_t6_id1; +create index hash_t6_id1 on hash_table_6 using hash((id*10)) with (fillfactor = 30); +insert into hash_table_6 select random()*100, 'XXX', 'XXX' from generate_series(1,1000); +delete from hash_table_6 where id in (50, 60, 70); +explain (costs off) select * from hash_table_6 where id*10 = 80; +drop table hash_table_6 cascade;
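The utils/tuplesort.h hunk above changes tuplesort_begin_index_hash() to take the heap relation plus high_mask/low_mask/max_buckets in place of the old single hash_mask. Below is a minimal sketch, not part of the patch, of how a hash index build might derive those arguments before calling the new prototype; it assumes the same mask derivation used by upstream PostgreSQL's _h_spoolinit(), and the helper name begin_hash_build_sort() together with its num_buckets/sort_mem_kb/max_mem_kb parameters are placeholders rather than symbols introduced by this patch.

#include "access/hash.h"
#include "utils/tuplesort.h"

static Tuplesortstate* begin_hash_build_sort(Relation heapRel, Relation indexRel, uint32 num_buckets,
                                             int sort_mem_kb, int max_mem_kb)
{
    /*
     * Hash codes sort into their final bucket order when masked the same way
     * _hash_hashkey2bucket() will mask them at insert time, so hand the
     * current bucket masks down to the sort state.
     */
    uint32 high_mask = (((uint32)1) << _hash_log2(num_buckets + 1)) - 1;
    uint32 low_mask = high_mask >> 1;
    uint32 max_buckets = num_buckets - 1;

    /* randomAccess = false: the build only reads the sorted tuples once, in order */
    return tuplesort_begin_index_hash(heapRel, indexRel, high_mask, low_mask, max_buckets,
                                      sort_mem_kb, false, max_mem_kb);
}

Passing the masks (rather than a single hash_mask) lets comparetup_index_hash() order tuples by their destination bucket exactly as _hash_hashkey2bucket() will compute it, which is what allows the sorted build to write each bucket sequentially.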