!1236 openGauss support hash index

Merge pull request !1236 from 陈栋/hash_index
opengauss-bot 2021-09-03 09:25:59 +00:00 committed by Gitee
commit 010fdac4e4
43 changed files with 6528 additions and 1306 deletions

View File

@ -13,6 +13,7 @@
#include "access/gin.h"
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/nbtree.h"

View File

@ -363,7 +363,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b
Page page;
OffsetNumber maxoff;
_hash_getlock(rel, blkno, HASH_SHARE);
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy);
page = BufferGetPage(buf);
@ -390,7 +389,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b
}
_hash_relbuf(rel, buf);
_hash_droplock(rel, blkno, HASH_SHARE);
}
/*

View File

@ -3353,12 +3353,21 @@ IndexStmt* transformIndexStmt(Oid relid, IndexStmt* stmt, const char* queryStrin
if (!isColStore && (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIN_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIST_INDEX_TYPE))) {
/* row store only support btree/gin/gist index */
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIST_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE))) {
/* row store only support btree/gin/gist/hash index */
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("access method \"%s\" does not support row store", stmt->accessMethod)));
}
if (0 == pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE) &&
t_thrd.proc->workingVersionNum < SUPPORT_HASH_XLOG_VERSION_NUM) {
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("access method \"%s\" does not support row store", stmt->accessMethod)));
}
if (isColStore && (!isPsortMothed && !isCBtreeMethod && !isCGinBtreeMethod)) {
/* column store support psort/cbtree/gin index */
ereport(ERROR,

View File

@ -59,7 +59,7 @@ bool open_join_children = true;
bool will_shutdown = false;
/* hard-wired binary version number */
const uint32 GRAND_VERSION_NUM = 92308;
const uint32 GRAND_VERSION_NUM = 92309;
const uint32 MATVIEW_VERSION_NUM = 92213;
const uint32 PARTIALPUSH_VERSION_NUM = 92087;
@ -79,6 +79,7 @@ const uint32 ML_OPT_MODEL_VERSION_NUM = 92284;
const uint32 FIX_SQL_ADD_RELATION_REF_COUNT = 92291;
const uint32 GENERATED_COL_VERSION_NUM = 92303;
const uint32 ANALYZER_HOOK_VERSION_NUM = 92306;
const uint32 SUPPORT_HASH_XLOG_VERSION_NUM = 92309;
/* This variable indicates wheather the instance is in progress of upgrade as a whole */
uint32 volatile WorkingGrandVersionNum = GRAND_VERSION_NUM;

View File

@ -389,9 +389,6 @@ static void ResourceOwnerReleaseInternal(
MemoryContextDelete(memContext);
ResourceOwnerForgetGMemContext(t_thrd.utils_cxt.TopTransactionResourceOwner, memContext);
}
/* Clean up index scans too */
ReleaseResources_hash();
}
/* Let add-on modules get a chance too */

View File

@ -109,6 +109,7 @@
#include <limits.h>
#include "access/nbtree.h"
#include "access/hash.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "commands/tablespace.h"
@ -389,6 +390,7 @@ struct Tuplesortstate {
* These variables are specific to the IndexTuple case; they are set by
* tuplesort_begin_index_xxx and used only by the IndexTuple routines.
*/
Relation heapRel; /* table the index is being built on */
Relation indexRel; /* index being built */
/* These are specific to the index_btree subcase: */
@ -396,7 +398,9 @@ struct Tuplesortstate {
bool enforceUnique; /* complain if we find duplicate tuples */
/* These are specific to the index_hash subcase: */
uint32 hash_mask; /* mask for sortable part of hash code */
uint32 high_mask; /* masks for sortable part of hash code */
uint32 low_mask;
uint32 max_buckets;
/*
* These variables are specific to the Datum case; they are set by
@ -917,7 +921,8 @@ Tuplesortstate* tuplesort_begin_index_btree(
}
Tuplesortstate* tuplesort_begin_index_hash(
Relation indexRel, uint32 hash_mask, int workMem, bool randomAccess, int maxMem)
Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask,
uint32 max_buckets, int workMem, bool randomAccess, int maxMem)
{
Tuplesortstate* state = tuplesort_begin_common(workMem, randomAccess);
MemoryContext oldcontext;
@ -927,11 +932,12 @@ Tuplesortstate* tuplesort_begin_index_hash(
#ifdef TRACE_SORT
if (u_sess->attr.attr_common.trace_sort) {
elog(LOG,
"begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c, maxMem = %d",
hash_mask,
workMem,
randomAccess ? 't' : 'f',
maxMem);
"begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
"max_buckets = 0x%x, workMem = %d, randomAccess = %c",
high_mask,
low_mask,
max_buckets,
workMem, randomAccess ? 't' : 'f');
}
#endif
@ -946,9 +952,12 @@ Tuplesortstate* tuplesort_begin_index_hash(
#endif
state->reversedirection = reversedirection_index_hash;
state->heapRel = heapRel;
state->indexRel = indexRel;
state->hash_mask = hash_mask;
state->high_mask = high_mask;
state->low_mask = low_mask;
state->max_buckets = max_buckets;
state->maxMem = maxMem * 1024L;
(void)MemoryContextSwitchTo(oldcontext);
@ -3610,8 +3619,8 @@ static int comparetup_index_btree(const SortTuple* a, const SortTuple* b, Tuples
static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tuplesortstate* state)
{
uint32 hash1;
uint32 hash2;
Bucket bucket1;
Bucket bucket2;
IndexTuple tuple1;
IndexTuple tuple2;
@ -3620,13 +3629,17 @@ static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tupleso
* that the first column of the index tuple is the hash key.
*/
Assert(!a->isnull1);
hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
state->max_buckets, state->high_mask,
state->low_mask);
Assert(!b->isnull1);
hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
state->max_buckets, state->high_mask,
state->low_mask);
if (hash1 > hash2) {
if (bucket1 > bucket2) {
return 1;
} else if (hash1 < hash2) {
} else if (bucket1 < bucket2) {
return -1;
}
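
For clarity, the new comparator orders index tuples by the bucket each hash key maps to under the build-time maxbucket/highmask/lowmask values, rather than by a raw masked hash code. A minimal standalone sketch of that ordering rule (function and variable names here are local to the sketch, not the kernel's symbols):

    #include <stdint.h>
    #include <stdio.h>

    /* mirrors the _hash_hashkey2bucket() mapping described in the README */
    static uint32_t key_to_bucket(uint32_t key, uint32_t maxbucket,
                                  uint32_t highmask, uint32_t lowmask)
    {
        uint32_t bucket = key & highmask;
        return (bucket > maxbucket) ? (bucket & lowmask) : bucket;
    }

    /* order two hash keys by the bucket they land in, as the sort now does */
    static int compare_by_bucket(uint32_t key1, uint32_t key2,
                                 uint32_t maxbucket, uint32_t highmask,
                                 uint32_t lowmask)
    {
        uint32_t b1 = key_to_bucket(key1, maxbucket, highmask, lowmask);
        uint32_t b2 = key_to_bucket(key2, maxbucket, highmask, lowmask);

        if (b1 > b2)
            return 1;
        if (b1 < b2)
            return -1;
        return 0;
    }

    int main(void)
    {
        /* 5 buckets in use: maxbucket = 4, highmask = 7, lowmask = 3 */
        printf("%d\n", compare_by_bucket(0x2D, 0x2A, 4, 7, 3)); /* prints -1 */
        return 0;
    }

Sorting by destination bucket keeps the spooled tuples in the order in which the buckets are physically filled during the index build.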

View File

@ -9,7 +9,7 @@ ifneq "$(MAKECMDGOALS)" "clean"
endif
endif
endif
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
hashsearch.o hashsort.o hashutil.o
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o\
hashsort.o hashutil.o hash_xlog.o
include $(top_srcdir)/src/gausskernel/common.mk

View File

@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to
move primary bucket pages around after they are created.
Primary bucket pages (henceforth just "bucket pages") are allocated in
power-of-2 groups, called "split points" in the code. Buckets 0 and 1
are created when the index is initialized. At the first split, buckets 2
and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated;
when bucket 8 is needed, buckets 8-15 are allocated; etc. All the bucket
pages of a power-of-2 group appear consecutively in the index. This
addressing scheme allows the physical location of a bucket page to be
computed from the bucket number relatively easily, using only a small
amount of control information. We take the log2() of the bucket number
to determine which split point S the bucket belongs to, and then simply
add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
metapage) to compute the physical address. hashm_spares[S] can be
interpreted as the total number of overflow pages that have been allocated
before the bucket pages of splitpoint S. hashm_spares[0] is always 0,
so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
block numbers 1 and 2, just after the meta page. We always have
hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow
pages appearing between the bucket page groups of splitpoints N and N+1.
power-of-2 groups, called "split points" in the code. That means at every new
splitpoint we double the existing number of buckets. Allocating huge chunks
of bucket pages all at once isn't optimal and we will take ages to consume
those. To avoid this exponential growth of index size, we did use a trick to
break up allocation of buckets at the splitpoint into 4 equal phases. If
(2 ^ x) are the total buckets need to be allocated at a splitpoint (from now on
we shall call this as a splitpoint group), then we allocate 1/4th (2 ^ (x - 2))
of total buckets at each phase of splitpoint group. Next quarter of allocation
will only happen if buckets of the previous phase have been already consumed.
For the initial splitpoint groups < 10 we will allocate all of their buckets in
single phase only, as number of buckets allocated at initial groups are small
in numbers. And for the groups >= 10 the allocation process is distributed
among four equal phases. At group 10 we allocate (2 ^ 9) buckets in 4
different phases {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, the numbers in curly braces
indicate the number of buckets allocated within each phase of splitpoint group
10. And, for splitpoint group 11 and 12 allocation phases will be
{2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9} respectively. We
can see that at each splitpoint group we double the total number of buckets
from the previous group but in an incremental phase. The bucket pages
allocated within one phase of a splitpoint group will appear consecutively in
the index. This addressing scheme allows the physical location of a bucket
page to be computed from the bucket number relatively easily, using only a
small amount of control information. If we look at the function
_hash_spareindex for a given bucket number we first compute the
splitpoint group it belongs to and then the phase to which the bucket belongs
to. Adding them we get the global splitpoint phase number S to which the
bucket belongs and then simply add "hashm_spares[S] + 1" (where hashm_spares[]
is an array stored in the metapage) with given bucket number to compute its
physical address. The hashm_spares[S] can be interpreted as the total number
of overflow pages that have been allocated before the bucket pages of
splitpoint phase S. The hashm_spares[0] is always 0, so that buckets 0 and 1
always appear at block numbers 1 and 2, just after the meta page. We always
have hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow pages
appearing between the bucket page groups of splitpoints phase N and N+1.
(Note: the above describes what happens when filling an initially minimally
sized hash index. In practice, we try to estimate the required index size
and allocate a suitable number of splitpoints immediately, to avoid
sized hash index. In practice, we try to estimate the required index size and
allocate a suitable number of splitpoint phases immediately, to avoid
expensive re-splitting during initial index build.)
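
To make the phase arithmetic above concrete, here is a small standalone sketch. It is not the kernel's _hash_spareindex(), just the same computation under the rules stated above (groups below 10 are single-phase, groups of 10 and above are split into four phases); the constant names are local to the sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumptions taken from the text above. */
    #define SINGLE_PHASE_GROUPS 10   /* groups below this are one phase */
    #define PHASES_PER_GROUP    4

    /* splitpoint group of bucket B is ceil(log2(B + 1)) */
    static uint32_t splitpoint_group(uint32_t bucket)
    {
        uint32_t g = 0;
        while (((uint64_t) 1 << g) < (uint64_t) bucket + 1)
            g++;
        return g;
    }

    /*
     * Global splitpoint phase S of a bucket, i.e. the index used with
     * hashm_spares[] in the addressing scheme described above.
     */
    static uint32_t splitpoint_phase(uint32_t bucket)
    {
        uint32_t g = splitpoint_group(bucket);

        if (g < SINGLE_PHASE_GROUPS)
            return g;                 /* the whole group is a single phase */

        /* each of the four phases of group g covers 2^(g-3) buckets */
        uint32_t phase_in_group = (bucket >> (g - 3)) & (PHASES_PER_GROUP - 1);

        return SINGLE_PHASE_GROUPS
               + (g - SINGLE_PHASE_GROUPS) * PHASES_PER_GROUP
               + phase_in_group;
    }

    int main(void)
    {
        /* buckets 512..1023 form group 10 and span global phases 10..13 */
        printf("%u %u %u\n",
               splitpoint_phase(3),     /* group 2  -> phase 2  */
               splitpoint_phase(512),   /* group 10 -> phase 10 */
               splitpoint_phase(1023)); /* group 10 -> phase 13 */
        return 0;
    }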
When S splitpoints exist altogether, the array entries hashm_spares[0]
through hashm_spares[S] are valid; hashm_spares[S] records the current
total number of overflow pages. New overflow pages are created as needed
at the end of the index, and recorded by incrementing hashm_spares[S].
When it is time to create a new splitpoint's worth of bucket pages, we
When it is time to create a new splitpoint phase's worth of bucket pages, we
copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
stored in the hashm_ovflpoint field of the meta page). This has the
effect of reserving the correct number of bucket pages at the end of the
@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during
an index extension we crash after allocating filesystem space and before
updating the metapage. Note that on filesystems that allow "holes" in
files, it's entirely likely that pages before the logical EOF are not yet
allocated: when we allocate a new splitpoint's worth of bucket pages, we
allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
physically zero the last such page to force the EOF up, and the first such
page will be used immediately, but the intervening pages are not written
until needed.
@ -126,61 +142,98 @@ the initially created buckets.
Lock Definitions
----------------
We use both lmgr locks ("heavyweight" locks) and buffer context locks
(LWLocks) to control access to a hash index. lmgr locks are needed for
long-term locking since there is a (small) risk of deadlock, which we must
be able to detect. Buffer context locks are used for short-term access
control to individual pages of the index.
Concurrency control for hash indexes is provided using buffer content
locks, buffer pins, and cleanup locks. Here as elsewhere in PostgreSQL,
cleanup lock means that we hold an exclusive lock on the buffer and have
observed at some point after acquiring the lock that we hold the only pin
on that buffer. For hash indexes, a cleanup lock on a primary bucket page
represents the right to perform an arbitrary reorganization of the entire
bucket. Therefore, scans retain a pin on the primary bucket page for the
bucket they are currently scanning. Splitting a bucket requires a cleanup
lock on both the old and new primary bucket pages. VACUUM therefore takes
a cleanup lock on every bucket page in order to remove tuples. It can also
remove tuples copied to a new bucket by any previous split operation, because
the cleanup lock taken on the primary bucket page guarantees that no scans
which started prior to the most recent split can still be in progress. After
cleaning each page individually, it attempts to take a cleanup lock on the
primary bucket page in order to "squeeze" the bucket down to the minimum
possible number of pages.
We define the following lmgr locks for a hash index:
To avoid deadlocks, we must be consistent about the lock order in which we
lock the buckets for operations that require locks on two different buckets.
We choose to always lock the lower-numbered bucket first. The metapage is
only ever locked after all bucket locks have been taken.
LockPage(rel, 0) represents the right to modify the hash-code-to-bucket
mapping. A process attempting to enlarge the hash table by splitting a
bucket must exclusive-lock this lock before modifying the metapage data
representing the mapping. Processes intending to access a particular
bucket must share-lock this lock until they have acquired lock on the
correct target bucket.
LockPage(rel, page), where page is the page number of a hash bucket page,
represents the right to split or compact an individual bucket. A process
splitting a bucket must exclusive-lock both old and new halves of the
bucket until it is done. A process doing VACUUM must exclusive-lock the
bucket it is currently purging tuples from. Processes doing scans or
insertions must share-lock the bucket they are scanning or inserting into.
(It is okay to allow concurrent scans and insertions.)
Metapage Caching
----------------
The lmgr lock IDs corresponding to overflow pages are currently unused.
These are available for possible future refinements.
Both scanning the index and inserting tuples require locating the bucket
where a given tuple ought to be located. To do this, we need the bucket
count, highmask, and lowmask from the metapage; however, it's undesirable
for performance reasons to have to lock and pin the metapage for
every such operation. Instead, we retain a cached copy of the metapage
in each backend's relcache entry. This will produce the correct
bucket mapping as long as the target bucket hasn't been split since the
last cache refresh.
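
The mapping itself is cheap once those three values are cached. A minimal standalone sketch of the logic, mirroring what _hash_hashkey2bucket() does with the cached maxbucket, highmask and lowmask (names here are local to the sketch):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t hashkey_to_bucket(uint32_t hashkey, uint32_t maxbucket,
                                      uint32_t highmask, uint32_t lowmask)
    {
        uint32_t bucket = hashkey & highmask;

        /*
         * If the masked value names a bucket that does not exist yet (it has
         * not been split off), fall back to the smaller table size.
         */
        if (bucket > maxbucket)
            bucket = bucket & lowmask;

        return bucket;
    }

    int main(void)
    {
        /* e.g. 5 buckets in use: maxbucket = 4, highmask = 7, lowmask = 3 */
        printf("%u\n", hashkey_to_bucket(0x2Au, 4, 7, 3)); /* 0x2A & 7 = 2 */
        printf("%u\n", hashkey_to_bucket(0x2Du, 4, 7, 3)); /* 5 > 4, & 3 = 1 */
        return 0;
    }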
Note that these lock definitions are conceptually distinct from any sort
of lock on the pages whose numbers they share. A process must also obtain
read or write buffer lock on the metapage or bucket page before accessing
said page.
To guard against the possibility that such a split has occurred, the
primary page of each bucket chain stores the number of buckets that
existed as of the time the bucket was last split, or if never split as
of the time it was created, in the space normally used for the
previous block number (that is, hasho_prevblkno). This doesn't cost
anything because the primary bucket page is always the first page in
the chain, and the previous block number is therefore always, in
reality, InvalidBlockNumber.
Processes performing hash index scans must hold share lock on the bucket
they are scanning throughout the scan. This seems to be essential, since
there is no reasonable way for a scan to cope with its bucket being split
underneath it. This creates a possibility of deadlock external to the
hash index code, since a process holding one of these locks could block
waiting for an unrelated lock held by another process. If that process
then does something that requires exclusive lock on the bucket, we have
deadlock. Therefore the bucket locks must be lmgr locks so that deadlock
can be detected and recovered from. This also forces the page-zero lock
to be an lmgr lock, because as we'll see below it is held while attempting
to acquire a bucket lock, and so it could also participate in a deadlock.
After computing the ostensibly-correct bucket number based on our cached
copy of the metapage, we lock the corresponding primary bucket page and
check whether the bucket count stored in hasho_prevblkno is greater than
the number of buckets stored in our cached copy of the metapage. If
so, the bucket has certainly been split, because the count must originally
have been less than the number of buckets that existed at that time and
can't have increased except due to a split. If not, the bucket can't have
been split, because a split would have created a new bucket with a higher
bucket number than any we'd seen previously. In the latter case, we've
locked the correct bucket and can proceed; in the former case, we must
release the lock on this bucket, lock the metapage, update our cache,
unlock the metapage, and retry.
Processes must obtain read (share) buffer context lock on any hash index
page while reading it, and write (exclusive) lock while modifying it.
To prevent deadlock we enforce these coding rules: no buffer lock may be
held long term (across index AM calls), nor may any buffer lock be held
while waiting for an lmgr lock, nor may more than one buffer lock
be held at a time by any one process. (The third restriction is probably
stronger than necessary, but it makes the proof of no deadlock obvious.)
Needing to retry occasionally might seem expensive, but the number of times
any given bucket can be split is limited to a few dozen no matter how
many times the hash index is accessed, because the total number of
buckets is limited to less than 2^32. On the other hand, the number of
times we access a bucket is unbounded and will be several orders of
magnitude larger even in unsympathetic cases.
(The metapage cache is new in PostgreSQL 10's hash index code. Older hash
indexes had the primary bucket page's hasho_prevblkno initialized to
InvalidBlockNumber.)
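
A compact standalone sketch of the staleness test described above (the names are invented for the sketch; the real code works on buffer and page structures):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * page_prev_maxbucket: the bucket count stored in the primary bucket
     * page's hasho_prevblkno field (as of its last split, or its creation).
     * cached_maxbucket:    hashm_maxbucket from our cached metapage copy.
     *
     * If the page's value is larger, the bucket has been split since we
     * cached the metapage, so we must refresh the cache and retry.
     */
    static bool cached_mapping_still_valid(uint32_t page_prev_maxbucket,
                                           uint32_t cached_maxbucket)
    {
        return page_prev_maxbucket <= cached_maxbucket;
    }

    int main(void)
    {
        printf("%d\n", cached_mapping_still_valid(4, 8));  /* 1: proceed */
        printf("%d\n", cached_mapping_still_valid(16, 8)); /* 0: refresh */
        return 0;
    }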
Pseudocode Algorithms
---------------------
Various flags used in hash index operations are described below:
The bucket-being-split and bucket-being-populated flags indicate that a split
operation is in progress for a bucket. During a split operation, the
bucket-being-split flag is set on the old bucket and the bucket-being-populated
flag is set on the new bucket. These flags are cleared once the split operation
is finished.
The split-cleanup flag indicates that a bucket which has been recently split
still contains tuples that were also copied to the new bucket; it essentially
marks the split as incomplete. Once we're certain that no scans which
started before the new bucket was fully populated are still in progress, we
can remove the copies from the old bucket and clear the flag. We insist that
this flag must be clear before splitting a bucket; thus, a bucket can't be
split again until the previous split is totally complete.
The moved-by-split flag on a tuple indicates that the tuple was moved from the
old bucket to the new one. Concurrent scans skip such tuples until the split
operation is finished. Once a tuple is marked as moved-by-split it remains so
forever, but that does no harm. We intentionally do not clear the flag, since
clearing it would generate additional I/O for no benefit.
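
A small illustrative sketch of how these flags gate a new split; the bit values and names below are placeholders modeled on the description above, not necessarily those used in the headers:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BUCKET_BEING_POPULATED     (1u << 0)  /* set on the new bucket */
    #define BUCKET_BEING_SPLIT         (1u << 1)  /* set on the old bucket */
    #define BUCKET_NEEDS_SPLIT_CLEANUP (1u << 2)  /* old copies not yet removed */

    /* A bucket may not be split again until the previous split is complete. */
    static bool bucket_can_split(uint16_t bucket_flags)
    {
        return (bucket_flags & (BUCKET_BEING_SPLIT |
                                BUCKET_NEEDS_SPLIT_CLEANUP)) == 0;
    }

    int main(void)
    {
        printf("%d\n", bucket_can_split(0));                          /* 1 */
        printf("%d\n", bucket_can_split(BUCKET_NEEDS_SPLIT_CLEANUP)); /* 0 */
        return 0;
    }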
The operations we need to support are: readers scanning the index for
entries of a particular hash code (which by definition are all in the same
bucket); insertion of a new tuple into the correct bucket; enlarging the
@ -195,57 +248,75 @@ track of available overflow pages.
The reader algorithm is:
share-lock page 0 (to prevent active split)
read/sharelock meta page
compute bucket number for target hash key
release meta page
share-lock bucket page (to prevent split/compact of this bucket)
release page 0 share-lock
lock the primary bucket page of the target bucket
if the target bucket is still being populated by a split:
release the buffer content lock on current bucket page
pin and acquire the buffer content lock on old bucket in shared mode
release the buffer content lock on old bucket, but not pin
retake the buffer content lock on new bucket
arrange to scan the old bucket normally and the new bucket for
tuples which are not moved-by-split
-- then, per read request:
read/sharelock current page of bucket
step to next page if necessary (no chaining of locks)
reacquire content lock on current page
step to next page if necessary (no chaining of content locks, but keep
the pin on the primary bucket throughout the scan; we also maintain
a pin on the page currently being scanned)
get tuple
release current page
release content lock
-- at scan shutdown:
release bucket share-lock
release all pins still held
By holding the page-zero lock until lock on the target bucket is obtained,
the reader ensures that the target bucket calculation is valid (otherwise
the bucket might be split before the reader arrives at it, and the target
entries might go into the new bucket). Holding the bucket sharelock for
the remainder of the scan prevents the reader's current-tuple pointer from
being invalidated by splits or compactions. Notice that the reader's lock
does not prevent other buckets from being split or compacted.
Holding the buffer pin on the primary bucket page for the whole scan prevents
the reader's current-tuple pointer from being invalidated by splits or
compactions. (Of course, other buckets can still be split or compacted.)
To keep concurrency reasonably good, we require readers to cope with
concurrent insertions, which means that they have to be able to re-find
their current scan position after re-acquiring the page sharelock. Since
deletion is not possible while a reader holds the bucket sharelock, and
we assume that heap tuple TIDs are unique, this can be implemented by
their current scan position after re-acquiring the buffer content lock on
the page. Since deletion is not possible while a reader holds the pin on the bucket,
and we assume that heap tuple TIDs are unique, this can be implemented by
searching for the same heap tuple TID previously returned. Insertion does
not move index entries across pages, so the previously-returned index entry
should always be on the same page, at the same or higher offset number,
as it was before.
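
A standalone toy sketch of that re-find step (the TupleId type and the 0-based offsets are simplifications for the sketch; the real code walks a page's line pointers):

    #include <stdint.h>
    #include <stdio.h>

    /* toy stand-in for ItemPointerData: block + offset of the heap tuple */
    typedef struct { uint32_t block; uint16_t offset; } TupleId;

    static int tid_equal(TupleId a, TupleId b)
    {
        return a.block == b.block && a.offset == b.offset;
    }

    /*
     * Re-find a previously returned index entry on the same page.  Because
     * insertions only shift entries to higher offsets and never across pages,
     * searching forward from the old position is sufficient.  Returns the new
     * index, or -1 if not found (which should not happen under the rules above).
     */
    static long refind_position(const TupleId *entries, size_t nentries,
                                size_t old_pos, TupleId prev_returned)
    {
        for (size_t i = old_pos; i < nentries; i++) {
            if (tid_equal(entries[i], prev_returned))
                return (long) i;
        }
        return -1;
    }

    int main(void)
    {
        TupleId page[] = { {1, 1}, {1, 7}, {2, 3}, {2, 9} }; /* after an insert */
        TupleId prev = {2, 3};
        printf("%ld\n", refind_position(page, 4, 1, prev));  /* prints 2 */
        return 0;
    }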
To allow for scans during a bucket split, if at the start of the scan the
bucket is marked as bucket-being-populated, the scan reads all the tuples in
that bucket except for those that are marked as moved-by-split. Once it
finishes scanning all the tuples in the current bucket, it scans the old bucket
from which this bucket was formed by the split.
The insertion algorithm is rather similar:
share-lock page 0 (to prevent active split)
read/sharelock meta page
compute bucket number for target hash key
release meta page
share-lock bucket page (to prevent split/compact of this bucket)
release page 0 share-lock
-- (so far same as reader)
read/exclusive-lock current page of bucket
if full, release, read/exclusive-lock next page; repeat as needed
lock the primary bucket page of the target bucket
-- (so far same as reader, except for acquisition of buffer content lock in
exclusive mode on primary bucket page)
if the bucket-being-split flag is set for a bucket and pin count on it is
one, then finish the split
release the buffer content lock on current bucket
get the "new" bucket which was being populated by the split
scan the new bucket and form the hash table of TIDs
conditionally get the cleanup lock on old and new buckets
if we get the lock on both the buckets
finish the split using algorithm mentioned below for split
release the pin on old bucket and restart the insert from beginning.
if current page is full, first check if this page contains any dead tuples.
if yes, remove dead tuples from the current page and again check for the
availability of the space. If enough space found, insert the tuple else
release lock but not pin, read/exclusive-lock
next page; repeat as needed
>> see below if no space in any page of bucket
take buffer content lock in exclusive mode on metapage
insert tuple at appropriate place in page
write/release current page
release bucket share-lock
read/exclusive-lock meta page
mark current page dirty
increment tuple count, decide if split needed
write/release meta page
done if no split needed, else enter Split algorithm below
mark meta page dirty
write WAL for insertion of tuple
release the buffer content lock on metapage
release buffer content lock on current page
if current page is not a bucket page, release the pin on bucket page
if split is needed, enter Split algorithm below
release the pin on metapage
To speed searches, the index entries within any individual index page are
kept sorted by hash code; the insertion code must take care to insert new
@ -254,11 +325,13 @@ bucket that is being actively scanned, because readers can cope with this
as explained above. We only need the short-term buffer locks to ensure
that readers do not see a partially-updated page.
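
Because entries on a page are ordered by hash code, finding the insertion point (and the start of a scan's matching range) is a binary search. A minimal sketch of that search, assuming the page's hash codes are presented as a plain array:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Return the first position whose hash code is >= want, i.e. where a new
     * entry with hash code "want" should be inserted to keep the page sorted.
     */
    static size_t hash_binsearch(const uint32_t *hashes, size_t n, uint32_t want)
    {
        size_t lo = 0;
        size_t hi = n;                  /* search the half-open range [lo, hi) */

        while (lo < hi) {
            size_t mid = lo + (hi - lo) / 2;

            if (hashes[mid] < want)
                lo = mid + 1;
            else
                hi = mid;
        }
        return lo;
    }

    int main(void)
    {
        uint32_t page[] = { 10, 20, 20, 35, 90 };
        printf("%zu\n", hash_binsearch(page, 5, 20));  /* prints 1 */
        printf("%zu\n", hash_binsearch(page, 5, 40));  /* prints 4 */
        return 0;
    }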
It is clearly impossible for readers and inserters to deadlock, and in
fact this algorithm allows them a very high degree of concurrency.
(The exclusive metapage lock taken to update the tuple count is stronger
than necessary, since readers do not care about the tuple count, but the
lock is held for such a short time that this is probably not an issue.)
To avoid deadlock between readers and inserters, whenever there is a need
to lock multiple buckets, we always take the locks in the order suggested in Lock
Definitions above. This algorithm allows them a very high degree of
concurrency. (The exclusive metapage lock taken to update the tuple count
is stronger than necessary, since readers do not care about the tuple count,
but the lock is held for such a short time that this is probably not an
issue.)
When an inserter cannot find space in any existing page of a bucket, it
must obtain an overflow page and add that page to the bucket's chain.
@ -269,82 +342,95 @@ index is overfull (has a higher-than-wanted ratio of tuples to buckets).
The algorithm attempts, but does not necessarily succeed, to split one
existing bucket in two, thereby lowering the fill ratio:
exclusive-lock page 0 (assert the right to begin a split)
read/exclusive-lock meta page
check split still needed
if split not needed anymore, drop locks and exit
decide which bucket to split
Attempt to X-lock old bucket number (definitely could fail)
Attempt to X-lock new bucket number (shouldn't fail, but...)
if above fail, drop locks and exit
update meta page to reflect new number of buckets
write/release meta page
release X-lock on page 0
-- now, accesses to all other buckets can proceed.
Perform actual split of bucket, moving tuples as needed
>> see below about acquiring needed extra space
Release X-locks of old and new buckets
pin meta page and take buffer content lock in exclusive mode
check split still needed
if split not needed anymore, drop buffer content lock and pin and exit
decide which bucket to split
try to take a cleanup lock on that bucket; if fail, give up
if that bucket is still being split or has split-cleanup work:
try to finish the split and the cleanup work
if that succeeds, start over; if it fails, give up
mark the old and new buckets indicating split is in progress
mark both old and new buckets as dirty
write WAL for allocation of new page for split
copy the tuples that belong to the new bucket from the old bucket, marking
them as moved-by-split
write WAL record for moving tuples to new page once the new page is full
or all the pages of old bucket are finished
release lock but not pin for primary bucket page of old bucket,
read/shared-lock next page; repeat as needed
clear the bucket-being-split and bucket-being-populated flags
mark the old bucket indicating split-cleanup
write WAL for changing the flags on both old and new buckets
Note the page zero and metapage locks are not held while the actual tuple
rearrangement is performed, so accesses to other buckets can proceed in
parallel; in fact, it's possible for multiple bucket splits to proceed
in parallel.
Split's attempt to X-lock the old bucket number could fail if another
process holds S-lock on it. We do not want to wait if that happens, first
because we don't want to wait while holding the metapage exclusive-lock,
and second because it could very easily result in deadlock. (The other
process might be out of the hash AM altogether, and could do something
that blocks on another lock this process holds; so even if the hash
algorithm itself is deadlock-free, a user-induced deadlock could occur.)
So, this is a conditional LockAcquire operation, and if it fails we just
abandon the attempt to split. This is all right since the index is
overfull but perfectly functional. Every subsequent inserter will try to
split, and eventually one will succeed. If multiple inserters failed to
split, the index might still be overfull, but eventually, the index will
The split operation's attempt to acquire cleanup-lock on the old bucket number
could fail if another process holds any lock or pin on it. We do not want to
wait if that happens, because we don't want to wait while holding the metapage
exclusive-lock. So, this is a conditional LWLockAcquire operation, and if
it fails we just abandon the attempt to split. This is all right since the
index is overfull but perfectly functional. Every subsequent inserter will
try to split, and eventually one will succeed. If multiple inserters failed
to split, the index might still be overfull, but eventually, the index will
not be overfull and split attempts will stop. (We could make a successful
splitter loop to see if the index is still overfull, but it seems better to
distribute the split overhead across successive insertions.)
A problem is that if a split fails partway through (eg due to insufficient
disk space) the index is left corrupt. The probability of that could be
made quite low if we grab a free page or two before we update the meta
page, but the only real solution is to treat a split as a WAL-loggable,
must-complete action. I'm not planning to teach hash about WAL in this
go-round.
If a split fails partway through (e.g. due to insufficient disk space or an
interrupt), the index will not be corrupted. Instead, we'll retry the split
every time a tuple is inserted into the old bucket prior to inserting the new
tuple; eventually, we should succeed. The fact that a split is left
unfinished doesn't prevent subsequent buckets from being split, but we won't
try to split the bucket again until the prior split is finished. In other
words, a bucket can be in the middle of being split for some time, but it can't
be in the middle of two splits at the same time.
The fourth operation is garbage collection (bulk deletion):
next bucket := 0
read/sharelock meta page
pin metapage and take buffer content lock in exclusive mode
fetch current max bucket number
release meta page
release meta page buffer content lock and pin
while next bucket <= max bucket do
Acquire X lock on target bucket
Scan and remove tuples, compact free space as needed
Release X lock
acquire cleanup lock on primary bucket page
loop:
scan and remove tuples
mark the target page dirty
write WAL for deleting tuples from target page
if this is the last bucket page, break out of loop
pin and x-lock next page
release prior lock and pin (except keep pin on primary bucket page)
if the page we have locked is not the primary bucket page:
release lock and take exclusive lock on primary bucket page
if there are no other pins on the primary bucket page:
squeeze the bucket to remove free space
release the pin on primary bucket page
next bucket ++
end loop
exclusive-lock meta page
pin metapage and take buffer content lock in exclusive mode
check if number of buckets changed
if so, release lock and return to for-each-bucket loop
if so, release content lock and pin and return to for-each-bucket loop
else update metapage tuple count
write/release meta page
mark meta page dirty and write WAL for update of metapage
release buffer content lock and pin
Note that this is designed to allow concurrent splits. If a split occurs,
tuples relocated into the new bucket will be visited twice by the scan,
but that does no harm. (We must however be careful about the statistics
Note that this is designed to allow concurrent splits and scans. If a split
occurs, tuples relocated into the new bucket will be visited twice by the
scan, but that does no harm. Because we release the lock on the bucket page
during the cleanup scan of a bucket, a concurrent scan can start on that
bucket, and the scan will always stay behind cleanup. Scans must be kept
behind cleanup; otherwise, vacuum could remove TIDs that are still required to
complete the scan. Since a scan that returns multiple tuples from the same
bucket page always expects the next valid TID to be greater than or equal to
the current TID, it could otherwise miss tuples. This holds true for backward
scans as well (backward scans first traverse each bucket starting from the
first bucket to the last overflow page in the chain). We must be careful about
the statistics
reported by the VACUUM operation. What we can do is count the number of
tuples scanned, and believe this in preference to the stored tuple count
if the stored tuple count and number of buckets did *not* change at any
time during the scan. This provides a way of correcting the stored tuple
count if it gets out of sync for some reason. But if a split or insertion
does occur concurrently, the scan count is untrustworthy; instead,
subtract the number of tuples deleted from the stored tuple count and
use that.)
The exclusive lock request could deadlock in some strange scenarios, but
we can just error out without any great harm being done.
tuples scanned, and believe this in preference to the stored tuple count if
the stored tuple count and number of buckets did *not* change at any time
during the scan. This provides a way of correcting the stored tuple count if
it gets out of sync for some reason. But if a split or insertion does occur
concurrently, the scan count is untrustworthy; instead, subtract the number of
tuples deleted from the stored tuple count and use that.
Free Space Management
@ -360,25 +446,23 @@ overflow page to the free pool.
Obtaining an overflow page:
read/exclusive-lock meta page
take metapage content lock in exclusive mode
determine next bitmap page number; if none, exit loop
release meta page lock
read/exclusive-lock bitmap page
release meta page content lock
pin bitmap page and take content lock in exclusive mode
search for a free page (zero bit in bitmap)
if found:
set bit in bitmap
write/release bitmap page
read/exclusive-lock meta page
mark bitmap page dirty
take metapage buffer content lock in exclusive mode
if first-free-bit value did not change,
update it and write meta page
release meta page
return page number
update it and mark meta page dirty
else (not found):
release bitmap page
release bitmap page buffer content lock
loop back to try next bitmap page, if any
-- here when we have checked all bitmap pages; we hold meta excl. lock
extend index to add another overflow page; update meta information
write/release meta page
mark meta page dirty
return page number
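
The "search for a free page (zero bit in bitmap)" step reduces to finding the first clear bit in a sequence of bitmap words. A standalone sketch of that scan (not the kernel's bitmap code; the word size and layout are assumptions of the sketch):

    #include <stdint.h>
    #include <stdio.h>

    #define BITS_PER_MAP_WORD 32

    /*
     * Return the index of the first clear (free) bit in a bitmap made of
     * 32-bit words, or -1 if every bit is set.
     */
    static long first_free_bit(const uint32_t *words, size_t nwords)
    {
        for (size_t w = 0; w < nwords; w++) {
            if (words[w] != UINT32_MAX) {          /* at least one free bit */
                uint32_t word = words[w];
                for (int b = 0; b < BITS_PER_MAP_WORD; b++) {
                    if ((word & ((uint32_t) 1 << b)) == 0)
                        return (long) (w * BITS_PER_MAP_WORD + b);
                }
            }
        }
        return -1;                                 /* bitmap page is full */
    }

    int main(void)
    {
        uint32_t bitmap[2] = { UINT32_MAX, 0x0000000F };  /* bits 0..35 set */
        printf("%ld\n", first_free_bit(bitmap, 2));       /* prints 36 */
        return 0;
    }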
It is slightly annoying to release and reacquire the metapage lock
@ -398,12 +482,17 @@ like this:
-- having determined that no space is free in the target bucket:
remember last page of bucket, drop write lock on it
call free-page-acquire routine
re-write-lock last page of bucket
if it is not last anymore, step to the last page
update (former) last page to point to new page
execute free-page-acquire (obtaining an overflow page) mechanism
described above
update (former) last page to point to the new page and mark buffer dirty
write-lock and initialize new page, with back link to former last page
write and release former last page
write WAL for addition of overflow page
release the locks on meta page and bitmap page acquired in
free-page-acquire algorithm
release the lock on former last page
release the lock on new overflow page
insert tuple into new page
-- etc.
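
As a toy illustration of the pointer updates in the last few steps: the former last page gains a forward link to the new page, and the new page is initialized with a back link to it. The struct below is a stand-in for the page headers' hasho_prevblkno/hasho_nextblkno fields, not the on-disk representation:

    #include <stdio.h>
    #include <stdlib.h>

    /* toy stand-in for a page in a bucket's overflow chain */
    typedef struct ChainPage {
        struct ChainPage *prev;   /* like hasho_prevblkno */
        struct ChainPage *next;   /* like hasho_nextblkno */
    } ChainPage;

    /* Append a freshly acquired overflow page to the end of the chain. */
    static ChainPage *add_overflow_page(ChainPage *last)
    {
        ChainPage *ovfl = calloc(1, sizeof(ChainPage));
        if (ovfl == NULL)
            abort();              /* out of memory; fine for a toy */

        ovfl->prev = last;        /* back link to the former last page */
        ovfl->next = NULL;
        last->next = ovfl;        /* former last page now points forward */
        return ovfl;
    }

    int main(void)
    {
        ChainPage bucket = { NULL, NULL };   /* primary bucket page */
        ChainPage *ovfl = add_overflow_page(&bucket);
        printf("%d %d\n", bucket.next == ovfl, ovfl->prev == &bucket); /* 1 1 */
        free(ovfl);
        return 0;
    }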
@ -418,27 +507,27 @@ free page; there can be no other process holding lock on it.
Bucket splitting uses a similar algorithm if it has to extend the new
bucket, but it need not worry about concurrent extension since it has
exclusive lock on the new bucket.
buffer content lock in exclusive mode on the new bucket.
Freeing an overflow page is done by garbage collection and by bucket
splitting (the old bucket may contain no-longer-needed overflow pages).
In both cases, the process holds exclusive lock on the containing bucket,
so need not worry about other accessors of pages in the bucket. The
algorithm is:
Freeing an overflow page requires the process to hold buffer content lock in
exclusive mode on the containing bucket, so it need not worry about other
accessors of pages in the bucket. The algorithm is:
delink overflow page from bucket chain
(this requires read/update/write/release of fore and aft siblings)
read/share-lock meta page
pin meta page and take buffer content lock in shared mode
determine which bitmap page contains the free space bit for page
release meta page
read/exclusive-lock bitmap page
release meta page buffer content lock
pin bitmap page and take buffer content lock in exclusive mode
retake meta page buffer content lock in exclusive mode
move (insert) tuples that belong to the overflow page being freed
update bitmap bit
write/release bitmap page
if page number is less than what we saw as first-free-bit in meta:
read/exclusive-lock meta page
mark bitmap page dirty
if page number is still less than first-free-bit,
update first-free-bit field and write meta page
release meta page
update first-free-bit field and mark meta page dirty
write WAL for delinking overflow page operation
release buffer content lock and pin
release meta page buffer content lock and pin
We have to do it this way because we must clear the bitmap bit before
changing the first-free-bit field (hashm_firstfree). It is possible that
@ -448,21 +537,96 @@ page acquirer will scan more bitmap bits than he needs to. What must be
avoided is having first-free-bit greater than the actual first free bit,
because then that free page would never be found by searchers.
All the freespace operations should be called while holding no buffer
locks. Since they need no lmgr locks, deadlock is not possible.
The reason for moving tuples from the overflow page while delinking the latter
is to make that an atomic operation. Not doing so could lead to spurious reads
on a standby; basically, the user might see the same tuple twice.
WAL Considerations
------------------
Hash index operations such as create index, insert, delete, bucket split,
overflow page allocation, and squeeze do not in themselves guarantee hash index
consistency after a crash. To provide robustness, we write WAL for each of
these operations.
CREATE INDEX writes multiple WAL records. First, we write a record to cover
the initialization of the metapage, followed by one for each new bucket
created, followed by one for the initial bitmap page. It's not important for
index creation to appear atomic, because the index isn't yet visible to any
other transaction, and the creating transaction will roll back in the event of
a crash. It would be difficult to cover the whole operation with a single
write-ahead log record anyway, because we can log only a fixed number of
pages, as given by XLR_MAX_BLOCK_ID (32), with current XLog machinery.
Ordinary item insertions (that don't force a page split or need a new overflow
page) are single WAL entries. They touch a single bucket page and the
metapage. The metapage is updated during replay as it is updated during
original operation.
If an insertion causes the addition of an overflow page, there will be one
WAL entry for the new overflow page and a second entry for the insert itself.
If an insertion causes a bucket split, there will be one WAL entry for the
insert itself, followed by a WAL entry for allocating a new bucket, followed by
a WAL entry for each overflow bucket page in the new bucket to which tuples are
moved from the old bucket, followed by a WAL entry to indicate that the split
is complete for both old and new buckets. A split operation which requires
overflow pages to complete the operation will need to write a WAL record for
each new allocation of an overflow page.
As splitting involves multiple atomic actions, it's possible that the system
crashes while moving tuples from the bucket pages of the old bucket to the new
bucket. In such a case, after recovery, the old and new buckets will be
marked with the bucket-being-split and bucket-being-populated flags
respectively, which indicates that a split is in progress for those buckets. The reader
algorithm works correctly, as it will scan both the old and new buckets when
the split is in progress as explained in the reader algorithm section above.
We finish the split at next insert or split operation on the old bucket as
explained in insert and split algorithm above. It could be done during
searches, too, but it seems best not to put any extra updates in what would
otherwise be a read-only operation (updating is not possible in hot standby
mode anyway). It would seem natural to complete the split in VACUUM, but since
splitting a bucket might require allocating a new page, it might fail if you
run out of disk space. That would be bad during VACUUM - the reason for
running VACUUM in the first place might be that you run out of disk space,
and now VACUUM won't finish because you're out of disk space. In contrast,
an insertion can require enlarging the physical file anyway.
Deletion of tuples from a bucket is performed for two reasons: to remove dead
tuples, and to remove tuples that were moved by a bucket split. A WAL entry
is made for each bucket page from which tuples are removed, and then another
WAL entry is made when we clear the needs-split-cleanup flag. If dead tuples
are removed, a separate WAL entry is made to update the metapage.
As deletion involves multiple atomic operations, it is quite possible that the
system crashes (a) after removing tuples from some of the bucket pages, (b)
before clearing the garbage flag, or (c) before updating the metapage. If the
system crashes before completing (b), it will again try to clean the bucket
during next vacuum or insert after recovery which can have some performance
impact, but it will work fine. If the system crashes before completing (c),
after recovery there could be some additional splits until the next vacuum
updates the metapage, but the other operations like insert, delete and scan
will work correctly. We can fix this problem by actually updating the
metapage based on delete operation during replay, but it's not clear whether
it's worth the complication.
A squeeze operation moves tuples from pages later in a bucket's chain to pages
earlier in the chain, and writes a WAL record when either the page to which it
is writing tuples becomes full or the page from which it is removing tuples
becomes empty.
As a squeeze operation involves writing multiple atomic operations, it is
quite possible that the system crashes before completing the operation on the
entire bucket. After recovery, the operations will work correctly, but the
index will remain bloated, which can impact the performance of read and insert
operations until the next vacuum squeezes the bucket completely.
Other Notes
-----------
All the shenanigans with locking prevent a split occurring while *another*
process is stopped in a given bucket. They do not ensure that one of
our *own* backend's scans is not stopped in the bucket, because lmgr
doesn't consider a process's own locks to conflict. So the Split
algorithm must check for that case separately before deciding it can go
ahead with the split. VACUUM does not have this problem since nothing
else can be happening within the vacuuming backend.
Should we instead try to fix the state of any conflicting local scan?
Seems mighty ugly --- got to move the held bucket S-lock as well as lots
of other messiness. For now, just punt and don't split.
Cleanup locks prevent a split from occurring while *another* process is
stopped in a given bucket. They also ensure that one of our *own* backend's
scans is not stopped in the bucket.

View File

@ -3,8 +3,8 @@
* hash.cpp
* Implementation of Margo Seltzer's Hashing package for postgres.
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@ -20,6 +20,8 @@
#include "knl/knl_variable.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xloginsert.h"
#include "access/tableam.h"
#include "access/relscan.h"
#include "catalog/index.h"
@ -34,6 +36,7 @@
typedef struct {
HSpool *spool; /* NULL if not using spooling */
double indtuples; /* # tuples accepted into index */
Relation heapRel; /* heap relation descriptor */
} HashBuildState;
static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, const bool *isnull, bool tupleIsAlive,
@ -52,6 +55,7 @@ Datum hashbuild(PG_FUNCTION_ARGS)
double reltuples;
double allvisfrac;
uint32 num_buckets;
long sort_threshold;
HashBuildState buildstate;
/*
@ -66,7 +70,7 @@ Datum hashbuild(PG_FUNCTION_ARGS)
estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac, NULL);
/* Initialize the hash index metadata page and initial buckets */
num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);
num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
/*
* If we just insert the tuples into the index in scan order, then
* (assuming their hash codes are pretty random) there will be no locality
@ -74,25 +78,38 @@ Datum hashbuild(PG_FUNCTION_ARGS)
* then we'll thrash horribly. To prevent that scenario, we can sort the
* tuples by (expected) bucket number. However, such a sort is useless
* overhead when the index does fit in RAM. We choose to sort if the
* initial index size exceeds NBuffers.
* initial index size exceeds maintenance_work_mem, or the number of
* buffers usable for the index, whichever is less. (Limiting by the
* number of buffers should reduce thrashing between PG buffers and kernel
* buffers, which seems useful even if no physical I/O results. Limiting
* by maintenance_work_mem is useful to allow easy testing of the sort
* code path, and may be useful to DBAs as an additional control knob.)
*
* NOTE: this test will need adjustment if a bucket is ever different from
* one page.
* one page. Also, "initial index size" accounting does not include the
* metapage, nor the first bitmap page.
*/
if (num_buckets >= (uint32)g_instance.attr.attr_storage.NBuffers)
buildstate.spool = _h_spoolinit(index, num_buckets, &indexInfo->ii_desc);
sort_threshold = (u_sess->attr.attr_memory.maintenance_work_mem * 1024L) / BLCKSZ;
if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
sort_threshold = Min(sort_threshold, g_instance.attr.attr_storage.NBuffers);
else
sort_threshold = Min(sort_threshold, u_sess->storage_cxt.NLocBuffer);
if (num_buckets >= (uint32)sort_threshold)
buildstate.spool = _h_spoolinit(heap, index, num_buckets, &indexInfo->ii_desc);
else
buildstate.spool = NULL;
/* prepare to build the index */
buildstate.indtuples = 0;
buildstate.heapRel = heap;
/* do the heap scan */
reltuples = tableam_index_build_scan(heap, index, indexInfo, true, hashbuildCallback, (void*)&buildstate);
if (buildstate.spool != NULL) {
/* sort the tuples and insert them into the index */
_h_indexbuild(buildstate.spool);
_h_indexbuild(buildstate.spool, buildstate.heapRel);
_h_spooldestroy(buildstate.spool);
}
@ -114,7 +131,7 @@ Datum hashbuildempty(PG_FUNCTION_ARGS)
{
Relation index = (Relation)PG_GETARG_POINTER(0);
_hash_metapinit(index, 0, INIT_FORKNUM);
_hash_init(index, 0, INIT_FORKNUM);
PG_RETURN_VOID();
}
@ -126,21 +143,24 @@ static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, con
void *state)
{
HashBuildState *buildstate = (HashBuildState *)state;
Datum index_values[1];
bool index_isnull[1];
IndexTuple itup;
/* Hash indexes don't index nulls, see notes in hashinsert */
if (isnull[0]) {
/* convert data to a hash key; on failure, do not insert anything */
if (!_hash_convert_tuple(index,
values, isnull,
index_values, index_isnull))
return;
}
/* Either spool the tuple for sorting, or just put it into the index */
if (buildstate->spool != NULL) {
_h_spool(buildstate->spool, &htup->t_self, values, isnull);
_h_spool(buildstate->spool, &htup->t_self, index_values, index_isnull);
} else {
/* form an index tuple and point it at the heap tuple */
itup = _hash_form_tuple(index, values, isnull);
itup = index_form_tuple(RelationGetDescr(index), index_values, index_isnull);
itup->t_tid = htup->t_self;
_hash_doinsert(index, itup);
_hash_doinsert(index, itup, buildstate->heapRel);
pfree(itup);
}
@ -159,30 +179,22 @@ Datum hashinsert(PG_FUNCTION_ARGS)
Datum *values = (Datum *)PG_GETARG_POINTER(1);
bool *isnull = (bool *)PG_GETARG_POINTER(2);
ItemPointer ht_ctid = (ItemPointer)PG_GETARG_POINTER(3);
#ifdef NOT_USED
Relation heapRel = (Relation)PG_GETARG_POINTER(4);
IndexUniqueCheck checkUnique = (IndexUniqueCheck)PG_GETARG_INT32(5);
#endif
Datum index_values[1];
bool index_isnull[1];
IndexTuple itup;
/*
* If the single index key is null, we don't insert it into the index.
* Hash tables support scans on '='. Relational algebra says that A = B
* returns null if either A or B is null. This means that no
* qualification used in an index scan could ever return true on a null
* attribute. It also means that indices can't be used by ISNULL or
* NOTNULL scans, but that's an artifact of the strategy map architecture
* chosen in 1986, not of the way nulls are handled here.
*/
if (isnull[0])
PG_RETURN_BOOL(false);
/* convert data to a hash key; on failure, do not insert anything */
if (!_hash_convert_tuple(rel,
values, isnull,
index_values, index_isnull))
return false;
/* generate an index tuple */
itup = _hash_form_tuple(rel, values, isnull);
/* form an index tuple and point it at the heap tuple */
itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
itup->t_tid = *ht_ctid;
_hash_doinsert(rel, itup);
_hash_doinsert(rel, itup, heapRel);
pfree(itup);
@ -212,7 +224,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
* Reacquire the read lock here.
*/
if (BufferIsValid(so->hashso_curbuf))
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
/*
* If we've already initialized this scan, we can just advance it in the
@ -224,16 +236,21 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
/*
* An insertion into the current index page could have happened while
* we didn't have read lock on it. Re-find our position by looking
* for the TID we previously returned. (Because we hold share lock on
* the bucket, no deletions or splits could have occurred; therefore
* we can expect that the TID still exists in the current index page,
* at an offset >= where we were.)
* for the TID we previously returned. (Because we hold a pin on the
* primary bucket page, no deletions or splits could have occurred;
* therefore we can expect that the TID still exists in the current
* index page, at an offset >= where we were.)
*/
OffsetNumber maxoffnum;
buf = so->hashso_curbuf;
Assert(BufferIsValid(buf));
page = BufferGetPage(buf);
/*
* We don't need test for old snapshot here as the current buffer is
* pinned, so vacuum can't clean the page.
*/
maxoffnum = PageGetMaxOffsetNumber(page);
for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) {
IndexTuple itup;
@ -253,14 +270,22 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
*/
if (scan->kill_prior_tuple) {
/*
* Yes, so mark it by setting the LP_DEAD state in the item flags.
* Yes, so remember it for later. (We'll deal with all such tuples
* at once right after leaving the index page or at end of scan.)
* In case if caller reverses the indexscan direction it is quite
* possible that the same item might get entered multiple times.
* But, we don't detect that; instead, we just forget any excess
* entries.
*/
ItemIdMarkDead(PageGetItemId(page, offnum));
if (so->killedItems == NULL)
so->killedItems = (HashScanPosItem *)palloc(MaxIndexTuplesPerPage * sizeof(HashScanPosItem));
/*
* Since this can be redone later if needed, mark as a hint.
*/
MarkBufferDirtyHint(buf, true);
if (so->numKilled < MaxIndexTuplesPerPage) {
so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
so->killedItems[so->numKilled].indexOffset =
ItemPointerGetOffsetNumber(&(so->hashso_curpos));
so->numKilled++;
}
}
/*
@ -285,7 +310,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
/* Release read lock on current buffer, but keep it pinned */
if (BufferIsValid(so->hashso_curbuf))
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
/* Return current heap TID on success */
scan->xs_ctup.t_self = so->hashso_heappos;
@ -353,17 +378,20 @@ Datum hashbeginscan(PG_FUNCTION_ARGS)
scan = RelationGetIndexScan(rel, nkeys, norderbys);
so = (HashScanOpaque)palloc(sizeof(HashScanOpaqueData));
so->hashso_bucket_valid = false;
so->hashso_bucket_blkno = 0;
so->hashso_curbuf = InvalidBuffer;
so->hashso_bucket_buf = InvalidBuffer;
so->hashso_split_bucket_buf = InvalidBuffer;
/* set position invalid (this will cause _hash_first call) */
ItemPointerSetInvalid(&(so->hashso_curpos));
ItemPointerSetInvalid(&(so->hashso_heappos));
scan->opaque = so;
so->hashso_buc_populated = false;
so->hashso_buc_split = false;
/* register scan in case we change pages it's using */
_hash_regscan(scan);
so->killedItems = NULL;
so->numKilled = 0;
scan->opaque = so;
PG_RETURN_POINTER(scan);
}
@ -381,14 +409,13 @@ Datum hashrescan(PG_FUNCTION_ARGS)
Relation rel = scan->indexRelation;
/* release any pin we still hold */
if (BufferIsValid(so->hashso_curbuf))
_hash_dropbuf(rel, so->hashso_curbuf);
so->hashso_curbuf = InvalidBuffer;
if (so->numKilled > 0) {
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
_hash_kill_items(scan);
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
}
/* release lock on bucket, too */
if (so->hashso_bucket_blkno)
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
so->hashso_bucket_blkno = 0;
_hash_dropscanbuf(rel, so);
/* set position invalid (this will cause _hash_first call) */
ItemPointerSetInvalid(&(so->hashso_curpos));
@ -400,10 +427,11 @@ Datum hashrescan(PG_FUNCTION_ARGS)
rc = memmove_s(scan->keyData, (unsigned)scan->numberOfKeys * sizeof(ScanKeyData), scankey,
(unsigned)scan->numberOfKeys * sizeof(ScanKeyData));
securec_check(rc, "", "");
so->hashso_bucket_valid = false;
}
so->hashso_buc_populated = false;
so->hashso_buc_split = false;
PG_RETURN_VOID();
}
@ -416,18 +444,20 @@ Datum hashendscan(PG_FUNCTION_ARGS)
HashScanOpaque so = (HashScanOpaque)scan->opaque;
Relation rel = scan->indexRelation;
/* don't need scan registered anymore */
_hash_dropscan(scan);
/*
* Before leaving current page, deal with any killed items. Also, ensure
* that we acquire lock on current page before calling _hash_kill_items.
*/
if (so->numKilled > 0) {
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
_hash_kill_items(scan);
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
}
/* release any pin we still hold */
if (BufferIsValid(so->hashso_curbuf))
_hash_dropbuf(rel, so->hashso_curbuf);
so->hashso_curbuf = InvalidBuffer;
_hash_dropscanbuf(rel, so);
/* release lock on bucket, too */
if (so->hashso_bucket_blkno)
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
so->hashso_bucket_blkno = 0;
if (so->killedItems != NULL)
pfree(so->killedItems);
pfree(so);
scan->opaque = NULL;
@ -458,6 +488,9 @@ Datum hashrestrpos(PG_FUNCTION_ARGS)
* The set of target tuples is specified via a callback routine that tells
* whether any given heap tuple (identified by ItemPointer) is being deleted.
*
* This function also deletes the tuples that were moved to another bucket
* by a split.
*
* Result: a palloc'd struct containing statistical info for VACUUM displays.
*/
Datum hashbulkdelete(PG_FUNCTION_ARGS)
@ -473,29 +506,24 @@ Datum hashbulkdelete(PG_FUNCTION_ARGS)
Bucket orig_maxbucket;
Bucket cur_maxbucket;
Bucket cur_bucket;
Buffer metabuf;
Buffer metabuf = InvalidBuffer;
HashMetaPage metap;
HashMetaPageData local_metapage;
errno_t rc;
HashMetaPage cachedmetap;
tuples_removed = 0;
num_index_tuples = 0;
/*
* Read the metapage to fetch original bucket and tuple counts. Also, we
* keep a copy of the last-seen metapage so that we can use its
* hashm_spares[] values to compute bucket page addresses. This is a bit
* hokey but perfectly safe, since the interesting entries in the spares
* array cannot change under us; and it beats rereading the metapage for
* each bucket.
* We need a copy of the metapage so that we can use its hashm_spares[]
* values to compute bucket page addresses, but a cached copy should be
* good enough. (If not, we'll detect that further down and refresh the
* cache as necessary.)
*/
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
orig_maxbucket = metap->hashm_maxbucket;
orig_ntuples = metap->hashm_ntuples;
rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage));
securec_check(rc, "", "");
_hash_relbuf(rel, metabuf);
cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
Assert(cachedmetap != NULL);
orig_maxbucket = cachedmetap->hashm_maxbucket;
orig_ntuples = cachedmetap->hashm_ntuples;
/* Scan the buckets that we know exist */
cur_bucket = 0;
@ -505,90 +533,85 @@ loop_top:
while (cur_bucket <= cur_maxbucket) {
BlockNumber bucket_blkno;
BlockNumber blkno;
bool bucket_dirty = false;
Buffer bucket_buf;
Buffer buf;
HashPageOpaque bucket_opaque;
Page page;
bool split_cleanup = false;
/* Get address of bucket's start page */
bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
/* Exclusive-lock the bucket so we can shrink it */
_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);
/* Shouldn't have any active scans locally, either */
if (_hash_has_active_scan(rel, cur_bucket))
ereport(ERROR,
(errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), (errmsg("hash index has active scan during VACUUM."))));
/* Scan each page in bucket */
blkno = bucket_blkno;
while (BlockNumberIsValid(blkno)) {
Buffer buf;
Page page;
HashPageOpaque opaque;
OffsetNumber offno;
OffsetNumber maxoffno;
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
vacuum_delay_point();
/*
* We need to acquire a cleanup lock on the primary bucket page to wait
* out concurrent scans before deleting the dead tuples.
*/
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
LockBufferForCleanup(buf);
_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy);
page = BufferGetPage(buf);
opaque = (HashPageOpaque)PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == cur_bucket);
page = BufferGetPage(buf);
bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
/* Scan each tuple in page */
maxoffno = PageGetMaxOffsetNumber(page);
for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
IndexTuple itup;
ItemPointer htup;
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offno));
htup = &(itup->t_tid);
if (callback(htup, callback_state, InvalidOid)) {
/* mark the item for deletion */
deletable[ndeletable++] = offno;
tuples_removed += 1;
} else
num_index_tuples += 1;
}
/*
* If the bucket contains tuples that are moved by split, then we need
* to delete such tuples. We can't delete such tuples if the split
* operation on bucket is not finished as those are needed by scans.
*/
if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) {
split_cleanup = true;
/*
* Apply deletions and write page if needed, advance to next page.
* This bucket might have been split since we last held a lock on
* the metapage. If so, hashm_maxbucket, hashm_highmask and
* hashm_lowmask might be old enough to cause us to fail to remove
* tuples left behind by the most recent split. To prevent that,
* now that the primary page of the target bucket has been locked
* (and thus can't be further split), check whether we need to
* update our cached metapage data.
*/
blkno = opaque->hasho_nextblkno;
if (ndeletable > 0) {
PageIndexMultiDelete(page, deletable, ndeletable);
_hash_wrtbuf(rel, buf);
bucket_dirty = true;
} else
_hash_relbuf(rel, buf);
Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) {
cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
Assert(cachedmetap != NULL);
}
}
/* If we deleted anything, try to compact free space */
if (bucket_dirty)
_hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy);
bucket_buf = buf;
/* Release bucket lock */
_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);
hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
cachedmetap->hashm_maxbucket,
cachedmetap->hashm_highmask,
cachedmetap->hashm_lowmask, &tuples_removed,
&num_index_tuples, split_cleanup,
callback, callback_state);
_hash_dropbuf(rel, bucket_buf);
/* Advance to next bucket */
cur_bucket++;
}
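/*
* If the cached metapage copy was sufficient above, we never pinned the
* metapage buffer; pin it now (without lock) so we can write-lock it below.
*/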
if (BufferIsInvalid(metabuf))
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
/* Write-lock metapage and check for split since we started */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
if (cur_maxbucket != metap->hashm_maxbucket) {
/* There's been a split, so process the additional bucket(s) */
cur_maxbucket = metap->hashm_maxbucket;
rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage));
securec_check(rc, "", "");
_hash_relbuf(rel, metabuf);
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
Assert(cachedmetap != NULL);
cur_maxbucket = cachedmetap->hashm_maxbucket;
goto loop_top;
}
/* Okay, we're really done. Update tuple count in metapage. */
START_CRIT_SECTION();
if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) {
/*
* No one has split or inserted anything since start of scan, so
@ -609,7 +632,27 @@ loop_top:
num_index_tuples = metap->hashm_ntuples;
}
_hash_wrtbuf(rel, metabuf);
MarkBufferDirty(metabuf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
xl_hash_update_meta_page xlrec;
XLogRecPtr recptr;
xlrec.ntuples = metap->hashm_ntuples;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
PageSetLSN(BufferGetPage(metabuf), recptr);
}
END_CRIT_SECTION();
_hash_relbuf(rel, metabuf);
/* return statistics */
if (stats == NULL)
@ -645,9 +688,244 @@ Datum hashvacuumcleanup(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(stats);
}
void hash_redo(XLogReaderState *record)
/*
* Helper function to perform deletion of index entries from a bucket.
*
* This function expects that the caller has acquired a cleanup lock on the
* primary bucket page, and will return with a write lock again held on the
* primary bucket page. The lock won't necessarily be held continuously,
* though, because we'll release it when visiting overflow pages.
*
* It would be very bad if this function cleaned a page while some other
* backend was in the midst of scanning it, because hashgettuple assumes
* that the next valid TID will be greater than or equal to the current
* valid TID. There can't be any concurrent scans in progress when we first
* enter this function because of the cleanup lock we hold on the primary
* bucket page, but as soon as we release that lock, there might be. We
* handle that by conspiring to prevent those scans from passing our cleanup
* scan. To do that, we lock the next page in the bucket chain before
* releasing the lock on the previous page. (This type of lock chaining is
* not ideal, so we might want to look for a better solution at some point.)
*
* We need to retain a pin on the primary bucket to ensure that no concurrent
* split can start.
*/
void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
uint32 maxbucket, uint32 highmask, uint32 lowmask,
double *tuples_removed, double *num_index_tuples,
bool split_cleanup,
IndexBulkDeleteCallback callback, void *callback_state)
{
ereport(PANIC, (errmsg("hash_redo: unimplemented")));
BlockNumber blkno;
Buffer buf;
Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
bool bucket_dirty = false;
blkno = bucket_blkno;
buf = bucket_buf;
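/*
* When cleaning up after a split, compute the bucket that the moved
* tuples now map to; it is only used to cross-check the deletions below.
*/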
if (split_cleanup)
new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
lowmask, maxbucket);
/* Scan each page in bucket */
for (;;) {
HashPageOpaque opaque;
OffsetNumber offno;
OffsetNumber maxoffno;
Buffer next_buf;
Page page;
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
bool retain_pin = false;
bool clear_dead_marking = false;
vacuum_delay_point();
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
/* Scan each tuple in page */
maxoffno = PageGetMaxOffsetNumber(page);
for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
ItemPointer htup;
IndexTuple itup;
Bucket bucket;
bool kill_tuple = false;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno));
htup = &(itup->t_tid);
/*
* To remove the dead tuples, we strictly want to rely on the results
* of the callback function; refer to btvacuumpage for the detailed reason.
*/
if (callback && callback(htup, callback_state, InvalidOid)) {
kill_tuple = true;
if (tuples_removed)
*tuples_removed += 1;
} else if (split_cleanup) {
/* delete the tuples that are moved by split. */
bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
maxbucket, highmask, lowmask);
/* mark the item for deletion */
if (bucket != cur_bucket) {
/*
* We expect tuples to either belong to current bucket or
* new_bucket. This is ensured because we don't allow
* further splits from bucket that contains garbage. See
* comments in _hash_expandtable.
*/
Assert(bucket == new_bucket);
kill_tuple = true;
}
}
if (kill_tuple) {
/* mark the item for deletion */
deletable[ndeletable++] = offno;
} else {
/* we're keeping it, so count it */
if (num_index_tuples)
*num_index_tuples += 1;
}
}
/* retain the pin on primary bucket page till end of bucket scan */
if (blkno == bucket_blkno)
retain_pin = true;
else
retain_pin = false;
blkno = opaque->hasho_nextblkno;
/*
* Apply deletions, advance to next page and write page if needed.
*/
if (ndeletable > 0) {
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
PageIndexMultiDelete(page, deletable, ndeletable);
bucket_dirty = true;
/*
* Let us mark the page as clean if vacuum removes the DEAD tuples
* from an index page. We do this by clearing
* LH_PAGE_HAS_DEAD_TUPLES flag.
*/
if (tuples_removed && *tuples_removed > 0 && H_HAS_DEAD_TUPLES(opaque)) {
opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
clear_dead_marking = true;
}
MarkBufferDirty(buf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
xl_hash_delete xlrec;
XLogRecPtr recptr;
xlrec.clear_dead_marking = clear_dead_marking;
xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
/*
* bucket buffer needs to be registered to ensure that we can
* acquire a cleanup lock on it during replay.
*/
if (!xlrec.is_primary_bucket_page) {
XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
}
XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber));
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
if (!xlrec.is_primary_bucket_page) {
PageSetLSN(BufferGetPage(bucket_buf), recptr);
}
PageSetLSN(BufferGetPage(buf), recptr);
}
END_CRIT_SECTION();
}
/* bail out if there are no more pages to scan. */
if (!BlockNumberIsValid(blkno))
break;
next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
LH_OVERFLOW_PAGE,
bstrategy);
/*
* release the lock on previous page after acquiring the lock on next
* page
*/
if (retain_pin)
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
else
_hash_relbuf(rel, buf);
buf = next_buf;
}
/*
* Lock the bucket page to clear the garbage flag and squeeze the bucket.
* If the current buffer is the same as the bucket buffer, then we already
* have a lock on the bucket page.
*/
if (buf != bucket_buf) {
_hash_relbuf(rel, buf);
LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
}
/*
* Clear the garbage flag from bucket after deleting the tuples that are
* moved by split. We purposefully clear the flag before squeeze bucket,
* so that after restart, vacuum shouldn't again try to delete the moved
* by split tuples.
*/
if (split_cleanup) {
HashPageOpaque bucket_opaque;
Page page;
page = BufferGetPage(bucket_buf);
bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
MarkBufferDirty(bucket_buf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
XLogRecPtr recptr;
XLogBeginInsert();
XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
}
/*
* If we have deleted anything, try to compact free space. For squeezing
* the bucket, we must have a cleanup lock, else it can impact the
* ordering of tuples for a scan that has started before it.
*/
if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy);
else
LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}
Datum hashmerge(PG_FUNCTION_ARGS)

View File

@ -0,0 +1,861 @@
/* -------------------------------------------------------------------------
*
* hash_xlog.cpp
* WAL replay logic for hash index.
*
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/gausskernel/storage/access/hash/hash_xlog.cpp
*
* -------------------------------------------------------------------------
*/
#include "access/xlogproc.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "access/transam.h"
#include "access/xlogproc.h"
#include "storage/procarray.h"
#include "miscadmin.h"
/*
* replay a hash index meta page
*/
static void hash_xlog_init_meta_page(XLogReaderState *record)
{
RedoBufferInfo metabuf;
ForkNumber forknum;
/* create the index's metapage */
XLogInitBufferForRedo(record, 0, &metabuf);
Assert(BufferIsValid(metabuf.buf));
HashRedoInitMetaPageOperatorPage(&metabuf, XLogRecGetData(record));
MarkBufferDirty(metabuf.buf);
/*
* Force the on-disk state of init forks to always be in sync with the
* state in shared buffers. See XLogReadBufferForRedoExtended. We need
* special handling for init forks as create index operations don't log a
* full page image of the metapage.
*/
XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
if (forknum == INIT_FORKNUM)
FlushOneBuffer(metabuf.buf);
/* all done */
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay a hash index bitmap page
*/
static void hash_xlog_init_bitmap_page(XLogReaderState *record)
{
RedoBufferInfo bitmapbuf;
RedoBufferInfo metabuf;
ForkNumber forknum;
/*
* Initialize bitmap page
*/
XLogInitBufferForRedo(record, 0, &bitmapbuf);
HashRedoInitBitmapPageOperatorBitmapPage(&bitmapbuf, XLogRecGetData(record));
MarkBufferDirty(bitmapbuf.buf);
/*
* Force the on-disk state of init forks to always be in sync with the
* state in shared buffers. See XLogReadBufferForRedoExtended. We need
* special handling for init forks as create index operations don't log a
* full page image of the metapage.
*/
XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
if (forknum == INIT_FORKNUM)
FlushOneBuffer(bitmapbuf.buf);
UnlockReleaseBuffer(bitmapbuf.buf);
/* add the new bitmap page to the metapage's list of bitmaps */
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
/*
* Note: in normal operation, we'd update the metapage while still
* holding lock on the bitmap page. But during replay it's not
* necessary to hold that lock, since nobody can see it yet; the
* creating transaction hasn't yet committed.
*/
HashRedoInitBitmapPageOperatorMetaPage(&metabuf);
MarkBufferDirty(metabuf.buf);
XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL);
if (forknum == INIT_FORKNUM)
FlushOneBuffer(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay a hash index insert without split
*/
static void hash_xlog_insert(XLogReaderState *record)
{
RedoBufferInfo buffer;
RedoBufferInfo metabuf;
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) {
Size datalen;
char *datapos = XLogRecGetBlockData(record, 0, &datalen);
HashRedoInsertOperatorPage(&buffer, XLogRecGetData(record), datapos, datalen);
MarkBufferDirty(buffer.buf);
}
if (BufferIsValid(buffer.buf))
UnlockReleaseBuffer(buffer.buf);
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
/*
* Note: in normal operation, we'd update the metapage while still
* holding lock on the page we inserted into. But during replay it's
* not necessary to hold that lock, since no other index updates can
* be happening concurrently.
*/
HashRedoInsertOperatorMetaPage(&metabuf);
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay addition of overflow page for hash index
*/
static void hash_xlog_add_ovfl_page(XLogReaderState* record)
{
RedoBufferInfo leftbuf;
RedoBufferInfo ovflbuf;
RedoBufferInfo metabuf;
BlockNumber leftblk;
BlockNumber rightblk;
char *data = NULL;
Size datalen;
XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk);
XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk);
XLogInitBufferForRedo(record, 0, &ovflbuf);
Assert(BufferIsValid(ovflbuf.buf));
data = XLogRecGetBlockData(record, 0, &datalen);
HashRedoAddOvflPageOperatorOvflPage(&ovflbuf, leftblk, data, datalen);
MarkBufferDirty(ovflbuf.buf);
if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) {
HashRedoAddOvflPageOperatorLeftPage(&leftbuf, rightblk);
MarkBufferDirty(leftbuf.buf);
}
if (BufferIsValid(leftbuf.buf))
UnlockReleaseBuffer(leftbuf.buf);
UnlockReleaseBuffer(ovflbuf.buf);
/*
* Note: in normal operation, we'd update the bitmap and meta page while
* still holding lock on the overflow pages. But during replay it's not
* necessary to hold those locks, since no other index updates can be
* happening concurrently.
*/
if (XLogRecHasBlockRef(record, 2)) {
RedoBufferInfo mapbuffer;
if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) {
data = XLogRecGetBlockData(record, 2, &datalen);
HashRedoAddOvflPageOperatorMapPage(&mapbuffer, data);
MarkBufferDirty(mapbuffer.buf);
}
if (BufferIsValid(mapbuffer.buf))
UnlockReleaseBuffer(mapbuffer.buf);
}
if (XLogRecHasBlockRef(record, 3)) {
RedoBufferInfo newmapbuf;
XLogInitBufferForRedo(record, 3, &newmapbuf);
HashRedoAddOvflPageOperatorNewmapPage(&newmapbuf, XLogRecGetData(record));
MarkBufferDirty(newmapbuf.buf);
UnlockReleaseBuffer(newmapbuf.buf);
}
if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) {
data = XLogRecGetBlockData(record, 4, &datalen);
HashRedoAddOvflPageOperatorMetaPage(&metabuf, XLogRecGetData(record), data, datalen);
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay allocation of page for split operation
*/
static void hash_xlog_split_allocate_page(XLogReaderState *record)
{
RedoBufferInfo oldbuf;
RedoBufferInfo newbuf;
RedoBufferInfo metabuf;
Size datalen PG_USED_FOR_ASSERTS_ONLY;
char *data = NULL;
XLogRedoAction action;
/*
* To be consistent with normal operation, here we take cleanup locks on
* both the old and new buckets even though there can't be any concurrent
* inserts.
*/
/* replay the record for old bucket */
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf);
/*
* Note that we still update the page even if it was restored from a full
* page image, because the special space is not included in the image.
*/
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
HashRedoSplitAllocatePageOperatorObukPage(&oldbuf, XLogRecGetData(record));
MarkBufferDirty(oldbuf.buf);
}
/* replay the record for new bucket */
XLogInitBufferForRedo(record, 1, &newbuf);
HashRedoSplitAllocatePageOperatorNbukPage(&newbuf, XLogRecGetData(record));
if (!IsBufferCleanupOK(newbuf.buf))
elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock");
MarkBufferDirty(newbuf.buf);
/*
* We could release the lock on the old bucket early as well, but we do it
* here to be consistent with normal operation.
*/
if (BufferIsValid(oldbuf.buf))
UnlockReleaseBuffer(oldbuf.buf);
if (BufferIsValid(newbuf.buf))
UnlockReleaseBuffer(newbuf.buf);
/*
* Note: in normal operation, we'd update the meta page while still
* holding lock on the old and new bucket pages. But during replay it's
* not necessary to hold those locks, since no other bucket splits can be
* happening concurrently.
*/
/* replay the record for metapage changes */
if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) {
data = XLogRecGetBlockData(record, 2, &datalen);
HashRedoSplitAllocatePageOperatorMetaPage(&metabuf, XLogRecGetData(record), data);
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* replay of split operation
*/
static void hash_xlog_split_page(XLogReaderState *record)
{
RedoBufferInfo buf;
if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED)
elog(ERROR, "Hash split record did not contain a full-page image");
if (BufferIsValid(buf.buf))
UnlockReleaseBuffer(buf.buf);
}
/*
* replay completion of split operation
*/
static void hash_xlog_split_complete(XLogReaderState *record)
{
RedoBufferInfo oldbuf;
RedoBufferInfo newbuf;
XLogRedoAction action;
/* replay the record for old bucket */
action = XLogReadBufferForRedo(record, 0, &oldbuf);
/*
* Note that we still update the page even if it was restored from a full
* page image, because the bucket flag is not included in the image.
*/
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
HashRedoSplitCompleteOperatorObukPage(&oldbuf, XLogRecGetData(record));
MarkBufferDirty(oldbuf.buf);
}
if (BufferIsValid(oldbuf.buf))
UnlockReleaseBuffer(oldbuf.buf);
/* replay the record for new bucket */
action = XLogReadBufferForRedo(record, 1, &newbuf);
/*
* Note that we still update the page even if it was restored from a full
* page image, because the bucket flag is not included in the image.
*/
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
HashRedoSplitCompleteOperatorNbukPage(&newbuf, XLogRecGetData(record));
MarkBufferDirty(newbuf.buf);
}
if (BufferIsValid(newbuf.buf))
UnlockReleaseBuffer(newbuf.buf);
}
/*
* replay move of page contents for squeeze operation of hash index
*/
static void hash_xlog_move_page_contents(XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record);
RedoBufferInfo bucketbuf;
RedoBufferInfo writebuf;
RedoBufferInfo deletebuf;
XLogRedoAction action;
bucketbuf.buf = InvalidBuffer;
writebuf.buf = InvalidBuffer;
deletebuf.buf = InvalidBuffer;
/*
* Ensure we have a cleanup lock on primary bucket page before we start
* with the actual replay operation. This is to ensure that neither a
* scan can start nor a scan can be already-in-progress during the replay
* of this operation. If we allow scans during this operation, then they
* can miss some records or show the same record multiple times.
*/
if (xldata->is_prim_bucket_same_wrt) {
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
} else {
/*
* we don't care about the return value, as the purpose of reading
* bucketbuf is to ensure a cleanup lock on the primary bucket page.
*/
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
PageSetLSN(bucketbuf.pageinfo.page, lsn);
action = XLogReadBufferForRedo(record, 1, &writebuf);
}
/* replay the record for adding entries in overflow buffer */
if (action == BLK_NEEDS_REDO) {
char *data = NULL;
Size datalen;
data = XLogRecGetBlockData(record, 1, &datalen);
HashXlogMoveAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen);
MarkBufferDirty(writebuf.buf);
}
/* replay the record for deleting entries from overflow buffer */
if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) {
char *ptr = NULL;
Size len;
ptr = XLogRecGetBlockData(record, 2, &len);
HashXlogMoveDeleteOvflPageOperatorPage(&deletebuf, (void *)ptr, len);
MarkBufferDirty(deletebuf.buf);
}
/*
* Replay is complete, now we can release the buffers. We release locks at
* end of replay operation to ensure that we hold lock on primary bucket
* page till end of operation. We can optimize by releasing the lock on
* write buffer as soon as the operation for same is complete, if it is
* not same as primary bucket page, but that doesn't seem to be worth
* complicating the code.
*/
if (BufferIsValid(deletebuf.buf))
UnlockReleaseBuffer(deletebuf.buf);
if (BufferIsValid(writebuf.buf))
UnlockReleaseBuffer(writebuf.buf);
if (BufferIsValid(bucketbuf.buf))
UnlockReleaseBuffer(bucketbuf.buf);
}
/*
* replay squeeze page operation of hash index
*/
static void hash_xlog_squeeze_page(XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record);
RedoBufferInfo bucketbuf;
RedoBufferInfo writebuf;
RedoBufferInfo ovflbuf;
RedoBufferInfo prevbuf;
RedoBufferInfo mapbuf;
XLogRedoAction action;
bucketbuf.buf = InvalidBuffer;
prevbuf.buf = InvalidBuffer;
/*
* Ensure we have a cleanup lock on primary bucket page before we start
* with the actual replay operation. This is to ensure that neither a
* scan can start nor a scan can be already-in-progress during the replay
* of this operation. If we allow scans during this operation, then they
* can miss some records or show the same record multiple times.
*/
if (xldata->is_prim_bucket_same_wrt) {
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
} else {
/*
* we don't care about the return value, as the purpose of reading
* bucketbuf is to ensure a cleanup lock on the primary bucket page.
*/
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
PageSetLSN(bucketbuf.pageinfo.page, lsn);
action = XLogReadBufferForRedo(record, 1, &writebuf);
}
/* replay the record for adding entries in overflow buffer */
if (action == BLK_NEEDS_REDO) {
char *data = NULL;
Size datalen;
data = XLogRecGetBlockData(record, 1, &datalen);
HashXlogSqueezeAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen);
MarkBufferDirty(writebuf.buf);
}
/* replay the record for initializing overflow buffer */
if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) {
HashXlogSqueezeInitOvflbufOperatorPage(&ovflbuf, XLogRecGetData(record));
MarkBufferDirty(ovflbuf.buf);
}
if (BufferIsValid(ovflbuf.buf))
UnlockReleaseBuffer(ovflbuf.buf);
/* replay the record for page previous to the freed overflow page */
if (!xldata->is_prev_bucket_same_wrt &&
XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) {
HashXlogSqueezeUpdatePrevPageOperatorPage(&prevbuf, XLogRecGetData(record));
MarkBufferDirty(prevbuf.buf);
}
if (BufferIsValid(prevbuf.buf))
UnlockReleaseBuffer(prevbuf.buf);
/* replay the record for page next to the freed overflow page */
if (XLogRecHasBlockRef(record, 4)) {
RedoBufferInfo nextbuf;
if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) {
HashXlogSqueezeUpdateNextPageOperatorPage(&nextbuf, XLogRecGetData(record));
MarkBufferDirty(nextbuf.buf);
}
if (BufferIsValid(nextbuf.buf))
UnlockReleaseBuffer(nextbuf.buf);
}
if (BufferIsValid(writebuf.buf))
UnlockReleaseBuffer(writebuf.buf);
if (BufferIsValid(bucketbuf.buf))
UnlockReleaseBuffer(bucketbuf.buf);
/*
* Note: in normal operation, we'd update the bitmap and meta page while
* still holding lock on the primary bucket page and overflow pages. But
* during replay it's not necessary to hold those locks, since no other
* index updates can be happening concurrently.
*/
/* replay the record for bitmap page */
if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) {
char *data = NULL;
Size datalen;
data = XLogRecGetBlockData(record, 5, &datalen);
HashXlogSqueezeUpdateBitmapOperatorPage(&mapbuf, (void *)data);
MarkBufferDirty(mapbuf.buf);
}
if (BufferIsValid(mapbuf.buf))
UnlockReleaseBuffer(mapbuf.buf);
/* replay the record for meta page */
if (XLogRecHasBlockRef(record, 6)) {
RedoBufferInfo metabuf;
if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) {
char *data = NULL;
Size datalen;
data = XLogRecGetBlockData(record, 6, &datalen);
HashXlogSqueezeUpdateMateOperatorPage(&metabuf, (void *)data);
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
}
/*
* replay delete operation of hash index
*/
static void hash_xlog_delete(XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record);
RedoBufferInfo bucketbuf;
RedoBufferInfo deletebuf;
XLogRedoAction action;
bucketbuf.buf = InvalidBuffer;
/*
* Ensure we have a cleanup lock on primary bucket page before we start
* with the actual replay operation. This is to ensure that neither a
* scan can start nor a scan can be already-in-progress during the replay
* of this operation. If we allow scans during this operation, then they
* can miss some records or show the same record multiple times.
*/
if (xldata->is_primary_bucket_page) {
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf);
} else {
/*
* we don't care about the return value, as the purpose of reading
* bucketbuf is to ensure a cleanup lock on the primary bucket page.
*/
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
PageSetLSN(bucketbuf.pageinfo.page, lsn);
action = XLogReadBufferForRedo(record, 1, &deletebuf);
}
/* replay the record for deleting entries in bucket page */
if (action == BLK_NEEDS_REDO) {
char *ptr = NULL;
Size len;
ptr = XLogRecGetBlockData(record, 1, &len);
HashXlogDeleteBlockOperatorPage(&deletebuf, XLogRecGetData(record), (void *)ptr, len);
MarkBufferDirty(deletebuf.buf);
}
if (BufferIsValid(deletebuf.buf))
UnlockReleaseBuffer(deletebuf.buf);
if (BufferIsValid(bucketbuf.buf))
UnlockReleaseBuffer(bucketbuf.buf);
}
/*
* replay split cleanup flag operation for primary bucket page.
*/
static void hash_xlog_split_cleanup(XLogReaderState *record)
{
RedoBufferInfo buffer;
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) {
HashXlogSplitCleanupOperatorPage(&buffer);
MarkBufferDirty(buffer.buf);
}
if (BufferIsValid(buffer.buf))
UnlockReleaseBuffer(buffer.buf);
}
/*
* replay for update meta page
*/
static void hash_xlog_update_meta_page(XLogReaderState *record)
{
RedoBufferInfo metabuf;
if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) {
HashXlogUpdateMetaOperatorPage(&metabuf, XLogRecGetData(record));
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
/*
* Get the latestRemovedXid from the heap pages pointed at by the index
* tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
* on which this function is based.
*/
static TransactionId hash_xlog_vacuum_get_latestRemovedXid(XLogReaderState *record)
{
xl_hash_vacuum_one_page *xlrec;
OffsetNumber *unused = NULL;
Buffer ibuffer;
Buffer hbuffer;
Page ipage;
Page hpage;
RelFileNode rnode;
BlockNumber blkno;
ItemId iitemid;
ItemId hitemid;
IndexTuple itup;
BlockNumber hblkno;
OffsetNumber hoffnum;
TransactionId latestRemovedXid = InvalidTransactionId;
int i;
xlrec = (xl_hash_vacuum_one_page *) XLogRecGetData(record);
/*
* If there's nothing running on the standby we don't need to derive a
* full latestRemovedXid value, so use a fast path out of here. This
* returns InvalidTransactionId, and so will conflict with all HS
* transactions; but since we just worked out that that's zero people,
* it's OK.
*
* XXX There is a race condition here, which is that a new backend might
* start just after we look. If so, it cannot need to conflict, but this
* coding will result in throwing a conflict anyway.
*/
if (CountDBBackends(InvalidOid) == 0)
return latestRemovedXid;
/*
* Check if WAL replay has reached a consistent database state. If not, we
* must PANIC. See the definition of
* btree_xlog_delete_get_latestRemovedXid for more details.
*/
if (!t_thrd.xlog_cxt.reachedConsistency)
elog(PANIC, "hash_xlog_vacuum_get_latestRemovedXid: cannot operate with inconsistent data");
/*
* Get index page. If the DB is consistent, this should not fail, nor
* should any of the heap page fetches below. If one does, we return
* InvalidTransactionId to cancel all HS transactions. That's probably
* overkill, but it's safe, and certainly better than panicking here.
*/
XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
if (!BufferIsValid(ibuffer))
return InvalidTransactionId;
LockBuffer(ibuffer, HASH_READ);
ipage = (Page) BufferGetPage(ibuffer);
/*
* Loop through the deleted index items to obtain the TransactionId from
* the heap items they point to.
*/
unused = (OffsetNumber *) ((char *) xlrec + SizeOfHashVacuumOnePage);
for (i = 0; i < xlrec->ntuples; i++) {
/*
* Identify the index tuple about to be deleted.
*/
iitemid = PageGetItemId(ipage, unused[i]);
itup = (IndexTuple) PageGetItem(ipage, iitemid);
/*
* Locate the heap page that the index tuple points at
*/
hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
if (!BufferIsValid(hbuffer)) {
UnlockReleaseBuffer(ibuffer);
return InvalidTransactionId;
}
LockBuffer(hbuffer, HASH_READ);
hpage = (Page) BufferGetPage(hbuffer);
/*
* Look up the heap tuple header that the index tuple points at by
* using the heap node supplied with the xlrec. We can't use
* heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
* Note that we are not looking at tuple data here, just headers.
*/
hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
hitemid = PageGetItemId(hpage, hoffnum);
/*
* Follow any redirections until we find something useful.
*/
while (ItemIdIsRedirected(hitemid)) {
hoffnum = ItemIdGetRedirect(hitemid);
hitemid = PageGetItemId(hpage, hoffnum);
CHECK_FOR_INTERRUPTS();
}
/*
* If the heap item has storage, then read the header and use that to
* set latestRemovedXid.
*
* Some LP_DEAD items may not be accessible, so we ignore them.
*/
if (ItemIdHasStorage(hitemid)) {
HeapTupleData tuple;
tuple.t_data = (HeapTupleHeader) PageGetItem(hpage, hitemid);
HeapTupleCopyBaseFromPage(&tuple, &hpage);
HeapTupleHeaderAdvanceLatestRemovedXid(&tuple, &latestRemovedXid);
} else if (ItemIdIsDead(hitemid)) {
/*
* Conjecture: if hitemid is dead then it had xids before the xids
* marked on LP_NORMAL items. So we just ignore this item and move
* onto the next, for the purposes of calculating
* latestRemovedxids.
*/
} else
Assert(!ItemIdIsUsed(hitemid));
UnlockReleaseBuffer(hbuffer);
}
UnlockReleaseBuffer(ibuffer);
/*
* If all heap tuples were LP_DEAD then we will be returning
* InvalidTransactionId here, which avoids conflicts. This matches
* existing logic which assumes that LP_DEAD tuples must already be older
* than the latestRemovedXid on the cleanup record that set them as
* LP_DEAD, hence must already have generated a conflict.
*/
return latestRemovedXid;
}
/*
* replay delete operation in hash index to remove
* tuples marked as DEAD during index tuple insertion.
*/
static void hash_xlog_vacuum_one_page(XLogReaderState *record)
{
RedoBufferInfo buffer;
RedoBufferInfo metabuf;
XLogRedoAction action;
/*
* If we have any conflict processing to do, it must happen before we
* update the page.
*
* Hash index records that are marked as LP_DEAD and being removed during
* hash index tuple insertion can conflict with standby queries. You might
* think that vacuum records would conflict as well, but we've handled
* that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
* cleaned by the vacuum of the heap and so we can resolve any conflicts
* just once when that arrives. After that we know that no conflicts
* exist from individual hash index vacuum records on that index.
*/
if (InHotStandby) {
TransactionId latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record);
RelFileNode rnode;
XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
}
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer);
if (action == BLK_NEEDS_REDO) {
Size len;
len = XLogRecGetDataLen(record);
HashXlogVacuumOnePageOperatorPage(&buffer, XLogRecGetData(record), len);
MarkBufferDirty(buffer.buf);
}
if (BufferIsValid(buffer.buf))
UnlockReleaseBuffer(buffer.buf);
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
HashXlogVacuumMateOperatorPage(&metabuf, XLogRecGetData(record));
MarkBufferDirty(metabuf.buf);
}
if (BufferIsValid(metabuf.buf))
UnlockReleaseBuffer(metabuf.buf);
}
void hash_redo(XLogReaderState *record)
{
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
switch (info) {
case XLOG_HASH_INIT_META_PAGE:
hash_xlog_init_meta_page(record);
break;
case XLOG_HASH_INIT_BITMAP_PAGE:
hash_xlog_init_bitmap_page(record);
break;
case XLOG_HASH_INSERT:
hash_xlog_insert(record);
break;
case XLOG_HASH_ADD_OVFL_PAGE:
hash_xlog_add_ovfl_page(record);
break;
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
hash_xlog_split_allocate_page(record);
break;
case XLOG_HASH_SPLIT_PAGE:
hash_xlog_split_page(record);
break;
case XLOG_HASH_SPLIT_COMPLETE:
hash_xlog_split_complete(record);
break;
case XLOG_HASH_MOVE_PAGE_CONTENTS:
hash_xlog_move_page_contents(record);
break;
case XLOG_HASH_SQUEEZE_PAGE:
hash_xlog_squeeze_page(record);
break;
case XLOG_HASH_DELETE:
hash_xlog_delete(record);
break;
case XLOG_HASH_SPLIT_CLEANUP:
hash_xlog_split_cleanup(record);
break;
case XLOG_HASH_UPDATE_META_PAGE:
hash_xlog_update_meta_page(record);
break;
case XLOG_HASH_VACUUM_ONE_PAGE:
hash_xlog_vacuum_one_page(record);
break;
default:
elog(PANIC, "hash_redo: unknown op code %u", info);
}
}
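/*
* Report whether a WAL record is a hash index XLOG_HASH_DELETE record,
* i.e. a page-level vacuum of a hash bucket or overflow page.
*/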
bool IsHashVacuumPages(XLogReaderState *record)
{
uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
if (XLogRecGetRmid(record) == RM_HASH_ID) {
if (info == XLOG_HASH_DELETE) {
return true;
}
}
return false;
}

View File

@ -3,8 +3,8 @@
* hashinsert.cpp
* Item insertion in hash tables for Postgres.
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@ -17,21 +17,30 @@
#include "knl/knl_variable.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "storage/lock/lwlock.h"
#include "storage/buf/buf_internals.h"
static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode);
/*
* _hash_doinsert() -- Handle insertion of a single index tuple.
*
* This routine is called by the public interface routines, hashbuild
* and hashinsert. By here, itup is completely filled in.
* This routine is called by the public interface routines, hashbuild
* and hashinsert. By here, itup is completely filled in.
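*
* 'heapRel' is the heap relation the index is built on; its RelFileNode is
* recorded in WAL when dead index entries are vacuumed during insertion
* (see _hash_vacuum_one_page).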
*/
void _hash_doinsert(Relation rel, IndexTuple itup)
void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
Buffer buf;
Buffer bucket_buf;
Buffer metabuf;
HashMetaPage metap;
BlockNumber blkno;
HashMetaPage usedmetap = NULL;
Page metapage;
Page page;
HashPageOpaque pageopaque;
@ -39,7 +48,7 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
bool do_expand = false;
uint32 hashkey;
Bucket bucket;
OffsetNumber itup_off;
/*
* Get the hash key for the item (it's stored in the index tuple itself).
*/
@ -49,16 +58,16 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
itemsz = IndexTupleDSize(*itup);
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
* need to be consistent */
/*
* Acquire shared split lock so we can compute the target bucket safely
* (see README).
*/
_hash_getlock(rel, 0, HASH_SHARE);
/* Read the metapage */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
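/*
* If the target bucket turns out to be in the middle of a split, we finish
* the split, drop our pins, and come back here to retry the insertion.
*/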
restart_insert:
/*
* Read the metapage. We don't lock it yet; HashMaxItemSize() will
* examine pd_pagesize_version, but that can't change so we can examine it
* without a lock.
*/
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
metapage = BufferGetPage(metabuf);
metap = HashPageGetMeta(metapage);
/*
* Check whether the item can fit on a hash page at all. (Eventually, we
@ -73,87 +82,154 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
(unsigned long)HashMaxItemSize(metapage)),
errhint("Values larger than a buffer page cannot be indexed.")));
/*
* Compute the target bucket number, and convert to block number.
*/
bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask);
/* Lock the primary bucket page for the target bucket. */
buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE, &usedmetap);
Assert(usedmetap != NULL);
blkno = BUCKET_TO_BLKNO(metap, bucket);
/* remember the primary bucket buffer to release the pin on it at end. */
bucket_buf = buf;
/* release lock on metapage, but keep pin since we'll need it again */
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
/*
* Acquire share lock on target bucket; then we can release split lock.
*/
_hash_getlock(rel, blkno, HASH_SHARE);
_hash_droplock(rel, 0, HASH_SHARE);
/* Fetch the primary bucket page for the bucket */
buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
page = BufferGetPage(buf);
pageopaque = (HashPageOpaque)PageGetSpecialPointer(page);
Assert(pageopaque->hasho_bucket == bucket);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
bucket = pageopaque->hasho_bucket;
/*
* If this bucket is in the process of being split, try to finish the
* split before inserting, because that might create room for the
* insertion to proceed without allocating an additional overflow page.
* It's only interesting to finish the split if we're trying to insert
* into the bucket from which we're removing tuples (the "old" bucket),
* not if we're trying to insert into the bucket into which tuples are
* being moved (the "new" bucket).
*/
if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf)) {
/* release the lock on bucket buffer, before completing the split. */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
_hash_finish_split(rel, metabuf, buf, bucket,
usedmetap->hashm_maxbucket,
usedmetap->hashm_highmask,
usedmetap->hashm_lowmask);
/* release the pin on old and meta buffer. retry for insert. */
_hash_dropbuf(rel, buf);
_hash_dropbuf(rel, metabuf);
goto restart_insert;
}
/* Do the insertion */
while (PageGetFreeSpace(page) < itemsz) {
BlockNumber nextblkno;
/*
* Check if current page has any DEAD tuples. If yes, delete these
* tuples and see if we can get a space for the new item to be
* inserted before moving to the next page in the bucket chain.
*/
if (H_HAS_DEAD_TUPLES(pageopaque)) {
if (IsBufferCleanupOK(buf)) {
_hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);
if (PageGetFreeSpace(page) >= itemsz)
break; /* OK, now we have enough space */
}
}
/*
* no space on this page; check for an overflow page
*/
BlockNumber nextblkno = pageopaque->hasho_nextblkno;
nextblkno = pageopaque->hasho_nextblkno;
if (BlockNumberIsValid(nextblkno)) {
/*
* ovfl page exists; go get it. if it doesn't have room, we'll
* find out next pass through the loop test above.
* find out next pass through the loop test above. we always
* release both the lock and pin if this is an overflow page, but
* only the lock if this is the primary bucket page, since the pin
* on the primary bucket must be retained throughout the scan.
*/
_hash_relbuf(rel, buf);
if (buf != bucket_buf)
_hash_relbuf(rel, buf);
else
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
page = BufferGetPage(buf);
} else {
/*
* we're at the end of the bucket chain and we haven't found a
* page with enough room. allocate a new overflow page.
*
* release our write lock without modifying buffer
*/
_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
/* release our write lock without modifying buffer */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
/* chain to a new overflow page */
buf = _hash_addovflpage(rel, metabuf, buf);
buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
page = BufferGetPage(buf);
/* should fit now, given test above */
Assert(PageGetFreeSpace(page) >= itemsz);
}
pageopaque = (HashPageOpaque)PageGetSpecialPointer(page);
Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
Assert(pageopaque->hasho_bucket == bucket);
}
/* found page with enough space, so add the item here */
(void)_hash_pgaddtup(rel, buf, itemsz, itup);
/* write and release the modified page */
_hash_wrtbuf(rel, buf);
/* We can drop the bucket lock now */
_hash_droplock(rel, blkno, HASH_SHARE);
/*
* Write-lock the metapage so we can increment the tuple count. After
* incrementing it, check to see if it's time for a split.
*/
_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
/* Do the update. No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
/* found page with enough space, so add the item here */
itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
MarkBufferDirty(buf);
/* metapage operations */
metap = HashPageGetMeta(metapage);
metap->hashm_ntuples += 1;
/* Make sure this stays in sync with _hash_expandtable() */
do_expand = metap->hashm_ntuples > (double)metap->hashm_ffactor * (metap->hashm_maxbucket + 1);
/* Write out the metapage and drop lock, but keep pin */
_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
MarkBufferDirty(metabuf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
xl_hash_insert xlrec;
XLogRecPtr recptr;
xlrec.offnum = itup_off;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHashInsert);
XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);
PageSetLSN(BufferGetPage(buf), recptr);
PageSetLSN(BufferGetPage(metabuf), recptr);
}
END_CRIT_SECTION();
/* drop lock on metapage, but keep pin */
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
/*
* Release the modified page and ensure to release the pin on primary
* page.
*/
_hash_relbuf(rel, buf);
if (buf != bucket_buf)
_hash_dropbuf(rel, bucket_buf);
/* Attempt to split if a split is needed */
if (do_expand)
@ -192,3 +268,130 @@ OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple
return itup_off;
}
/*
* _hash_pgaddmultitup() -- add a tuple vector to a particular page in the index.
*
* This routine has the same requirements for locking and tuple ordering as
* _hash_pgaddtup().
*
* The offset numbers at which the tuples were inserted are returned via the
* itup_offsets array.
*/
void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, OffsetNumber *itup_offsets, uint16 nitups)
{
OffsetNumber itup_off;
Page page;
uint32 hashkey;
int i;
_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
page = BufferGetPage(buf);
for (i = 0; i < nitups; i++) {
Size itemsize;
itemsize = IndexTupleDSize(*itups[i]);
itemsize = MAXALIGN(itemsize);
/* Find where to insert the tuple (preserving page's hashkey ordering) */
hashkey = _hash_get_indextuple_hashkey(itups[i]);
itup_off = _hash_binsearch(page, hashkey);
itup_offsets[i] = itup_off;
if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel));
}
}
/*
* _hash_vacuum_one_page - vacuum just one index page.
*
* Try to remove LP_DEAD items from the given page. We must acquire cleanup
* lock on the page being modified before calling this function.
*/
static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode)
{
OffsetNumber deletable[MaxOffsetNumber];
int ndeletable = 0;
OffsetNumber offnum;
OffsetNumber maxoff;
Page page = BufferGetPage(buf);
HashPageOpaque pageopaque;
HashMetaPage metap;
/* Scan each tuple in page to see if it is marked as LP_DEAD */
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) {
ItemId itemId = PageGetItemId(page, offnum);
if (ItemIdIsDead(itemId))
deletable[ndeletable++] = offnum;
}
if (ndeletable > 0) {
/*
* Write-lock the meta page so that we can decrement tuple count.
*/
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
PageIndexMultiDelete(page, deletable, ndeletable);
/*
* Mark the page as not containing any LP_DEAD items. This is not
* certainly true (there might be some that have recently been marked,
* but weren't included in our target-item list), but it will almost
* always be true and it doesn't seem worth an additional page scan to
* check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
* anyway.
*/
pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
metap = HashPageGetMeta(BufferGetPage(metabuf));
metap->hashm_ntuples -= ndeletable;
MarkBufferDirty(buf);
MarkBufferDirty(metabuf);
/* XLOG stuff */
if (RelationNeedsWAL(rel)) {
xl_hash_vacuum_one_page xlrec;
XLogRecPtr recptr;
xlrec.hnode = hnode;
xlrec.ntuples = ndeletable;
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);
/*
* We need the target-offsets array whether or not we store the
* whole buffer, to allow us to find the latestRemovedXid on a
* standby server.
*/
XLogRegisterData((char *) deletable,
ndeletable * sizeof(OffsetNumber));
XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);
PageSetLSN(BufferGetPage(buf), recptr);
PageSetLSN(BufferGetPage(metabuf), recptr);
}
END_CRIT_SECTION();
/*
* Releasing write lock on meta page as we have updated the tuple
* count.
*/
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,138 +0,0 @@
/* -------------------------------------------------------------------------
*
* hashscan.cpp
* manage scans on hash tables
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/gausskernel/storage/access/hash/hashscan.cpp
*
* -------------------------------------------------------------------------
*/
#include "postgres.h"
#include "knl/knl_variable.h"
#include "access/hash.h"
#include "access/relscan.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "utils/resowner.h"
/*
* We track all of a backend's active scans on hash indexes using a list
* of HashScanListData structs, which are allocated in t_thrd.top_mem_cxt.
* It's okay to use a long-lived context because we rely on the ResourceOwner
* mechanism to clean up unused entries after transaction or subtransaction
* abort. We can't safely keep the entries in the executor's per-query
* context, because that might be already freed before we get a chance to
* clean up the list. (XXX seems like there should be a better way to
* manage this...)
*/
typedef struct HashScanListData {
IndexScanDesc hashsl_scan;
ResourceOwner hashsl_owner;
struct HashScanListData *hashsl_next;
} HashScanListData;
typedef HashScanListData *HashScanList;
/*
* ReleaseResources_hash() --- clean up hash subsystem resources.
*
* This is here because it needs to touch this module's static var HashScans.
*/
void ReleaseResources_hash(void)
{
HashScanList l = NULL;
HashScanList prev = NULL;
HashScanList next = NULL;
/*
* Release all HashScanList items belonging to the current ResourceOwner.
* Note that we do not release the underlying IndexScanDesc; that's in
* executor memory and will go away on its own (in fact quite possibly has
* gone away already, so we mustn't try to touch it here).
*
* Note: this should be a no-op during normal query shutdown. However, in
* an abort situation ExecutorEnd is not called and so there may be open
* index scans to clean up.
*/
prev = NULL;
for (l = u_sess->exec_cxt.HashScans; l != NULL; l = next) {
next = l->hashsl_next;
if (l->hashsl_owner == t_thrd.utils_cxt.CurrentResourceOwner) {
if (prev == NULL)
u_sess->exec_cxt.HashScans = next;
else
prev->hashsl_next = next;
pfree(l);
/* prev does not change */
} else
prev = l;
}
}
/*
* _hash_regscan() -- register a new scan.
*/
void _hash_regscan(IndexScanDesc scan)
{
HashScanList new_el;
new_el = (HashScanList)MemoryContextAlloc(
SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE), sizeof(HashScanListData));
new_el->hashsl_scan = scan;
new_el->hashsl_owner = t_thrd.utils_cxt.CurrentResourceOwner;
new_el->hashsl_next = u_sess->exec_cxt.HashScans;
u_sess->exec_cxt.HashScans = new_el;
}
/*
* _hash_dropscan() -- drop a scan from the scan list
*/
void _hash_dropscan(IndexScanDesc scan)
{
HashScanList chk = NULL;
HashScanList last = NULL;
last = NULL;
for (chk = u_sess->exec_cxt.HashScans; chk != NULL && chk->hashsl_scan != scan; chk = chk->hashsl_next)
last = chk;
if (chk == NULL)
ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("hash scan list trashed")));
if (last == NULL)
u_sess->exec_cxt.HashScans = chk->hashsl_next;
else
last->hashsl_next = chk->hashsl_next;
pfree(chk);
}
/*
* Is there an active scan in this bucket?
*/
bool _hash_has_active_scan(Relation rel, Bucket bucket)
{
Oid relid = RelationGetRelid(rel);
HashScanList l = NULL;
for (l = u_sess->exec_cxt.HashScans; l != NULL; l = l->hashsl_next) {
if (relid == l->hashsl_scan->indexRelation->rd_id) {
HashScanOpaque so = (HashScanOpaque)l->hashsl_scan->opaque;
if (so->hashso_bucket_valid && so->hashso_bucket == bucket)
return true;
}
}
return false;
}

View File

@ -3,8 +3,8 @@
* hashsearch.cpp
* search code for postgres hash tables
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@ -64,40 +64,131 @@ bool _hash_next(IndexScanDesc scan, ScanDirection dir)
}
/*
* Advance to next page in a bucket, if any.
* Advance to next page in a bucket, if any. If we are scanning the bucket
* being populated during split operation then this function advances to the
* bucket being split after the last bucket page of bucket being populated.
*/
static void _hash_readnext(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
static void _hash_readnext(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep)
{
BlockNumber blkno;
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque)scan->opaque;
bool block_found = false;
blkno = (*opaquep)->hasho_nextblkno;
_hash_relbuf(rel, *bufp);
/*
* Retain the pin on primary bucket page till the end of scan. Refer the
* comments in _hash_first to know the reason of retaining pin.
*/
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
else
_hash_relbuf(rel, *bufp);
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
if (BlockNumberIsValid(blkno)) {
*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
block_found = true;
} else if (so->hashso_buc_populated && !so->hashso_buc_split) {
/*
* end of bucket, scan bucket being split if there was a split in
* progress at the start of scan.
*/
*bufp = so->hashso_split_bucket_buf;
/*
* buffer for bucket being split must be valid as we acquire the pin
* on it before the start of scan and retain it till end of scan.
*/
Assert(BufferIsValid(*bufp));
LockBuffer(*bufp, BUFFER_LOCK_SHARE);
/*
* setting hashso_buc_split to true indicates that we are scanning
* bucket being split.
*/
so->hashso_buc_split = true;
block_found = true;
}
if (block_found) {
*pagep = BufferGetPage(*bufp);
*opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
}
}
/*
* Advance to previous page in a bucket, if any.
 * Advance to previous page in a bucket, if any. If the current scan
 * started during a split operation, this function advances to the bucket
 * being populated after the first bucket page of the bucket being split.
*/
static void _hash_readprev(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
static void _hash_readprev(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep)
{
BlockNumber blkno;
Relation rel = scan->indexRelation;
HashScanOpaque so = (HashScanOpaque) scan->opaque;
bool haveprevblk;
blkno = (*opaquep)->hasho_prevblkno;
_hash_relbuf(rel, *bufp);
/*
* Retain the pin on primary bucket page till the end of scan. Refer the
* comments in _hash_first to know the reason of retaining pin.
*/
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) {
LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
haveprevblk = false;
} else {
_hash_relbuf(rel, *bufp);
haveprevblk = true;
}
*bufp = InvalidBuffer;
/* check for interrupts while we're not holding any buffer lock */
CHECK_FOR_INTERRUPTS();
if (BlockNumberIsValid(blkno)) {
*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
if (haveprevblk) {
Assert(BlockNumberIsValid(blkno));
*bufp = _hash_getbuf(rel, blkno, HASH_READ,
LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
*pagep = BufferGetPage(*bufp);
*opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
/*
* We always maintain the pin on bucket page for whole scan operation,
* so releasing the additional pin we have acquired here.
*/
if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
_hash_dropbuf(rel, *bufp);
} else if (so->hashso_buc_populated && so->hashso_buc_split) {
/*
* end of bucket, scan bucket being populated if there was a split in
* progress at the start of scan.
*/
*bufp = so->hashso_bucket_buf;
/*
* buffer for bucket being populated must be valid as we acquire the
* pin on it before the start of scan and retain it till end of scan.
*/
Assert(BufferIsValid(*bufp));
LockBuffer(*bufp, BUFFER_LOCK_SHARE);
*pagep = BufferGetPage(*bufp);
*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
/* move to the end of bucket chain */
while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
_hash_readnext(scan, bufp, pagep, opaquep);
/*
* setting hashso_buc_split to false indicates that we are scanning
* bucket being populated.
*/
so->hashso_buc_split = false;
}
}
@ -117,12 +208,9 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
ScanKey cur;
uint32 hashkey;
Bucket bucket;
BlockNumber blkno;
Buffer buf;
Buffer metabuf;
Page page;
HashPageOpaque opaque;
HashMetaPage metap;
IndexTuple itup;
ItemPointer current;
OffsetNumber offnum;
@ -174,48 +262,71 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
so->hashso_sk_hash = hashkey;
/*
* Acquire shared split lock so we can compute the target bucket safely
* (see README).
*/
_hash_getlock(rel, 0, HASH_SHARE);
/* Read the metapage */
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
/*
* Compute the target bucket number, and convert to block number.
*/
bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask);
blkno = BUCKET_TO_BLKNO(metap, bucket);
/* done with the metapage */
_hash_relbuf(rel, metabuf);
/*
* Acquire share lock on target bucket; then we can release split lock.
*/
_hash_getlock(rel, blkno, HASH_SHARE);
_hash_droplock(rel, 0, HASH_SHARE);
/* Update scan opaque state to show we have lock on the bucket */
so->hashso_bucket = bucket;
so->hashso_bucket_valid = true;
so->hashso_bucket_blkno = blkno;
/* Fetch the primary bucket page for the bucket */
buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
page = BufferGetPage(buf);
opaque = (HashPageOpaque)PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == bucket);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
bucket = opaque->hasho_bucket;
so->hashso_bucket_buf = buf;
/*
* If a bucket split is in progress, then while scanning the bucket being
* populated, we need to skip tuples that were copied from bucket being
* split. We also need to maintain a pin on the bucket being split to
* ensure that split-cleanup work done by vacuum doesn't remove tuples
* from it till this scan is done. We need to maintain a pin on the
* bucket being populated to ensure that vacuum doesn't squeeze that
* bucket till this scan is complete; otherwise, the ordering of tuples
* can't be maintained during forward and backward scans. Here, we have
* to be cautious about locking order: first, acquire the lock on bucket
* being split; then, release the lock on it but not the pin; then,
* acquire a lock on bucket being populated and again re-verify whether
* the bucket split is still in progress. Acquiring the lock on bucket
* being split first ensures that the vacuum waits for this scan to
* finish.
*/
if (H_BUCKET_BEING_POPULATED(opaque)) {
BlockNumber old_blkno;
Buffer old_buf;
old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);
/*
* release the lock on new bucket and re-acquire it after acquiring
* the lock on old bucket.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);
/*
* remember the split bucket buffer so as to use it later for
* scanning.
*/
so->hashso_split_bucket_buf = old_buf;
LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
LockBuffer(buf, BUFFER_LOCK_SHARE);
page = BufferGetPage(buf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
Assert(opaque->hasho_bucket == bucket);
if (H_BUCKET_BEING_POPULATED(opaque)) {
so->hashso_buc_populated = true;
} else {
_hash_dropbuf(rel, so->hashso_split_bucket_buf);
so->hashso_split_bucket_buf = InvalidBuffer;
}
}
/* If a backwards scan is requested, move to the end of the chain */
if (ScanDirectionIsBackward(dir)) {
while (BlockNumberIsValid(opaque->hasho_nextblkno))
_hash_readnext(rel, &buf, &page, &opaque);
/*
 * Backward scans that start during a split need to start from the end of
 * the bucket being split.
*/
while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
(so->hashso_buc_populated && !so->hashso_buc_split))
_hash_readnext(scan, &buf, &page, &opaque);
}
/* Now find the first tuple satisfying the qualification */
@ -239,6 +350,12 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
* false. Else, return true and set the hashso_curpos for the
* scan to the right thing.
*
* Here we need to ensure that if the scan has started during split, then
* skip the tuples that are moved by split while scanning bucket being
* populated and then scan the bucket being split to cover all such
* tuples. This is done to ensure that we don't miss tuples in the scans
* that are started during split.
*
* 'bufP' points to the current buffer, which is pinned and read-locked.
* On success exit, we have pin and read-lock on whichever page
* contains the right item; on failure, we have released all buffers.
@ -283,9 +400,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
do {
switch (dir) {
case ForwardScanDirection:
if (offnum != InvalidOffsetNumber)
if (offnum != InvalidOffsetNumber) {
offnum = OffsetNumberNext(offnum); /* move forward */
else {
} else {
/* new page, locate starting position by binary search */
offnum = _hash_binsearch(page, so->hashso_sk_hash);
}
@ -298,14 +415,27 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
if (offnum <= maxoff) {
Assert(offnum >= FirstOffsetNumber);
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum));
/*
* skip the tuples that are moved by split operation
* for the scan that has started when split was in
* progress
*/
if (so->hashso_buc_populated && !so->hashso_buc_split &&
(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) {
offnum = OffsetNumberNext(offnum); /* move forward */
continue;
}
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
break; /* yes, so exit for-loop */
}
/* Before leaving current page, deal with any killed items */
if (so->numKilled > 0)
_hash_kill_items(scan);
/*
* ran off the end of this page, try the next
*/
_hash_readnext(rel, &buf, &page, &opaque);
_hash_readnext(scan, &buf, &page, &opaque);
if (BufferIsValid(buf)) {
maxoff = PageGetMaxOffsetNumber(page);
offnum = _hash_binsearch(page, so->hashso_sk_hash);
@ -318,9 +448,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
break;
case BackwardScanDirection:
if (offnum != InvalidOffsetNumber)
if (offnum != InvalidOffsetNumber) {
offnum = OffsetNumberPrev(offnum); /* move back */
else {
} else {
/* new page, locate starting position by binary search */
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
}
@ -333,14 +463,26 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
if (offnum >= FirstOffsetNumber) {
Assert(offnum <= maxoff);
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum));
/*
* skip the tuples that are moved by split operation
* for the scan that has started when split was in
* progress
*/
if (so->hashso_buc_populated && !so->hashso_buc_split &&
(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) {
offnum = OffsetNumberPrev(offnum); /* move back */
continue;
}
if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
break; /* yes, so exit for-loop */
}
/* Before leaving current page, deal with any killed items */
if (so->numKilled > 0)
_hash_kill_items(scan);
/*
* ran off the end of this page, try the next
*/
_hash_readprev(rel, &buf, &page, &opaque);
_hash_readprev(scan, &buf, &page, &opaque);
if (BufferIsValid(buf)) {
maxoff = PageGetMaxOffsetNumber(page);
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
@ -360,9 +502,16 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
}
if (itup == NULL) {
/* we ran off the end of the bucket without finding a match */
/*
* We ran off the end of the bucket without finding a match.
* Release the pin on bucket buffers. Normally, such pins are
* released at end of scan, however scrolling cursors can
* reacquire the bucket lock and pin in the same scan multiple
* times.
*/
*bufP = so->hashso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(current);
_hash_dropscanbuf(rel, so);
return false;
}

View File

@ -14,8 +14,8 @@
* plenty of locality of access.
*
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
@ -37,15 +37,23 @@
struct HSpool {
Tuplesortstate *sortstate; /* state data for tuplesort.c */
Relation index;
/*
* We sort the hash keys based on the buckets they belong to. Below masks
* are used in _hash_hashkey2bucket to determine the bucket of given hash
* key.
*/
uint32 high_mask;
uint32 low_mask;
uint32 max_buckets;
};
/*
* create and initialize a spool structure
*/
HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo)
HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void *meminfo)
{
HSpool *hspool = (HSpool *)palloc0(sizeof(HSpool));
uint32 hash_mask;
UtilityDesc *desc = (UtilityDesc *)meminfo;
int work_mem = (desc->query_mem[0] > 0) ? desc->query_mem[0] : u_sess->attr.attr_memory.maintenance_work_mem;
int max_mem = (desc->query_mem[1] > 0) ? desc->query_mem[1] : 0;
@ -57,18 +65,26 @@ HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo)
* num_buckets buckets in the index, the appropriate mask can be computed
* as follows.
*
* Note: at present, the passed-in num_buckets is always a power of 2, so
* we could just compute num_buckets - 1. We prefer not to assume that
* here, though.
* NOTE : This hash mask calculation should be in sync with similar
* calculation in _hash_init_metabuffer.
*/
hash_mask = (((uint32)1) << _hash_log2(num_buckets)) - 1;
hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1;
hspool->low_mask = (hspool->high_mask >> 1);
hspool->max_buckets = num_buckets - 1;
/*
* We size the sort area as maintenance_work_mem rather than work_mem to
* speed index creation. This should be OK since a single backend can't
* run multiple index creations in parallel.
*/
hspool->sortstate = tuplesort_begin_index_hash(index, hash_mask, work_mem, false, max_mem);
hspool->sortstate = tuplesort_begin_index_hash(heap,
index,
hspool->high_mask,
hspool->low_mask,
hspool->max_buckets,
work_mem,
false,
max_mem);
return hspool;
}
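/*
 * For reference: the masks computed above feed _hash_hashkey2bucket when the
 * spooled tuples are sorted by target bucket.  A minimal sketch of that
 * mapping (an assumption about the conventional behaviour, not part of the
 * patch): mask with high_mask, and fall back to low_mask when the result
 * exceeds max_buckets.
 */
static Bucket sketch_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)
{
    Bucket bucket = hashkey & highmask;

    if (bucket > maxbucket)
        bucket = bucket & lowmask;  /* wrap into the lower half of the table */
    return bucket;
}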
@ -94,7 +110,7 @@ void _h_spool(HSpool *hspool, ItemPointer self, Datum *values, const bool *isnul
* given a spool loaded by successive calls to _h_spool,
* create an entire index.
*/
void _h_indexbuild(HSpool *hspool)
void _h_indexbuild(HSpool *hspool, Relation heapRel)
{
IndexTuple itup;
bool should_free = false;
@ -102,7 +118,7 @@ void _h_indexbuild(HSpool *hspool)
tuplesort_performsort(hspool->sortstate);
while ((itup = tuplesort_getindextuple(hspool->sortstate, true, &should_free)) != NULL) {
_hash_doinsert(hspool->index, itup);
_hash_doinsert(hspool->index, itup, heapRel);
if (should_free)
pfree(itup);
}

View File

@ -3,8 +3,8 @@
* hashutil.cpp
* Utility code for Postgres hash implementation.
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
@ -22,7 +22,9 @@
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "storage/buf/buf_internals.h"
#define CALC_NEW_BUCKET(old_bucket, lowmask) ((old_bucket) | ((lowmask) + 1))
/*
* _hash_checkqual -- does the index tuple satisfy the scan conditions?
*/
@ -133,6 +135,70 @@ uint32 _hash_log2(uint32 num)
return i;
}
/*
* _hash_spareindex -- returns spare index / global splitpoint phase of the bucket
*/
uint32 _hash_spareindex(uint32 num_bucket)
{
uint32 splitpoint_group;
uint32 splitpoint_phases;
splitpoint_group = _hash_log2(num_bucket);
if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
return splitpoint_group;
/* account for single-phase groups */
splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
/* account for multi-phase groups before splitpoint_group */
splitpoint_phases +=
((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
HASH_SPLITPOINT_PHASE_BITS);
/* account for phases within current group */
splitpoint_phases +=
(((num_bucket - 1) >>
(splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */
return splitpoint_phases;
}
/*
* _hash_get_totalbuckets -- returns total number of buckets allocated till
* the given splitpoint phase.
*/
uint32 _hash_get_totalbuckets(uint32 splitpoint_phase)
{
uint32 splitpoint_group;
uint32 total_buckets;
uint32 phases_within_splitpoint_group;
if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
return (1 << splitpoint_phase);
/* get splitpoint's group */
splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
splitpoint_group +=
((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
HASH_SPLITPOINT_PHASE_BITS);
/* account for buckets before splitpoint_group */
total_buckets = (1 << (splitpoint_group - 1));
/* account for buckets within splitpoint_group */
phases_within_splitpoint_group =
(((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
total_buckets +=
(((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
phases_within_splitpoint_group);
return total_buckets;
}
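/*
 * Worked example (sketch, not part of the patch): with
 * HASH_SPLITPOINT_PHASE_BITS = 2 and HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE = 10,
 * splitpoint group 10 covers buckets 513..1024 and is carved into four phases
 * of 128 buckets each.
 */
static void sketch_splitpoint_phases(void)
{
    Assert(_hash_spareindex(512) == 9);         /* last single-phase group */
    Assert(_hash_spareindex(513) == 10);        /* first phase of group 10 */
    Assert(_hash_get_totalbuckets(10) == 640);  /* 512 + 128 */
    Assert(_hash_get_totalbuckets(13) == 1024); /* group 10 fully allocated */
    Assert(_hash_spareindex(1024) == 13);
}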
/*
* _hash_checkpage -- sanity checks on the format of all hash pages
*
@ -216,25 +282,36 @@ uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
}
/*
* _hash_form_tuple - form an index tuple containing hash code only
* _hash_convert_tuple - convert raw index data to hash key
*
* Inputs: values and isnull arrays for the user data column(s)
* Outputs: values and isnull arrays for the index tuple, suitable for
* passing to index_form_tuple().
*
* Returns true if successful, false if not (because there are null values).
* On a false result, the given data need not be indexed.
*
* Note: callers know that the index-column arrays are always of length 1.
* In principle, there could be more than one input column, though we do not
* currently support that.
*/
IndexTuple _hash_form_tuple(Relation index, Datum *values, const bool *isnull)
bool _hash_convert_tuple(Relation index,
Datum *user_values, const bool *user_isnull,
Datum *index_values, bool *index_isnull)
{
IndexTuple itup;
uint32 hashkey;
Datum hashkeydatum;
TupleDesc hashdesc;
if (isnull[0]) {
hashkeydatum = (Datum)0;
} else {
hashkey = _hash_datum2hashkey(index, values[0]);
hashkeydatum = UInt32GetDatum(hashkey);
}
hashdesc = RelationGetDescr(index);
Assert(hashdesc->natts == 1);
itup = index_form_tuple(hashdesc, &hashkeydatum, isnull);
return itup;
/*
* We do not insert null values into hash indexes. This is okay because
* the only supported search operator is '=', and we assume it is strict.
*/
if (user_isnull[0])
return false;
hashkey = _hash_datum2hashkey(index, user_values[0]);
index_values[0] = UInt32GetDatum(hashkey);
index_isnull[0] = false;
return true;
}
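/*
 * Sketch of the expected caller shape (the helper below is illustrative, not
 * part of the patch): a non-NULL key is converted to its hash code, packed
 * into an index tuple, and handed to _hash_doinsert with the heapRel argument
 * declared in the updated hash.h.
 */
static void sketch_hash_insert_one(Relation index, Relation heapRel,
                                   Datum *values, const bool *isnull, ItemPointer ht_ctid)
{
    Datum index_values[1];
    bool index_isnull[1];
    IndexTuple itup;

    /* NULL keys are not indexed; _hash_convert_tuple reports that via false */
    if (!_hash_convert_tuple(index, values, isnull, index_values, index_isnull))
        return;

    itup = index_form_tuple(RelationGetDescr(index), index_values, index_isnull);
    itup->t_tid = *ht_ctid;               /* point the index entry at the heap tuple */
    _hash_doinsert(index, itup, heapRel); /* signature per the updated hash.h */
    pfree(itup);
}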
/*
@ -312,3 +389,154 @@ OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value)
return lower;
}
/*
* _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
* from which current (new) bucket is being split.
*/
BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
Bucket old_bucket;
uint32 mask;
Buffer metabuf;
HashMetaPage metap;
BlockNumber blkno;
/*
* To get the old bucket from the current bucket, we need a mask to modulo
* into lower half of table. This mask is stored in meta page as
* hashm_lowmask, but here we can't rely on the same, because we need a
* value of lowmask that was prevalent at the time when bucket split was
* started. Masking the most significant bit of new bucket would give us
* old bucket.
*/
mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
old_bucket = new_bucket & mask;
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
blkno = BUCKET_TO_BLKNO(metap, old_bucket);
_hash_relbuf(rel, metabuf);
return blkno;
}
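/*
 * Worked example (sketch): new bucket 11 (binary 1011) must have been split
 * from old bucket 3 -- masking off the most significant bit recovers it.
 */
static void sketch_old_bucket_from_new(void)
{
    uint32 new_bucket = 11;
    uint32 mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1; /* fls(11) = 4, mask = 7 */

    Assert((new_bucket & mask) == 3); /* 1011 & 0111 = 0011 */
}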
/*
* _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
* that will be generated after split from old bucket.
*
* This is used to find the new bucket from old bucket based on current table
* half. It is mainly required to finish the incomplete splits where we are
* sure that not more than one bucket could have split in progress from old
* bucket.
*/
BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
Bucket new_bucket;
Buffer metabuf;
HashMetaPage metap;
BlockNumber blkno;
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
metap = HashPageGetMeta(BufferGetPage(metabuf));
new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
metap->hashm_lowmask,
metap->hashm_maxbucket);
blkno = BUCKET_TO_BLKNO(metap, new_bucket);
_hash_relbuf(rel, metabuf);
return blkno;
}
/*
* _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
* generated after split from current (old) bucket.
*
* This is used to find the new bucket from old bucket. New bucket can be
* obtained by OR'ing old bucket with most significant bit of current table
* half (lowmask passed in this function can be used to identify msb of
* current table half). There could be multiple buckets that could have
* been split from current bucket. We need the first such bucket that exists.
* Caller must ensure that no more than one split has happened from old
* bucket.
*/
Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
uint32 lowmask, uint32 maxbucket)
{
Bucket new_bucket;
new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
if (new_bucket > maxbucket) {
lowmask = lowmask >> 1;
new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
}
return new_bucket;
}
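/*
 * Worked example (sketch): splitting old bucket 3 in a table half of 8
 * buckets produces bucket 11; if bucket 11 does not exist yet
 * (maxbucket < 11), the previous half is tried instead.
 */
static void sketch_new_bucket_from_old(void)
{
    uint32 lowmask = 7; /* current table half covers buckets 0..7 */

    Assert(CALC_NEW_BUCKET(3, lowmask) == 11);     /* 3 | 8 */
    Assert(CALC_NEW_BUCKET(3, lowmask >> 1) == 7); /* fallback: 3 | 4 */
}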
/*
* _hash_kill_items - set LP_DEAD state for items an indexscan caller has
* told us were killed.
*
* scan->opaque, referenced locally through so, contains information about the
* current page and killed tuples thereon (generally, this should only be
* called if so->numKilled > 0).
*
* We match items by heap TID before assuming they are the right ones to
* delete.
*/
void _hash_kill_items(IndexScanDesc scan)
{
HashScanOpaque so = (HashScanOpaque) scan->opaque;
Page page;
HashPageOpaque opaque;
OffsetNumber offnum;
OffsetNumber maxoff;
int numKilled = so->numKilled;
int i;
bool killedsomething = false;
Assert(so->numKilled > 0);
Assert(so->killedItems != NULL);
/*
* Always reset the scan state, so we don't look for same items on other
* pages.
*/
so->numKilled = 0;
page = BufferGetPage(so->hashso_curbuf);
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
for (i = 0; i < numKilled; i++) {
offnum = so->killedItems[i].indexOffset;
while (offnum <= maxoff) {
ItemId iid = PageGetItemId(page, offnum);
IndexTuple ituple = (IndexTuple)PageGetItem(page, iid);
if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid)) {
/* found the item */
ItemIdMarkDead(iid);
killedsomething = true;
break; /* out of inner search loop */
}
offnum = OffsetNumberNext(offnum);
}
}
/*
* Since this can be redone later if needed, mark as dirty hint. Whenever
* we mark anything LP_DEAD, we also set the page's
* LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
*/
if (killedsomething) {
opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
MarkBufferDirtyHint(so->hashso_curbuf, true);
}
}

File diff suppressed because it is too large

View File

@ -843,6 +843,9 @@ void XLogBlockDataCommonRedo(XLogBlockHead *blockhead, void *blockrecbody, RedoB
case RM_BTREE_ID:
BtreeRedoDataBlock(blockhead, blockdatarec, bufferinfo);
break;
case RM_HASH_ID:
HashRedoDataBlock(blockhead, blockdatarec, bufferinfo);
break;
case RM_XLOG_ID:
xlog_redo_data_block(blockhead, blockdatarec, bufferinfo);
break;

View File

@ -16,9 +16,155 @@
#include "postgres.h"
#include "knl/knl_variable.h"
#include "access/hash.h"
#include "access/rmgr.h"
#include "access/hash_xlog.h"
void hash_desc(StringInfo buf, XLogReaderState *record)
{
/* nothing to do */
char *rec = XLogRecGetData(record);
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
switch (info) {
case XLOG_HASH_INIT_META_PAGE:
{
xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec;
appendStringInfo(buf, "num_tuples %g, fillfactor %d",
xlrec->num_tuples, xlrec->ffactor);
break;
}
case XLOG_HASH_INIT_BITMAP_PAGE:
{
xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec;
appendStringInfo(buf, "bmsize %d", xlrec->bmsize);
break;
}
case XLOG_HASH_INSERT:
{
xl_hash_insert *xlrec = (xl_hash_insert *) rec;
appendStringInfo(buf, "off %u", xlrec->offnum);
break;
}
case XLOG_HASH_ADD_OVFL_PAGE:
{
xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) rec;
appendStringInfo(buf, "bmsize %d, bmpage_found %c",
xlrec->bmsize, (xlrec->bmpage_found) ? 'T' : 'F');
break;
}
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
{
xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec;
appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c",
xlrec->new_bucket,
(xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F',
(xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F');
break;
}
case XLOG_HASH_SPLIT_COMPLETE:
{
xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec;
appendStringInfo(buf, "old_bucket_flag %u, new_bucket_flag %u",
xlrec->old_bucket_flag, xlrec->new_bucket_flag);
break;
}
case XLOG_HASH_MOVE_PAGE_CONTENTS:
{
xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec;
appendStringInfo(buf, "ntups %d, is_primary %c",
xlrec->ntups,
xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
break;
}
case XLOG_HASH_SQUEEZE_PAGE:
{
xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec;
appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c",
xlrec->prevblkno,
xlrec->nextblkno,
xlrec->ntups,
xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
break;
}
case XLOG_HASH_DELETE:
{
xl_hash_delete *xlrec = (xl_hash_delete *) rec;
appendStringInfo(buf, "clear_dead_marking %c, is_primary %c",
xlrec->clear_dead_marking ? 'T' : 'F',
xlrec->is_primary_bucket_page ? 'T' : 'F');
break;
}
case XLOG_HASH_UPDATE_META_PAGE:
{
xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec;
appendStringInfo(buf, "ntuples %g",
xlrec->ntuples);
break;
}
case XLOG_HASH_VACUUM_ONE_PAGE:
{
xl_hash_vacuum_one_page *xlrec = (xl_hash_vacuum_one_page *) rec;
appendStringInfo(buf, "ntuples %d",
xlrec->ntuples);
break;
}
}
}
const char *hash_identify(uint8 info)
{
const char *id = NULL;
switch (info & ~XLR_INFO_MASK) {
case XLOG_HASH_INIT_META_PAGE:
id = "INIT_META_PAGE";
break;
case XLOG_HASH_INIT_BITMAP_PAGE:
id = "INIT_BITMAP_PAGE";
break;
case XLOG_HASH_INSERT:
id = "INSERT";
break;
case XLOG_HASH_ADD_OVFL_PAGE:
id = "ADD_OVFL_PAGE";
break;
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
id = "SPLIT_ALLOCATE_PAGE";
break;
case XLOG_HASH_SPLIT_PAGE:
id = "SPLIT_PAGE";
break;
case XLOG_HASH_SPLIT_COMPLETE:
id = "SPLIT_COMPLETE";
break;
case XLOG_HASH_MOVE_PAGE_CONTENTS:
id = "MOVE_PAGE_CONTENTS";
break;
case XLOG_HASH_SQUEEZE_PAGE:
id = "SQUEEZE_PAGE";
break;
case XLOG_HASH_DELETE:
id = "DELETE";
break;
case XLOG_HASH_SPLIT_CLEANUP:
id = "SPLIT_CLEANUP";
break;
case XLOG_HASH_UPDATE_META_PAGE:
id = "UPDATE_META_PAGE";
break;
case XLOG_HASH_VACUUM_ONE_PAGE:
id = "VACUUM_ONE_PAGE";
}
return id;
}

View File

@ -32,6 +32,7 @@
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/nbtree.h"
#include "access/hash_xlog.h"
#include "access/xlogreader.h"
#include "access/gist_private.h"
#include "access/multixact.h"
@ -165,7 +166,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = {
{ DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE },
{ DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE },
{ DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE },
{ DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 },
{ DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE },
{ DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE },
/* XLOG_GIST_PAGE_DELETE is not used and info isn't continuous */
{ DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 },
@ -1031,8 +1032,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time
/* Run from the dispatcher thread. */
static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
{
DispatchTxnRecord(record, expectedTLIs, recordXTime, false, true);
return true;
bool isNeedFullSync = false;
/* indexes do not support MVCC, so we need to sync with the trxn thread when the record is a vacuum record */
if (IsHashVacuumPages(record) && g_supportHotStandby) {
GetSlotIds(record, ANY_WORKER, true);
/*
 * Sync with the trxn thread: this record only needs to be replayed by the
 * page worker threads, which wait for the trxn thread to sync; the trxn
 * thread itself does not need to execute it.
 */
DispatchToSpecPageWorker(record, expectedTLIs, true);
} else {
DispatchRecordWithPages(record, expectedTLIs, true);
}
return isNeedFullSync;
}
/* Run from the dispatcher thread. */

View File

@ -31,6 +31,7 @@
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/nbtree.h"
#include "access/hash_xlog.h"
#include "access/xlogreader.h"
#include "access/gist_private.h"
#include "access/multixact.h"
@ -165,7 +166,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = {
{ DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE },
{ DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE },
{ DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE },
{ DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 },
{ DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE },
{ DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE },
/* XLOG_GIST_PAGE_DELETE is not used and info isn't continuous */
{ DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 },
@ -912,8 +913,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time
/* Run from the dispatcher thread. */
static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
{
DispatchTxnRecord(record, expectedTLIs, recordXTime, false);
return true;
bool isNeedFullSync = false;
/* indexes do not support MVCC, so we need to sync with the trxn thread when the record is a vacuum record */
if (IsHashVacuumPages(record) && g_supportHotStandby) {
GetWorkerIds(record, ANY_WORKER, true);
/*
 * Sync with the trxn thread: this record only needs to be replayed by the
 * page worker threads, which wait for the trxn thread to sync; the trxn
 * thread itself does not need to execute it.
 */
DispatchToSpecPageWorker(record, expectedTLIs, true);
} else {
DispatchRecordWithPages(record, expectedTLIs, true);
}
return isNeedFullSync;
}
static bool DispatchBtreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)

View File

@ -29,6 +29,7 @@
#include "access/gin.h"
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/nbtree.h"

View File

@ -5264,6 +5264,51 @@ bool ConditionalLockBufferForCleanup(Buffer buffer)
return false;
}
/*
* IsBufferCleanupOK - as above, but we already have the lock
*
* Check whether it's OK to perform cleanup on a buffer we've already
* locked. If we observe that the pin count is 1, our exclusive lock
* happens to be a cleanup lock, and we can proceed with anything that
* would have been allowable had we sought a cleanup lock originally.
*/
bool IsBufferCleanupOK(Buffer buffer)
{
BufferDesc *bufHdr;
uint32 buf_state;
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer)) {
/* There should be exactly one pin */
if (u_sess->storage_cxt.LocalRefCount[-buffer - 1] != 1)
return false;
/* Nobody else to wait for */
return true;
}
/* There should be exactly one local pin */
if (GetPrivateRefCount(buffer) != 1)
return false;
bufHdr = GetBufferDescriptor(buffer - 1);
/* caller must hold exclusive lock on buffer */
Assert(LWLockHeldByMeInMode(bufHdr->content_lock, LW_EXCLUSIVE));
buf_state = LockBufHdr(bufHdr);
Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) {
/* pincount is OK. */
UnlockBufHdr(bufHdr, buf_state);
return true;
}
UnlockBufHdr(bufHdr, buf_state);
return false;
}
/*
* Functions for buffer I/O handling
*

View File

@ -400,3 +400,28 @@ void PageSetChecksumInplace(Page page, BlockNumber blkno)
((PageHeader)page)->pd_checksum = pg_checksum_page((char*)page, blkno);
}
/*
* PageGetFreeSpaceForMultipleTuples
* Returns the size of the free (allocatable) space on a page,
* reduced by the space needed for multiple new line pointers.
*
* Note: this should usually only be used on index pages. Use
* PageGetHeapFreeSpace on heap pages.
*/
Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups)
{
int space;
/*
* Use signed arithmetic here so that we behave sensibly if pd_lower >
* pd_upper.
*/
space = (int)((PageHeader)page)->pd_upper - (int)((PageHeader)page)->pd_lower;
if (space < (int)(ntups * sizeof(ItemIdData)))
return 0;
space -= ntups * sizeof(ItemIdData);
return (Size) space;
}
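/*
 * Typical use (a sketch; the squeeze-path caller shown here is an assumption):
 * before moving a batch of tuples onto a page, verify that both the new line
 * pointers and the MAXALIGNed tuple bodies fit.
 */
static bool sketch_tuples_fit(Page page, int ntups, Size all_tups_size)
{
    /* PageGetFreeSpaceForMultipleTuples already charges for ntups line pointers */
    return PageGetFreeSpaceForMultipleTuples(page, ntups) >= all_tups_size;
}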

View File

@ -4,7 +4,7 @@
* header file for postgres hash access method implementation
*
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/hash.h
@ -33,36 +33,59 @@
*/
typedef uint32 Bucket;
#define INVALID_BUCKET_NUM (0xFFFFFFFF)
#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B) + 1) - 1] : 0)) + 1)
#define InvalidBucket ((Bucket) 0xFFFFFFFF)
#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B) + 1) - 1] : 0)) + 1)
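/*
 * Example (a sketch, assuming a metapage pointer "metap"): while hashm_spares[]
 * is still all zero, bucket B simply lives at block B + 1, since block 0 is the
 * metapage; each overflow page recorded in hashm_spares[] shifts later buckets
 * one block further right.
 *
 *     BUCKET_TO_BLKNO(metap, 0) == 1
 *     BUCKET_TO_BLKNO(metap, 5) == 6
 */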
/*
* Special space for hash index pages.
*
* hasho_flag tells us which type of page we're looking at. For
* example, knowing overflow pages from bucket pages is necessary
* information when you're deleting tuples from a page. If all the
* tuples are deleted from an overflow page, the overflow is made
* available to other buckets by calling _hash_freeovflpage(). If all
* the tuples are deleted from a bucket page, no additional action is
* necessary.
* hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
* Additional bits in the flag word are used for more transient purposes.
*
* To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
* However, we ensure that each used page type has a distinct bit so that
* we can OR together page types for uses such as the allowable-page-types
* argument of _hash_checkpage().
*/
#define LH_UNUSED_PAGE (0)
#define LH_OVERFLOW_PAGE (1 << 0)
#define LH_BUCKET_PAGE (1 << 1)
#define LH_BITMAP_PAGE (1 << 2)
#define LH_META_PAGE (1 << 3)
#define LH_BUCKET_BEING_POPULATED (1 << 4)
#define LH_BUCKET_BEING_SPLIT (1 << 5)
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6)
#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
#define LH_PAGE_TYPE \
(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
/*
* In an overflow page, hasho_prevblkno stores the block number of the previous
* page in the bucket chain; in a bucket page, hasho_prevblkno stores the
 * hashm_maxbucket value as of the last time the bucket was split, or
* else as of the time the bucket was created. The latter convention is used
* to determine whether a cached copy of the metapage is too stale to be used
* without needing to lock or pin the metapage.
*
* hasho_nextblkno is always the block number of the next page in the
* bucket chain, or InvalidBlockNumber if there are no more such pages.
*/
typedef struct HashPageOpaqueData {
BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
BlockNumber hasho_nextblkno; /* next ovfl blkno */
Bucket hasho_bucket; /* bucket number this pg belongs to */
uint16 hasho_flag; /* page type code, see above */
uint16 hasho_page_id; /* for identification of hash indexes */
BlockNumber hasho_prevblkno; /* see above */
BlockNumber hasho_nextblkno; /* see above */
Bucket hasho_bucket; /* bucket number this pg belongs to */
uint16 hasho_flag; /* page type code + flag bits, see above */
uint16 hasho_page_id; /* for identification of hash indexes */
} HashPageOpaqueData;
typedef HashPageOpaqueData* HashPageOpaque;
#define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
#define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
#define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
#define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
/*
* The page ID is for the convenience of pg_filedump and similar utilities,
* which otherwise would have a hard time telling pages of different index
@ -71,26 +94,19 @@ typedef HashPageOpaqueData* HashPageOpaque;
*/
#define HASHO_PAGE_ID 0xFF80
typedef struct HashScanPosItem {
ItemPointerData heapTid; /* TID of referenced heap item */
OffsetNumber indexOffset; /* index item's location within page */
} HashScanPosItem;
/*
* HashScanOpaqueData is private state for a hash index scan.
* HashScanOpaqueData is private state for a hash index scan.
*/
typedef struct HashScanOpaqueData {
/* Hash value of the scan key, ie, the hash key we seek */
uint32 hashso_sk_hash;
/*
* By definition, a hash scan should be examining only one bucket. We
* record the bucket number here as soon as it is known.
*/
Bucket hashso_bucket;
bool hashso_bucket_valid;
/*
* If we have a share lock on the bucket, we record it here. When
* hashso_bucket_blkno is zero, we have no such lock.
*/
BlockNumber hashso_bucket_blkno;
/*
* We also want to remember which buffer we're currently examining in the
* scan. We keep the buffer pinned (but not locked) across hashgettuple
@ -99,11 +115,33 @@ typedef struct HashScanOpaqueData {
*/
Buffer hashso_curbuf;
/* remember the buffer associated with primary bucket */
Buffer hashso_bucket_buf;
/*
* remember the buffer associated with primary bucket page of bucket being
* split. it is required during the scan of the bucket which is being
* populated during split operation.
*/
Buffer hashso_split_bucket_buf;
/* Current position of the scan, as an index TID */
ItemPointerData hashso_curpos;
/* Current position of the scan, as a heap TID */
ItemPointerData hashso_heappos;
/* Whether scan starts on bucket being populated due to split */
bool hashso_buc_populated;
/*
* Whether scanning bucket being split? The value of this parameter is
* referred only when hashso_buc_populated is true.
*/
bool hashso_buc_split;
/* info about killed items if any (killedItems is NULL if never used) */
HashScanPosItem *killedItems; /* tids and offset numbers of killed items */
int numKilled; /* number of currently stored items */
} HashScanOpaqueData;
typedef HashScanOpaqueData* HashScanOpaque;
@ -115,7 +153,7 @@ typedef HashScanOpaqueData* HashScanOpaque;
#define HASH_METAPAGE 0 /* metapage is always block 0 */
#define HASH_MAGIC 0x6440640
#define HASH_VERSION 2 /* 2 signifies only hash key value is stored */
#define HASH_VERSION 4
/*
* Spares[] holds the number of overflow pages currently allocated at or
@ -128,17 +166,32 @@ typedef HashScanOpaqueData* HashScanOpaque;
*
* ovflpages that have been recycled for reuse can be found by looking at
* bitmaps that are stored within ovflpages dedicated for the purpose.
* The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the
* The blknos of these bitmap pages are kept in mapp[]; nmaps is the
* number of currently existing bitmaps.
*
* The limitation on the size of spares[] comes from the fact that there's
* no point in having more than 2^32 buckets with only uint32 hashcodes.
* (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
* adjusted in such a way to accommodate multi phased allocation of buckets
* after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
*
* There is no particular upper limit on the size of mapp[], other than
* needing to fit into the metapage. (With 8K block size, 128 bitmaps
* limit us to 64 Gb of overflow space...)
* needing to fit into the metapage. (With 8K block size, 1024 bitmaps
* limit us to 256 GB of overflow space...)
*/
#define HASH_MAX_SPLITPOINTS 32
#define HASH_MAX_BITMAPS 128
#define HASH_MAX_BITMAPS 1024
#define HASH_SPLITPOINT_PHASE_BITS 2
#define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS)
#define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1)
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10
/* defines the maximum number of splitpoint phases a hash index can have */
#define HASH_MAX_SPLITPOINT_GROUP 32
#define HASH_MAX_SPLITPOINTS \
(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
HASH_SPLITPOINT_PHASES_PER_GRP) + \
HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
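/*
 * Plugging in the constants above (sanity arithmetic, not new code):
 *     HASH_MAX_SPLITPOINTS = (32 - 10) * 4 + 10 = 98
 * so hashm_spares[] now holds 98 entries instead of the previous flat 32.
 */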
typedef struct HashMetaPageData {
uint32 hashm_magic; /* magic no. for hash tables */
@ -280,37 +333,40 @@ extern Datum hash_new_uint32(uint32 k);
/* private routines */
/* hashinsert.c */
extern void _hash_doinsert(Relation rel, IndexTuple itup);
extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup);
extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
OffsetNumber *itup_offsets, uint16 nitups);
/* hashovfl.c */
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy);
extern void _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, ForkNumber forkNum);
extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy);
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy);
/* hashpage.c */
extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags);
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
BlockNumber blkno, int flags);
extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh);
extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
int access, HashMetaPage *cachedmetap);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, bool initpage);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum);
extern Buffer _hash_getbuf_with_strategy(
Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy);
extern void _hash_relbuf(Relation rel, Buffer buf);
extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_wrtbuf(Relation rel, Buffer buf);
extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access);
extern uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum);
extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
extern uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum);
extern void _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, uint16 ffactor, bool initpage);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
/* hashscan.c */
extern void _hash_regscan(IndexScanDesc scan);
extern void _hash_dropscan(IndexScanDesc scan);
extern bool _hash_has_active_scan(Relation rel, Bucket bucket);
extern void ReleaseResources_hash(void);
extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
uint32 maxbucket, uint32 highmask, uint32 lowmask);
/* hashsearch.c */
extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
@ -320,10 +376,10 @@ extern bool _hash_step(IndexScanDesc scan, Buffer* bufP, ScanDirection dir);
/* hashsort.c */
typedef struct HSpool HSpool; /* opaque struct in hashsort.c */
extern HSpool* _h_spoolinit(Relation index, uint32 num_buckets, void* meminfo);
extern HSpool* _h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void* meminfo);
extern void _h_spooldestroy(HSpool* hspool);
extern void _h_spool(HSpool* hspool, ItemPointer self, Datum* values, const bool* isnull);
extern void _h_indexbuild(HSpool* hspool);
extern void _h_indexbuild(HSpool* hspool, Relation heapRel);
/* hashutil.c */
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
@ -331,15 +387,28 @@ extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask);
extern uint32 _hash_log2(uint32 num);
extern uint32 _hash_spareindex(uint32 num_bucket);
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern IndexTuple _hash_form_tuple(Relation index, Datum* values, const bool* isnull);
extern bool _hash_convert_tuple(Relation index, Datum *user_values, const bool *user_isnull,
Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
uint32 lowmask, uint32 maxbucket);
extern void _hash_kill_items(IndexScanDesc scan);
/* hash.c */
extern void hash_redo(XLogReaderState* record);
extern void hash_desc(StringInfo buf, XLogReaderState* record);
extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
Buffer bucket_buf, BlockNumber bucket_blkno,
BufferAccessStrategy bstrategy,
uint32 maxbucket, uint32 highmask, uint32 lowmask,
double *tuples_removed, double *num_index_tuples,
bool bucket_has_garbage,
IndexBulkDeleteCallback callback, void *callback_state);
#ifdef PGXC
extern Datum compute_hash(Oid type, Datum value, char locator);

View File

@ -0,0 +1,352 @@
/*-------------------------------------------------------------------------
*
* hash_xlog.h
* header file for Postgres hash AM implementation
*
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/hash_xlog.h
*
*-------------------------------------------------------------------------
*/
#ifndef HASH_XLOG_H
#define HASH_XLOG_H
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"
/* Number of buffers required for XLOG_HASH_SQUEEZE_PAGE operation */
#define HASH_XLOG_FREE_OVFL_BUFS 6
/*
* XLOG records for hash operations
*/
#define XLOG_HASH_INIT_META_PAGE 0x00 /* initialize the meta page */
#define XLOG_HASH_INIT_BITMAP_PAGE 0x10 /* initialize the bitmap page */
#define XLOG_HASH_INSERT 0x20 /* add index tuple without split */
#define XLOG_HASH_ADD_OVFL_PAGE 0x30 /* add overflow page */
#define XLOG_HASH_SPLIT_ALLOCATE_PAGE 0x40 /* allocate new page for split */
#define XLOG_HASH_SPLIT_PAGE 0x50 /* split page */
#define XLOG_HASH_SPLIT_COMPLETE 0x60 /* completion of split operation */
#define XLOG_HASH_MOVE_PAGE_CONTENTS 0x70 /* remove tuples from one page
* and add to another page */
#define XLOG_HASH_SQUEEZE_PAGE 0x80 /* add tuples to one of the previous
* pages in chain and free the ovfl
* page */
#define XLOG_HASH_DELETE 0x90 /* delete index tuples from a page */
#define XLOG_HASH_SPLIT_CLEANUP 0xA0 /* clear split-cleanup flag in primary
* bucket page after deleting tuples
* that are moved due to split */
#define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after vacuum */
#define XLOG_HASH_VACUUM_ONE_PAGE 0xC0 /* remove dead tuples from index page */
typedef enum {
XLOG_HASH_INIT_META_PAGE_NUM = 0,
}XLogHashInitMetaPageEnum;
typedef enum {
XLOG_HASH_INIT_BITMAP_PAGE_BITMAP_NUM = 0,
XLOG_HASH_INIT_BITMAP_PAGE_META_NUM,
}XLogHashInitBitmapPageEnum;
typedef enum {
XLOG_HASH_INSERT_PAGE_NUM = 0,
XLOG_HASH_INSERT_META_NUM,
}XLogHashInsertEnum;
typedef enum {
XLOG_HASH_ADD_OVFL_PAGE_OVFL_NUM = 0,
XLOG_HASH_ADD_OVFL_PAGE_LEFT_NUM,
XLOG_HASH_ADD_OVFL_PAGE_MAP_NUM,
XLOG_HASH_ADD_OVFL_PAGE_NEWMAP_NUM,
XLOG_HASH_ADD_OVFL_PAGE_META_NUM,
}XLogHashAddOvflPageEnum;
typedef enum {
XLOG_HASH_SPLIT_ALLOCATE_PAGE_OBUK_NUM = 0,
XLOG_HASH_SPLIT_ALLOCATE_PAGE_NBUK_NUM,
XLOG_HASH_SPLIT_ALLOCATE_PAGE_META_NUM,
}XLogHashSplitAllocatePageEnum;
typedef enum {
XLOG_HASH_SPLIT_PAGE_NUM = 0,
}XLogHashSplitPageEnum;
typedef enum {
XLOG_HASH_SPLIT_COMPLETE_OBUK_NUM = 0,
XLOG_HASH_SPLIT_COMPLETE_NBUK_NUM,
}XLogHashSplitCompleteEnum;
typedef enum {
HASH_MOVE_BUK_BLOCK_NUM = 0,
HASH_MOVE_ADD_BLOCK_NUM,
HASH_MOVE_DELETE_OVFL_BLOCK_NUM,
}XLogHashMovePageEnum;
typedef enum {
HASH_SQUEEZE_BUK_BLOCK_NUM = 0,
HASH_SQUEEZE_ADD_BLOCK_NUM,
HASH_SQUEEZE_INIT_OVFLBUF_BLOCK_NUM,
HASH_SQUEEZE_UPDATE_PREV_BLOCK_NUM,
HASH_SQUEEZE_UPDATE_NEXT_BLOCK_NUM,
HASH_SQUEEZE_UPDATE_BITMAP_BLOCK_NUM,
HASH_SQUEEZE_UPDATE_META_BLOCK_NUM,
}XLogHashSqueezePageEnum;
typedef enum {
HASH_DELETE_BUK_BLOCK_NUM = 0,
HASH_DELETE_OVFL_BLOCK_NUM,
}XLogHashDeleteEnum;
typedef enum {
HASH_SPLIT_CLEANUP_BLOCK_NUM,
}XLogHashSplitCleanupEnum;
typedef enum {
HASH_UPDATE_META_BLOCK_NUM,
} XLogHashUpdateMateEnum;
typedef enum {
HASH_VACUUM_PAGE_BLOCK_NUM = 0,
HASH_VACUUM_META_BLOCK_NUM,
} XLogHashVacuumPageEnum;
/*
* xl_hash_split_allocate_page flag values, 8 bits are available.
*/
#define XLH_SPLIT_META_UPDATE_MASKS (1<<0)
#define XLH_SPLIT_META_UPDATE_SPLITPOINT (1<<1)
/*
* This is what we need to know about a HASH index create.
*
* Backup block 0: metapage
*/
typedef struct xl_hash_createidx
{
double num_tuples;
RegProcedure procid;
uint16 ffactor;
} xl_hash_createidx;
#define SizeOfHashCreateIdx (offsetof(xl_hash_createidx, ffactor) + sizeof(uint16))
/*
* This is what we need to know about simple (without split) insert.
*
* This data record is used for XLOG_HASH_INSERT
*
* Backup Blk 0: original page (data contains the inserted tuple)
* Backup Blk 1: metapage (HashMetaPageData)
*/
typedef struct xl_hash_insert
{
OffsetNumber offnum;
} xl_hash_insert;
#define SizeOfHashInsert (offsetof(xl_hash_insert, offnum) + sizeof(OffsetNumber))
/*
* This is what we need to know about addition of overflow page.
*
* This data record is used for XLOG_HASH_ADD_OVFL_PAGE
*
* Backup Blk 0: newly allocated overflow page
* Backup Blk 1: page before new overflow page in the bucket chain
* Backup Blk 2: bitmap page
* Backup Blk 3: new bitmap page
* Backup Blk 4: metapage
*/
typedef struct xl_hash_add_ovfl_page
{
uint16 bmsize;
bool bmpage_found;
} xl_hash_add_ovfl_page;
#define SizeOfHashAddOvflPage \
(offsetof(xl_hash_add_ovfl_page, bmpage_found) + sizeof(bool))
/*
* This is what we need to know about allocating a page for split.
*
* This data record is used for XLOG_HASH_SPLIT_ALLOCATE_PAGE
*
* Backup Blk 0: page for old bucket
* Backup Blk 1: page for new bucket
* Backup Blk 2: metapage
*/
typedef struct xl_hash_split_allocate_page
{
uint32 new_bucket;
uint16 old_bucket_flag;
uint16 new_bucket_flag;
uint8 flags;
} xl_hash_split_allocate_page;
#define SizeOfHashSplitAllocPage \
(offsetof(xl_hash_split_allocate_page, flags) + sizeof(uint8))
/*
* This is what we need to know about completing the split operation.
*
* This data record is used for XLOG_HASH_SPLIT_COMPLETE
*
* Backup Blk 0: page for old bucket
* Backup Blk 1: page for new bucket
*/
typedef struct xl_hash_split_complete
{
uint16 old_bucket_flag;
uint16 new_bucket_flag;
} xl_hash_split_complete;
#define SizeOfHashSplitComplete \
(offsetof(xl_hash_split_complete, new_bucket_flag) + sizeof(uint16))
/*
* This is what we need to know about move page contents required during
* squeeze operation.
*
* This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS
*
* Backup Blk 0: bucket page
* Backup Blk 1: page containing moved tuples
* Backup Blk 2: page from which tuples will be removed
*/
typedef struct xl_hash_move_page_contents
{
uint16 ntups;
bool is_prim_bucket_same_wrt; /* true if the page to which
* tuples are moved is same as
* primary bucket page */
} xl_hash_move_page_contents;
#define SizeOfHashMovePageContents \
(offsetof(xl_hash_move_page_contents, is_prim_bucket_same_wrt) + sizeof(bool))
/*
* This is what we need to know about the squeeze page operation.
*
* This data record is used for XLOG_HASH_SQUEEZE_PAGE
*
* Backup Blk 0: page containing tuples moved from freed overflow page
* Backup Blk 1: freed overflow page
* Backup Blk 2: page previous to the freed overflow page
* Backup Blk 3: page next to the freed overflow page
* Backup Blk 4: bitmap page containing info of freed overflow page
* Backup Blk 5: meta page
*/
typedef struct xl_hash_squeeze_page
{
BlockNumber prevblkno;
BlockNumber nextblkno;
uint16 ntups;
bool is_prim_bucket_same_wrt; /* true if the page to which
* tuples are moved is same as
* primary bucket page */
bool is_prev_bucket_same_wrt; /* true if the page to which
* tuples are moved is the page
* previous to the freed overflow
* page */
} xl_hash_squeeze_page;
#define SizeOfHashSqueezePage \
(offsetof(xl_hash_squeeze_page, is_prev_bucket_same_wrt) + sizeof(bool))
/*
* This is what we need to know about the deletion of index tuples from a page.
*
* This data record is used for XLOG_HASH_DELETE
*
* Backup Blk 0: primary bucket page
* Backup Blk 1: page from which tuples are deleted
*/
typedef struct xl_hash_delete
{
bool clear_dead_marking; /* true if this operation clears
* LH_PAGE_HAS_DEAD_TUPLES flag */
bool is_primary_bucket_page; /* true if the operation is for
* primary bucket page */
} xl_hash_delete;
#define SizeOfHashDelete \
(offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool))
/*
* This is what we need for metapage update operation.
*
* This data record is used for XLOG_HASH_UPDATE_META_PAGE
*
* Backup Blk 0: meta page
*/
typedef struct xl_hash_update_meta_page
{
double ntuples;
} xl_hash_update_meta_page;
#define SizeOfHashUpdateMetaPage \
(offsetof(xl_hash_update_meta_page, ntuples) + sizeof(double))
/*
* This is what we need to initialize metapage.
*
* This data record is used for XLOG_HASH_INIT_META_PAGE
*
* Backup Blk 0: meta page
*/
typedef struct xl_hash_init_meta_page
{
double num_tuples;
RegProcedure procid;
uint16 ffactor;
} xl_hash_init_meta_page;
#define SizeOfHashInitMetaPage \
(offsetof(xl_hash_init_meta_page, ffactor) + sizeof(uint16))
/*
* This is what we need to initialize bitmap page.
*
* This data record is used for XLOG_HASH_INIT_BITMAP_PAGE
*
* Backup Blk 0: bitmap page
* Backup Blk 1: meta page
*/
typedef struct xl_hash_init_bitmap_page
{
uint16 bmsize;
} xl_hash_init_bitmap_page;
#define SizeOfHashInitBitmapPage \
(offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16))
/*
* This is what we need for index tuple deletion and to
* update the meta page.
*
* This data record is used for XLOG_HASH_VACUUM_ONE_PAGE
*
* Backup Blk 0: bucket page
* Backup Blk 1: meta page
*/
typedef struct xl_hash_vacuum_one_page
{
RelFileNode hnode;
int ntuples;
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_hash_vacuum_one_page;
#define SizeOfHashVacuumOnePage \
(offsetof(xl_hash_vacuum_one_page, ntuples) + sizeof(int))
extern void hash_redo(XLogReaderState *record);
extern void hash_desc(StringInfo buf, XLogReaderState *record);
extern const char *hash_identify(uint8 info);
extern bool IsHashVacuumPages(XLogReaderState *record);
#endif /* HASH_XLOG_H */

View File

@ -754,6 +754,47 @@ void BtreeXlogUnlinkPageOperatorChildpage(RedoBufferInfo* cbuf, void* recorddata
void BtreeXlogClearIncompleteSplit(RedoBufferInfo* buffer);
void HashRedoInitMetaPageOperatorPage(RedoBufferInfo *metabuf, void *recorddata);
void HashRedoInitBitmapPageOperatorBitmapPage(RedoBufferInfo *bitmapbuf, void *recorddata);
void HashRedoInitBitmapPageOperatorMetaPage(RedoBufferInfo *metabuf);
void HashRedoInsertOperatorPage(RedoBufferInfo *buffer, void *recorddata, void *data, Size datalen);
void HashRedoInsertOperatorMetaPage(RedoBufferInfo *metabuf);
void HashRedoAddOvflPageOperatorOvflPage(RedoBufferInfo *ovflbuf, BlockNumber leftblk, void *data, Size datalen);
void HashRedoAddOvflPageOperatorLeftPage(RedoBufferInfo *ovflbuf, BlockNumber rightblk);
void HashRedoAddOvflPageOperatorMapPage(RedoBufferInfo *mapbuf, void *data);
void HashRedoAddOvflPageOperatorNewmapPage(RedoBufferInfo *newmapbuf, void *recorddata);
void HashRedoAddOvflPageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *data, Size datalen);
void HashRedoSplitAllocatePageOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata);
void HashRedoSplitAllocatePageOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata);
void HashRedoSplitAllocatePageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *blkdata);
void HashRedoSplitCompleteOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata);
void HashRedoSplitCompleteOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata);
void HashXlogMoveAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
void HashXlogMoveDeleteOvflPageOperatorPage(RedoBufferInfo *redobuffer, void *blkdata, Size len);
void HashXlogSqueezeAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
void HashXlogSqueezeInitOvflbufOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogSqueezeUpdatePrevPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogSqueezeUpdateNextPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogSqueezeUpdateBitmapOperatorPage(RedoBufferInfo *redobuffer, void *blkdata);
void HashXlogSqueezeUpdateMateOperatorPage(RedoBufferInfo *redobuffer, void *blkdata);
void HashXlogDeleteBlockOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
void HashXlogSplitCleanupOperatorPage(RedoBufferInfo *redobuffer);
void HashXlogUpdateMetaOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void HashXlogVacuumOnePageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, Size len);
void HashXlogVacuumMateOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
void XLogRecSetBlockCommonState(XLogReaderState* record, XLogBlockParseEnum blockvalid, ForkNumber forknum,
BlockNumber blockknum, RelFileNode* relnode, XLogRecParseState* recordblockstate);
@ -787,6 +828,7 @@ extern void XLogRecSetBlockDdlState(XLogBlockDdlParse* blockddlstate, uint32 blo
char *mainData, Oid ownerid = InvalidOid);
XLogRedoAction XLogCheckBlockDataRedoAction(XLogBlockDataParse* datadecode, RedoBufferInfo* bufferinfo);
void BtreeRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
extern void HashRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
XLogRecParseState* XactXlogCsnlogParseToBlock(XLogReaderState* record, uint32* blocknum, TransactionId xid,
int nsubxids, TransactionId* subxids, CommitSeqNo csn, XLogRecParseState* recordstatehead);
extern void XLogRecSetVmBlockState(XLogReaderState* record, uint32 blockid, XLogRecParseState* recordblockstate);
@ -914,5 +956,4 @@ extern void XLogBlockDdlDoSmgrAction(XLogBlockHead* blockhead, void* blockrecbod
extern void GinRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
extern void GistRedoDataBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, RedoBufferInfo *bufferinfo);
extern bool IsCheckPoint(const XLogRecParseState *parseState);
#endif

View File

@ -19,6 +19,7 @@
#include "utils/tuplesort.h"
#define DEFAULT_INDEX_TYPE "btree"
#define DEFAULT_HASH_INDEX_TYPE "hash"
#define DEFAULT_CSTORE_INDEX_TYPE "psort"
#define DEFAULT_GIST_INDEX_TYPE "gist"
#define CSTORE_BTREE_INDEX_TYPE "cbtree"

View File

@ -55,6 +55,7 @@ extern const uint32 RANGE_LIST_DISTRIBUTION_VERSION_NUM;
extern const uint32 FIX_SQL_ADD_RELATION_REF_COUNT;
extern const uint32 GENERATED_COL_VERSION_NUM;
extern const uint32 ANALYZER_HOOK_VERSION_NUM;
extern const uint32 SUPPORT_HASH_XLOG_VERSION_NUM;
#define INPLACE_UPGRADE_PRECOMMIT_VERSION 1

View File

@ -268,6 +268,7 @@ extern void LockBuffer(Buffer buffer, int mode);
extern bool ConditionalLockBuffer(Buffer buffer);
extern void LockBufferForCleanup(Buffer buffer);
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
extern bool IsBufferCleanupOK(Buffer buffer);
extern bool HoldingBufferPinThatDelaysRecovery(void);
extern void AsyncUnpinBuffer(volatile void* bufHdr, bool forgetBuffer);
extern void AsyncCompltrPinBuffer(volatile void* bufHdr);

View File

@ -472,6 +472,7 @@ extern Page PageGetTempPageCopySpecial(Page page, bool isbtree);
extern void PageRestoreTempPage(Page tempPage, Page oldPage);
extern void PageRepairFragmentation(Page page);
extern Size PageGetFreeSpace(Page page);
extern Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups);
extern Size PageGetExactFreeSpace(Page page);
extern Size PageGetHeapFreeSpace(Page page);
extern void PageIndexTupleDelete(Page page, OffsetNumber offset);

View File

@ -66,7 +66,8 @@ extern Tuplesortstate* tuplesort_begin_cluster(
extern Tuplesortstate* tuplesort_begin_index_btree(
Relation indexRel, bool enforceUnique, int workMem, bool randomAccess, int maxMem);
extern Tuplesortstate* tuplesort_begin_index_hash(
Relation indexRel, uint32 hash_mask, int workMem, bool randomAccess, int maxMem);
Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask, uint32 max_buckets,
int workMem, bool randomAccess, int maxMem);
extern Tuplesortstate* tuplesort_begin_datum(
Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, bool randomAccess);
#ifdef PGXC
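The reworked tuplesort_begin_index_hash() above now takes the heap relation plus high_mask, low_mask and max_buckets instead of a single hash_mask, so the sort can order index tuples by the bucket their hash value maps to before the index is populated. The sketch below shows one plausible way those three arguments could be derived from an estimated bucket count, modeled on PostgreSQL's hashsort.c; the exact openGauss derivation is not part of this hunk, so the helper and the numbers are illustrative only.

/* Standalone sketch: deriving high_mask / low_mask / max_buckets for the sort. */
#include <stdint.h>
#include <stdio.h>

/* Smallest i such that 2^i >= num (same idea as the hash AM's _hash_log2). */
static uint32_t hash_log2(uint32_t num)
{
    uint32_t i = 0;
    uint32_t limit = 1;
    while (limit < num) {
        limit <<= 1;
        i++;
    }
    return i;
}

int main(void)
{
    uint32_t num_buckets = 1000;    /* would be estimated from the heap size at build time */

    uint32_t high_mask = (((uint32_t) 1) << hash_log2(num_buckets + 1)) - 1;
    uint32_t low_mask = high_mask >> 1;
    uint32_t max_buckets = num_buckets - 1;

    /*
     * In the server these values, together with heapRel, indexRel, workMem,
     * randomAccess and maxMem, would be passed to tuplesort_begin_index_hash()
     * so that tuples arrive at the index build in bucket order.
     */
    printf("high_mask=%u low_mask=%u max_buckets=%u\n",
           high_mask, low_mask, max_buckets);
    return 0;
}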

View File

@ -354,7 +354,6 @@ NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "mytable_pkey" fo
-- ok
create index idx_gtt1_1 on gtt1 using btree (a);
create index idx_gtt1_2 on gtt1 using hash (a);
ERROR: access method "hash" does not support row store
create global temp table tmp_t0(c0 tsvector,c1 varchar(100));
create index idx_tmp_t0_1 on tmp_t0 using gin (c0);
create index idx_tmp_t0_2 on tmp_t0 using gist (c0);

View File

@ -0,0 +1,213 @@
--------------------------------
---------- hash index ----------
--------------------------------
set enable_seqscan = off;
set enable_indexscan = off;
------------------
-- hash_table_1 --
------------------
drop table if exists hash_table_1 cascade;
NOTICE: table "hash_table_1" does not exist, skipping
create table hash_table_1 (id int, name varchar, sex varchar default 'male');
insert into hash_table_1 values (1, 'Smith');
insert into hash_table_1 values (2, 'Jones');
insert into hash_table_1 values (3, 'Williams', 'female');
insert into hash_table_1 values (4, 'Taylor');
insert into hash_table_1 values (5, 'Brown');
insert into hash_table_1 values (6, 'Davies');
drop index if exists hash_t1_id1;
NOTICE: index "hash_t1_id1" does not exist, skipping
create index hash_t1_id1 on hash_table_1 using hash (id);
-- error, does not support multicolumn indexes
drop index if exists hash_t1_id2;
NOTICE: index "hash_t1_id2" does not exist, skipping
create index hash_t1_id2 on hash_table_1 using hash (id, sex);
ERROR: access method "hash" does not support multicolumn indexes
-- compare with hash_t1_id1 and hash_t1_id3: a hash index can be created on the same column
drop index if exists hash_t1_id3;
NOTICE: index "hash_t1_id3" does not exist, skipping
drop index if exists hash_t1_id4;
NOTICE: index "hash_t1_id4" does not exist, skipping
create index hash_t1_id3 on hash_table_1 using btree (id);
create index hash_t1_id4 on hash_table_1 using hash (id);
-- drop the superfluous indexes now
drop index hash_t1_id3, hash_t1_id4;
-- insert a large volume of data into hash_table_1
insert into hash_table_1 select 4, 'XXX', 'XXX' from generate_series(1,50000);
insert into hash_table_1 select 6, 'XXX', 'XXX' from generate_series(1,50000);
analyse hash_table_1;
-- after insert, hash_t1_id1 still works
explain(costs off) select * from hash_table_1 where id = 4;
QUERY PLAN
----------------------------------------
Bitmap Heap Scan on hash_table_1
Recheck Cond: (id = 4)
-> Bitmap Index Scan on hash_t1_id1
Index Cond: (id = 4)
(4 rows)
select count(*) from hash_table_1 where id = 6; --50001
count
-------
50001
(1 row)
-- do other DML actions, then check hash_t1_id1 again
insert into hash_table_1 select random()*100, 'XXX', 'XXX' from generate_series(1,50000);
update hash_table_1 set id = 101, sex = 'male' where id = 60;
delete from hash_table_1 where id = 80;
explain(costs off) select * from hash_table_1 where id = 101;
QUERY PLAN
----------------------------------------
Bitmap Heap Scan on hash_table_1
Recheck Cond: (id = 101)
-> Bitmap Index Scan on hash_t1_id1
Index Cond: (id = 101)
(4 rows)
-- cleanup env
drop table hash_table_1 cascade;
------------------
-- hash_table_2 --
------------------
drop table if exists hash_table_2 cascade;
NOTICE: table "hash_table_2" does not exist, skipping
create table hash_table_2 (id int, name varchar, sex varchar default 'male');
insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100000);
-- create index concurrently
-- In this fastcheck we only verify that the statement runs properly. In a real
-- situation, you should run this SQL in connection a first, then perform some DML
-- (insert, delete, update) operations on the table in connection b as soon
-- as possible. We expect the concurrent index creation not to block DML operations.
-- connection a
create index concurrently hash_t2_id1 on hash_table_2 using hash (id);
-- connection b
insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100);
explain(costs off) select * from hash_table_2 where id = 40;
QUERY PLAN
----------------------------------------
Bitmap Heap Scan on hash_table_2
Recheck Cond: (id = 40)
-> Bitmap Index Scan on hash_t2_id1
Index Cond: (id = 40)
(4 rows)
-- error, does not support unique indexes
create unique index hash_t2_id2 on hash_table_2 using hash (sex);
ERROR: access method "hash" does not support unique indexes
-- hash_t2_id2 (fillfactor=25) occupies more disk space than hash_t2_id3 (fillfactor=75)
create index hash_t2_id2 on hash_table_2 using hash (id) with (fillfactor=25);
create index hash_t2_id3 on hash_table_2 using hash (id) with (fillfactor=75);
select count(*) from hash_table_2; --100100
count
--------
100100
(1 row)
-- cleanup env
drop table hash_table_2 cascade;
------------------
-- hash_table_3 --
------------------
drop schema if exists hash_sc_3 cascade;
NOTICE: schema "hash_sc_3" does not exist, skipping
drop tablespace if exists hash_sp_3;
NOTICE: Tablespace "hash_sp_3" does not exist, skipping.
create schema hash_sc_3;
create tablespace hash_sp_3 relative location 'tablespace/tablespace_1';
create table hash_sc_3.hash_table_3
(
id int, name varchar,
sex varchar default 'male'
)
tablespace hash_sp_3;
-- create index, specifying schema and tablespace
create index concurrently hash_sc_3.hash_t3_id1 on hash_sc_3.hash_table_3 using hash (id);
create index hash_sc_3.hash_t3_id2 on hash_sc_3.hash_table_3 using hash (id) tablespace hash_sp_3;
drop table hash_sc_3.hash_table_3 cascade;
drop schema hash_sc_3 cascade;
drop tablespace hash_sp_3;
------------------
-- hash_table_4 --
------------------
drop table if exists hash_table_4 cascade;
NOTICE: table "hash_table_4" does not exist, skipping
create table hash_table_4
(
id int,
name varchar,
sex varchar default 'male'
)
partition by range(id)
(
partition p1 values less than (1000),
partition p2 values less than (2000),
partition p3 values less than (3000),
partition p4 values less than (maxvalue)
);
-- hash indexes only support local indexes on partitioned tables
drop index if exists hash_t4_id1;
NOTICE: index "hash_t4_id1" does not exist, skipping
drop index if exists hash_t4_id2;
NOTICE: index "hash_t4_id2" does not exist, skipping
drop index if exists hash_t4_id2_new;
NOTICE: index "hash_t4_id2_new" does not exist, skipping
create index hash_t4_id1 on hash_table_4 using hash(id) global;
ERROR: Global partition index only support btree.
create index hash_t4_id2 on hash_table_4 using hash(id) local
(
partition index_t4_p1,
partition index_t4_p2,
partition index_t4_p3,
partition index_t4_p4
);
-- alter index: rename, mark a partition unusable, then reindex it
insert into hash_table_4 select random()*5000, 'XXX', 'XXX' from generate_series(1,1000);
alter index hash_t4_id2 rename to hash_t4_id2_new;
alter index hash_t4_id2_new modify partition index_t4_p2 unusable;
reindex index hash_t4_id2_new partition index_t4_p2;
drop table hash_table_4 cascade;
------------------
-- hash_table_5 --
------------------
drop table if exists hash_table_5;
NOTICE: table "hash_table_5" does not exist, skipping
create temporary table hash_table_5(id int, name varchar, sex varchar default 'male');
drop index if exists hash_t5_id1;
NOTICE: index "hash_t5_id1" does not exist, skipping
create index hash_t5_id1 on hash_table_5 using hash(id) with(fillfactor = 80);
insert into hash_table_5 select random()*100, 'XXX', 'XXX' from generate_series(1,100);
update hash_table_5 set name = 'aaa' where id = 80;
alter index hash_t5_id1 set (fillfactor = 60);
alter index hash_t5_id1 RESET (fillfactor);
explain (costs off) select * from hash_table_5 where id = 80;
QUERY PLAN
----------------------------------------
Bitmap Heap Scan on hash_table_5
Recheck Cond: (id = 80)
-> Bitmap Index Scan on hash_t5_id1
Index Cond: (id = 80)
(4 rows)
drop table hash_table_5 cascade;
------------------
-- hash_table_6 --
------------------
drop table if exists hash_table_6;
NOTICE: table "hash_table_6" does not exist, skipping
create global temporary table hash_table_6(id int, name varchar, sex varchar default 'male');
drop index if exists hash_t6_id1;
NOTICE: index "hash_t6_id1" does not exist, skipping
create index hash_t6_id1 on hash_table_6 using hash((id*10)) with (fillfactor = 30);
insert into hash_table_6 select random()*100, 'XXX', 'XXX' from generate_series(1,1000);
delete from hash_table_6 where id in (50, 60, 70);
explain (costs off) select * from hash_table_6 where id*10 = 80;
QUERY PLAN
----------------------------------------
Bitmap Heap Scan on hash_table_6
Recheck Cond: ((id * 10) = 80)
-> Bitmap Index Scan on hash_t6_id1
Index Cond: ((id * 10) = 80)
(4 rows)
drop table hash_table_6 cascade;

View File

@ -261,11 +261,8 @@ INTERVAL ('1 month')
);
NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "interval_partition_table_003_pkey" for table "interval_partition_table_003"
create index interval_partition_table_003_1 ON interval_partition_table_003 USING HASH (logdate) LOCAL;
ERROR: access method "hash" does not support row store
create index interval_partition_table_003_2 ON interval_partition_table_003 USING HASH (c2) LOCAL;
ERROR: access method "hash" does not support row store
create index interval_partition_table_003_3 ON interval_partition_table_003 USING HASH (c1) LOCAL;
ERROR: access method "hash" does not support row store
select relname from pg_partition where INDEXTBLID=(select RELFILENODE from pg_partition where relname='interval_partition_table_003_1') order by 1;
relname
---------

View File

@ -41,7 +41,6 @@ SELECT * FROM macaddr_data ORDER BY a;
CREATE INDEX macaddr_data_btree ON macaddr_data USING btree (b);
CREATE INDEX macaddr_data_hash ON macaddr_data USING hash (b);
ERROR: access method "hash" does not support row store
SELECT a, b, trunc(b) FROM macaddr_data ORDER BY 2, 1;
a | b | trunc
----+-------------------+-------------------

View File

@ -362,7 +362,6 @@ DROP INDEX enumtest_btree;
-- Hash index / opclass with the = operator
--
CREATE INDEX enumtest_hash ON enumtest USING hash (col);
ERROR: access method "hash" does not support row store
SELECT * FROM enumtest WHERE col = 'orange';
col
--------
@ -370,7 +369,6 @@ SELECT * FROM enumtest WHERE col = 'orange';
(1 row)
DROP INDEX enumtest_hash;
ERROR: index "enumtest_hash" does not exist
--
-- End index tests
--

View File

@ -41,7 +41,6 @@ SELECT * FROM macaddr_data;
CREATE INDEX macaddr_data_btree ON macaddr_data USING btree (b);
CREATE INDEX macaddr_data_hash ON macaddr_data USING hash (b);
ERROR: access method "hash" does not support row store
SELECT a, b, trunc(b) FROM macaddr_data ORDER BY 2, 1;
a | b | trunc
----+-------------------+-------------------

View File

@ -120,7 +120,6 @@ SELECT COUNT(*) FROM guid1 WHERE guid_field >= '22222222-2222-2222-2222-22222222
-- btree and hash index creation test
CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field);
CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field);
ERROR: access method "hash" does not support row store
-- unique index test
CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field);
-- should fail
@ -131,7 +130,7 @@ DETAIL: Key (guid_field)=(11111111-1111-1111-1111-111111111111) already exists.
SELECT count(*) FROM pg_class WHERE relkind='i' AND relname LIKE 'guid%';
count
-------
2
3
(1 row)
-- populating the test tables with additional records

View File

@ -120,7 +120,6 @@ SELECT COUNT(*) FROM guid1 WHERE guid_field >= '22222222-2222-2222-2222-22222222
-- btree and hash index creation test
CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field);
CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field);
ERROR: access method "hash" does not support row store
-- unique index test
CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field);
-- should fail
@ -131,7 +130,7 @@ DETAIL: Key (guid_field)=(11111111-1111-1111-1111-111111111111) already exists.
SELECT count(*) FROM pg_class WHERE relkind='i' AND relname LIKE 'guid%';
count
-------
2
3
(1 row)
-- populating the test tables with additional records

View File

@ -110,6 +110,7 @@ test: single_node_random
#test: single_node_portals
#test: single_node_arrays
#test: single_node_btree_index single_node_hash_index single_node_update
test: hash_index_001
test: single_node_update
#test single_node_namespace
#test: single_node_prepared_xacts

View File

@ -0,0 +1,169 @@
--------------------------------
---------- hash index ----------
--------------------------------
set enable_seqscan = off;
set enable_indexscan = off;
------------------
-- hash_table_1 --
------------------
drop table if exists hash_table_1 cascade;
create table hash_table_1 (id int, name varchar, sex varchar default 'male');
insert into hash_table_1 values (1, 'Smith');
insert into hash_table_1 values (2, 'Jones');
insert into hash_table_1 values (3, 'Williams', 'female');
insert into hash_table_1 values (4, 'Taylor');
insert into hash_table_1 values (5, 'Brown');
insert into hash_table_1 values (6, 'Davies');
drop index if exists hash_t1_id1;
create index hash_t1_id1 on hash_table_1 using hash (id);
-- error, does not support multicolumn indexes
drop index if exists hash_t1_id2;
create index hash_t1_id2 on hash_table_1 using hash (id, sex);
-- compare with hash_t1_id1 and hash_t1_id3: a hash index can be created on the same column
drop index if exists hash_t1_id3;
drop index if exists hash_t1_id4;
create index hash_t1_id3 on hash_table_1 using btree (id);
create index hash_t1_id4 on hash_table_1 using hash (id);
-- drop the superfluous indexes now
drop index hash_t1_id3, hash_t1_id4;
-- insert a large volume of data into hash_table_1
insert into hash_table_1 select 4, 'XXX', 'XXX' from generate_series(1,50000);
insert into hash_table_1 select 6, 'XXX', 'XXX' from generate_series(1,50000);
analyse hash_table_1;
-- after insert, hash_t1_id1 still works
explain(costs off) select * from hash_table_1 where id = 4;
select count(*) from hash_table_1 where id = 6; --50001
-- do other DML actions, then check hash_t1_id1 again
insert into hash_table_1 select random()*100, 'XXX', 'XXX' from generate_series(1,50000);
update hash_table_1 set id = 101, sex = 'male' where id = 60;
delete from hash_table_1 where id = 80;
explain(costs off) select * from hash_table_1 where id = 101;
-- cleanup env
drop table hash_table_1 cascade;
------------------
-- hash_table_2 --
------------------
drop table if exists hash_table_2 cascade;
create table hash_table_2 (id int, name varchar, sex varchar default 'male');
insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100000);
-- create index concurrently
-- In this fastcheck we only verify that the statement runs properly. In a real
-- situation, you should run this SQL in connection a first, then perform some DML
-- (insert, delete, update) operations on the table in connection b as soon
-- as possible. We expect the concurrent index creation not to block DML operations.
-- connection a
create index concurrently hash_t2_id1 on hash_table_2 using hash (id);
-- connection b
insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100);
explain(costs off) select * from hash_table_2 where id = 40;
-- error, does not support unique indexes
create unique index hash_t2_id2 on hash_table_2 using hash (sex);
-- hash_t2_id2 (fillfactor=25) occupies more disk space than hash_t2_id3 (fillfactor=75)
create index hash_t2_id2 on hash_table_2 using hash (id) with (fillfactor=25);
create index hash_t2_id3 on hash_table_2 using hash (id) with (fillfactor=75);
select count(*) from hash_table_2; --100100
-- cleanup env
drop table hash_table_2 cascade;
------------------
-- hash_table_3 --
------------------
drop schema if exists hash_sc_3 cascade;
drop tablespace if exists hash_sp_3;
create schema hash_sc_3;
create tablespace hash_sp_3 relative location 'tablespace/tablespace_1';
create table hash_sc_3.hash_table_3
(
id int, name varchar,
sex varchar default 'male'
)
tablespace hash_sp_3;
-- create index, specifying schema and tablespace
create index concurrently hash_sc_3.hash_t3_id1 on hash_sc_3.hash_table_3 using hash (id);
create index hash_sc_3.hash_t3_id2 on hash_sc_3.hash_table_3 using hash (id) tablespace hash_sp_3;
drop table hash_sc_3.hash_table_3 cascade;
drop schema hash_sc_3 cascade;
drop tablespace hash_sp_3;
------------------
-- hash_table_4 --
------------------
drop table if exists hash_table_4 cascade;
create table hash_table_4
(
id int,
name varchar,
sex varchar default 'male'
)
partition by range(id)
(
partition p1 values less than (1000),
partition p2 values less than (2000),
partition p3 values less than (3000),
partition p4 values less than (maxvalue)
);
-- hash indexes only support local indexes on partitioned tables
drop index if exists hash_t4_id1;
drop index if exists hash_t4_id2;
drop index if exists hash_t4_id2_new;
create index hash_t4_id1 on hash_table_4 using hash(id) global;
create index hash_t4_id2 on hash_table_4 using hash(id) local
(
partition index_t4_p1,
partition index_t4_p2,
partition index_t4_p3,
partition index_t4_p4
);
-- alter index: rename, mark a partition unusable, then reindex it
insert into hash_table_4 select random()*5000, 'XXX', 'XXX' from generate_series(1,1000);
alter index hash_t4_id2 rename to hash_t4_id2_new;
alter index hash_t4_id2_new modify partition index_t4_p2 unusable;
reindex index hash_t4_id2_new partition index_t4_p2;
drop table hash_table_4 cascade;
------------------
-- hash_table_5 --
------------------
drop table if exists hash_table_5;
create temporary table hash_table_5(id int, name varchar, sex varchar default 'male');
drop index if exists hash_t5_id1;
create index hash_t5_id1 on hash_table_5 using hash(id) with(fillfactor = 80);
insert into hash_table_5 select random()*100, 'XXX', 'XXX' from generate_series(1,100);
update hash_table_5 set name = 'aaa' where id = 80;
alter index hash_t5_id1 set (fillfactor = 60);
alter index hash_t5_id1 RESET (fillfactor);
explain (costs off) select * from hash_table_5 where id = 80;
drop table hash_table_5 cascade;
------------------
-- hash_table_6 --
------------------
drop table if exists hash_table_6;
create global temporary table hash_table_6(id int, name varchar, sex varchar default 'male');
drop index if exists hash_t6_id1;
create index hash_t6_id1 on hash_table_6 using hash((id*10)) with (fillfactor = 30);
insert into hash_table_6 select random()*100, 'XXX', 'XXX' from generate_series(1,1000);
delete from hash_table_6 where id in (50, 60, 70);
explain (costs off) select * from hash_table_6 where id*10 = 80;
drop table hash_table_6 cascade;