!1236 openGauss support hash index
Merge pull request !1236 from 陈栋/hash_index
commit 010fdac4e4
@@ -13,6 +13,7 @@
#include "access/gin.h"
#include "access/gist_private.h"
#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/nbtree.h"

@@ -363,7 +363,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b
Page page;
OffsetNumber maxoff;

_hash_getlock(rel, blkno, HASH_SHARE);
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy);
page = BufferGetPage(buf);

@@ -390,7 +389,6 @@ static void pgstat_hash_page(pgstattuple_type* stat, Relation rel, BlockNumber b
}

_hash_relbuf(rel, buf);
_hash_droplock(rel, blkno, HASH_SHARE);
}

/*

@@ -3353,12 +3353,21 @@ IndexStmt* transformIndexStmt(Oid relid, IndexStmt* stmt, const char* queryStrin

if (!isColStore && (0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIN_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIST_INDEX_TYPE))) {
/* row store only support btree/gin/gist index */
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_GIST_INDEX_TYPE)) &&
(0 != pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE))) {
/* row store only support btree/gin/gist/hash index */
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("access method \"%s\" does not support row store", stmt->accessMethod)));
}

if (0 == pg_strcasecmp(stmt->accessMethod, DEFAULT_HASH_INDEX_TYPE) &&
t_thrd.proc->workingVersionNum < SUPPORT_HASH_XLOG_VERSION_NUM) {
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("access method \"%s\" does not support row store", stmt->accessMethod)));
}

if (isColStore && (!isPsortMothed && !isCBtreeMethod && !isCGinBtreeMethod)) {
/* column store support psort/cbtree/gin index */
ereport(ERROR,

@@ -59,7 +59,7 @@ bool open_join_children = true;
bool will_shutdown = false;

/* hard-wired binary version number */
const uint32 GRAND_VERSION_NUM = 92308;
const uint32 GRAND_VERSION_NUM = 92309;

const uint32 MATVIEW_VERSION_NUM = 92213;
const uint32 PARTIALPUSH_VERSION_NUM = 92087;

@@ -79,6 +79,7 @@ const uint32 ML_OPT_MODEL_VERSION_NUM = 92284;
const uint32 FIX_SQL_ADD_RELATION_REF_COUNT = 92291;
const uint32 GENERATED_COL_VERSION_NUM = 92303;
const uint32 ANALYZER_HOOK_VERSION_NUM = 92306;
const uint32 SUPPORT_HASH_XLOG_VERSION_NUM = 92309;

/* This variable indicates wheather the instance is in progress of upgrade as a whole */
uint32 volatile WorkingGrandVersionNum = GRAND_VERSION_NUM;

@@ -389,9 +389,6 @@ static void ResourceOwnerReleaseInternal(
MemoryContextDelete(memContext);
ResourceOwnerForgetGMemContext(t_thrd.utils_cxt.TopTransactionResourceOwner, memContext);
}

/* Clean up index scans too */
ReleaseResources_hash();
}

/* Let add-on modules get a chance too */

@@ -109,6 +109,7 @@
#include <limits.h>

#include "access/nbtree.h"
#include "access/hash.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "commands/tablespace.h"

@@ -389,6 +390,7 @@ struct Tuplesortstate {
* These variables are specific to the IndexTuple case; they are set by
* tuplesort_begin_index_xxx and used only by the IndexTuple routines.
*/
Relation heapRel; /* table the index is being built on */
Relation indexRel; /* index being built */

/* These are specific to the index_btree subcase: */

@@ -396,7 +398,9 @@ struct Tuplesortstate {
bool enforceUnique; /* complain if we find duplicate tuples */

/* These are specific to the index_hash subcase: */
uint32 hash_mask; /* mask for sortable part of hash code */
uint32 high_mask; /* masks for sortable part of hash code */
uint32 low_mask;
uint32 max_buckets;

/*
* These variables are specific to the Datum case; they are set by

@@ -917,7 +921,8 @@ Tuplesortstate* tuplesort_begin_index_btree(
}

Tuplesortstate* tuplesort_begin_index_hash(
Relation indexRel, uint32 hash_mask, int workMem, bool randomAccess, int maxMem)
Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask,
uint32 max_buckets, int workMem, bool randomAccess, int maxMem)
{
Tuplesortstate* state = tuplesort_begin_common(workMem, randomAccess);
MemoryContext oldcontext;

@@ -927,11 +932,12 @@ Tuplesortstate* tuplesort_begin_index_hash(
#ifdef TRACE_SORT
if (u_sess->attr.attr_common.trace_sort) {
elog(LOG,
"begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c, maxMem = %d",
hash_mask,
workMem,
randomAccess ? 't' : 'f',
maxMem);
"begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
"max_buckets = 0x%x, workMem = %d, randomAccess = %c",
high_mask,
low_mask,
max_buckets,
workMem, randomAccess ? 't' : 'f');
}
#endif

@@ -946,9 +952,12 @@ Tuplesortstate* tuplesort_begin_index_hash(
#endif
state->reversedirection = reversedirection_index_hash;

state->heapRel = heapRel;
state->indexRel = indexRel;

state->hash_mask = hash_mask;
state->high_mask = high_mask;
state->low_mask = low_mask;
state->max_buckets = max_buckets;
state->maxMem = maxMem * 1024L;

(void)MemoryContextSwitchTo(oldcontext);

@@ -3610,8 +3619,8 @@ static int comparetup_index_btree(const SortTuple* a, const SortTuple* b, Tuples

static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tuplesortstate* state)
{
uint32 hash1;
uint32 hash2;
Bucket bucket1;
Bucket bucket2;
IndexTuple tuple1;
IndexTuple tuple2;

@@ -3620,13 +3629,17 @@ static int comparetup_index_hash(const SortTuple* a, const SortTuple* b, Tupleso
* that the first column of the index tuple is the hash key.
*/
Assert(!a->isnull1);
hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
state->max_buckets, state->high_mask,
state->low_mask);
Assert(!b->isnull1);
hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;
bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
state->max_buckets, state->high_mask,
state->low_mask);

if (hash1 > hash2) {
if (bucket1 > bucket2) {
return 1;
} else if (hash1 < hash2) {
} else if (bucket1 < bucket2) {
return -1;
}

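The comparison above orders spooled tuples by their target bucket rather than by a simple masked hash code. For illustration, the mapping follows the same idea as _hash_hashkey2bucket: apply the high mask, and fall back to the low mask whenever the result points past the highest bucket that currently exists. A minimal self-contained sketch (plain C; hashkey2bucket and the sample mask values are invented for this illustration, not taken from the source):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t Bucket;

    /* Illustrative restatement of the bucket-mapping logic: keep the bits
     * selected by highmask; if that lands past the last bucket that exists,
     * drop back to lowmask (i.e. to the smaller, pre-split table size). */
    static Bucket hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
                                 uint32_t highmask, uint32_t lowmask)
    {
        Bucket bucket = hashkey & highmask;
        if (bucket > maxbucket)
            bucket = bucket & lowmask;
        return bucket;
    }

    int main(void)
    {
        /* Example: 6 buckets exist (0..5), so highmask = 7, lowmask = 3. */
        uint32_t maxbucket = 5, highmask = 7, lowmask = 3;
        for (uint32_t h = 0; h < 8; h++)
            printf("hash %u -> bucket %u\n", (unsigned)h,
                   (unsigned)hashkey2bucket(h, maxbucket, highmask, lowmask));
        return 0;
    }

Sorting the spooled tuples by this bucket number keeps the sorted build loading buckets strictly in order even when the current number of buckets is not a power of two.
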
@@ -9,7 +9,7 @@ ifneq "$(MAKECMDGOALS)" "clean"
endif
endif
endif
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashscan.o \
hashsearch.o hashsort.o hashutil.o
OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o\
hashsort.o hashutil.o hash_xlog.o

include $(top_srcdir)/src/gausskernel/common.mk

@@ -58,35 +58,51 @@ rules to support a variable number of overflow pages while not having to
move primary bucket pages around after they are created.

Primary bucket pages (henceforth just "bucket pages") are allocated in
power-of-2 groups, called "split points" in the code. Buckets 0 and 1
are created when the index is initialized. At the first split, buckets 2
and 3 are allocated; when bucket 4 is needed, buckets 4-7 are allocated;
when bucket 8 is needed, buckets 8-15 are allocated; etc. All the bucket
pages of a power-of-2 group appear consecutively in the index. This
addressing scheme allows the physical location of a bucket page to be
computed from the bucket number relatively easily, using only a small
amount of control information. We take the log2() of the bucket number
to determine which split point S the bucket belongs to, and then simply
add "hashm_spares[S] + 1" (where hashm_spares[] is an array stored in the
metapage) to compute the physical address. hashm_spares[S] can be
interpreted as the total number of overflow pages that have been allocated
before the bucket pages of splitpoint S. hashm_spares[0] is always 0,
so that buckets 0 and 1 (which belong to splitpoint 0) always appear at
block numbers 1 and 2, just after the meta page. We always have
hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow
pages appearing between the bucket page groups of splitpoints N and N+1.

power-of-2 groups, called "split points" in the code. That means at every new
splitpoint we double the existing number of buckets. Allocating huge chunks
of bucket pages all at once isn't optimal and we will take ages to consume
those. To avoid this exponential growth of index size, we did use a trick to
break up allocation of buckets at the splitpoint into 4 equal phases. If
(2 ^ x) are the total buckets need to be allocated at a splitpoint (from now on
we shall call this as a splitpoint group), then we allocate 1/4th (2 ^ (x - 2))
of total buckets at each phase of splitpoint group. Next quarter of allocation
will only happen if buckets of the previous phase have been already consumed.
For the initial splitpoint groups < 10 we will allocate all of their buckets in
single phase only, as number of buckets allocated at initial groups are small
in numbers. And for the groups >= 10 the allocation process is distributed
among four equal phases. At group 10 we allocate (2 ^ 9) buckets in 4
different phases {2 ^ 7, 2 ^ 7, 2 ^ 7, 2 ^ 7}, the numbers in curly braces
indicate the number of buckets allocated within each phase of splitpoint group
10. And, for splitpoint group 11 and 12 allocation phases will be
{2 ^ 8, 2 ^ 8, 2 ^ 8, 2 ^ 8} and {2 ^ 9, 2 ^ 9, 2 ^ 9, 2 ^ 9} respectively. We
can see that at each splitpoint group we double the total number of buckets
from the previous group but in an incremental phase. The bucket pages
allocated within one phase of a splitpoint group will appear consecutively in
the index. This addressing scheme allows the physical location of a bucket
page to be computed from the bucket number relatively easily, using only a
small amount of control information. If we look at the function
_hash_spareindex for a given bucket number we first compute the
splitpoint group it belongs to and then the phase to which the bucket belongs
to. Adding them we get the global splitpoint phase number S to which the
bucket belongs and then simply add "hashm_spares[S] + 1" (where hashm_spares[]
is an array stored in the metapage) with given bucket number to compute its
physical address. The hashm_spares[S] can be interpreted as the total number
of overflow pages that have been allocated before the bucket pages of
splitpoint phase S. The hashm_spares[0] is always 0, so that buckets 0 and 1
always appear at block numbers 1 and 2, just after the meta page. We always
have hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the
former. The difference between the two represents the number of overflow pages
appearing between the bucket page groups of splitpoints phase N and N+1.
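As a rough illustration of the phase arithmetic described above, the following self-contained sketch (plain C; splitpoint_phase, ceil_log2 and the constant 10 are stand-ins for the real _hash_spareindex machinery, not the actual source) maps a bucket number to its global splitpoint phase:

    #include <stdint.h>
    #include <stdio.h>

    /* Groups below 10 are allocated in a single phase; groups >= 10 are
     * allocated in 4 equal phases, as described in the README text above. */
    static uint32_t ceil_log2(uint32_t n)
    {
        uint32_t v = 0;
        while (((uint32_t)1 << v) < n)
            v++;
        return v;
    }

    static uint32_t splitpoint_phase(uint32_t bucket)
    {
        const uint32_t one_phase_groups = 10;
        uint32_t group = ceil_log2(bucket + 1);   /* splitpoint group of bucket */

        if (group < one_phase_groups)
            return group;                         /* whole group in one phase */

        /* phases consumed by earlier groups: one each for groups 0..9,
         * then four per later multi-phase group */
        uint32_t phase = one_phase_groups + (group - one_phase_groups) * 4;

        /* which quarter of the current group this bucket falls into (0..3) */
        uint32_t group_start = (uint32_t)1 << (group - 1);
        uint32_t quarter = group_start / 4;
        return phase + (bucket - group_start) / quarter;
    }

    int main(void)
    {
        /* bucket 600 is in group 10, first quarter; bucket 900 in the fourth */
        printf("bucket 600 -> phase %u\n", (unsigned)splitpoint_phase(600));
        printf("bucket 900 -> phase %u\n", (unsigned)splitpoint_phase(900));
        return 0;
    }

hashm_spares[] is indexed by this phase number, so only one phase's worth of bucket pages ever needs to be physically allocated at a time.
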
(Note: the above describes what happens when filling an initially minimally
sized hash index. In practice, we try to estimate the required index size
and allocate a suitable number of splitpoints immediately, to avoid
sized hash index. In practice, we try to estimate the required index size and
allocate a suitable number of splitpoints phases immediately, to avoid
expensive re-splitting during initial index build.)

When S splitpoints exist altogether, the array entries hashm_spares[0]
through hashm_spares[S] are valid; hashm_spares[S] records the current
total number of overflow pages. New overflow pages are created as needed
at the end of the index, and recorded by incrementing hashm_spares[S].
When it is time to create a new splitpoint's worth of bucket pages, we
When it is time to create a new splitpoint phase's worth of bucket pages, we
copy hashm_spares[S] into hashm_spares[S+1] and increment S (which is
stored in the hashm_ovflpoint field of the meta page). This has the
effect of reserving the correct number of bucket pages at the end of the

@@ -101,7 +117,7 @@ We have to allow the case "greater than" because it's possible that during
an index extension we crash after allocating filesystem space and before
updating the metapage. Note that on filesystems that allow "holes" in
files, it's entirely likely that pages before the logical EOF are not yet
allocated: when we allocate a new splitpoint's worth of bucket pages, we
allocated: when we allocate a new splitpoint phase's worth of bucket pages, we
physically zero the last such page to force the EOF up, and the first such
page will be used immediately, but the intervening pages are not written
until needed.

@@ -126,61 +142,98 @@ the initially created buckets.

Lock Definitions
----------------

We use both lmgr locks ("heavyweight" locks) and buffer context locks
(LWLocks) to control access to a hash index. lmgr locks are needed for
long-term locking since there is a (small) risk of deadlock, which we must
be able to detect. Buffer context locks are used for short-term access
control to individual pages of the index.
Concurrency control for hash indexes is provided using buffer content
locks, buffer pins, and cleanup locks. Here as elsewhere in PostgreSQL,
cleanup lock means that we hold an exclusive lock on the buffer and have
observed at some point after acquiring the lock that we hold the only pin
on that buffer. For hash indexes, a cleanup lock on a primary bucket page
represents the right to perform an arbitrary reorganization of the entire
bucket. Therefore, scans retain a pin on the primary bucket page for the
bucket they are currently scanning. Splitting a bucket requires a cleanup
lock on both the old and new primary bucket pages. VACUUM therefore takes
a cleanup lock on every bucket page in order to remove tuples. It can also
remove tuples copied to a new bucket by any previous split operation, because
the cleanup lock taken on the primary bucket page guarantees that no scans
which started prior to the most recent split can still be in progress. After
cleaning each page individually, it attempts to take a cleanup lock on the
primary bucket page in order to "squeeze" the bucket down to the minimum
possible number of pages.

We define the following lmgr locks for a hash index:
To avoid deadlocks, we must be consistent about the lock order in which we
lock the buckets for operations that requires locks on two different buckets.
We choose to always lock the lower-numbered bucket first. The metapage is
only ever locked after all bucket locks have been taken.

LockPage(rel, 0) represents the right to modify the hash-code-to-bucket
mapping. A process attempting to enlarge the hash table by splitting a
bucket must exclusive-lock this lock before modifying the metapage data
representing the mapping. Processes intending to access a particular
bucket must share-lock this lock until they have acquired lock on the
correct target bucket.

LockPage(rel, page), where page is the page number of a hash bucket page,
represents the right to split or compact an individual bucket. A process
splitting a bucket must exclusive-lock both old and new halves of the
bucket until it is done. A process doing VACUUM must exclusive-lock the
bucket it is currently purging tuples from. Processes doing scans or
insertions must share-lock the bucket they are scanning or inserting into.
(It is okay to allow concurrent scans and insertions.)
Metapage Caching
----------------

The lmgr lock IDs corresponding to overflow pages are currently unused.
These are available for possible future refinements.
Both scanning the index and inserting tuples require locating the bucket
where a given tuple ought to be located. To do this, we need the bucket
count, highmask, and lowmask from the metapage; however, it's undesirable
for performance reasons to have to lock and pin the metapage for
every such operation. Instead, we retain a cached copy of the metapage
in each backend's relcache entry. This will produce the correct
bucket mapping as long as the target bucket hasn't been split since the
last cache refresh.

Note that these lock definitions are conceptually distinct from any sort
of lock on the pages whose numbers they share. A process must also obtain
read or write buffer lock on the metapage or bucket page before accessing
said page.
To guard against the possibility that such a split has occurred, the
primary page of each bucket chain stores the number of buckets that
existed as of the time the bucket was last split, or if never split as
of the time it was created, in the space normally used for the
previous block number (that is, hasho_prevblkno). This doesn't cost
anything because the primary bucket page is always the first page in
the chain, and the previous block number is therefore always, in
reality, InvalidBlockNumber.

Processes performing hash index scans must hold share lock on the bucket
they are scanning throughout the scan. This seems to be essential, since
there is no reasonable way for a scan to cope with its bucket being split
underneath it. This creates a possibility of deadlock external to the
hash index code, since a process holding one of these locks could block
waiting for an unrelated lock held by another process. If that process
then does something that requires exclusive lock on the bucket, we have
deadlock. Therefore the bucket locks must be lmgr locks so that deadlock
can be detected and recovered from. This also forces the page-zero lock
to be an lmgr lock, because as we'll see below it is held while attempting
to acquire a bucket lock, and so it could also participate in a deadlock.
After computing the ostensibly-correct bucket number based on our cached
copy of the metapage, we lock the corresponding primary bucket page and
check whether the bucket count stored in hasho_prevblkno is greater than
the number of buckets stored in our cached copy of the metapage. If
so, the bucket has certainly been split, because the stored count must
originally have been less than the number of buckets that existed at that
time and can't have increased except due to a split. If not, the bucket
can't have been split, because a split would have created a new bucket with
a higher bucket number than any we'd seen previously. In the latter case,
we've locked the correct bucket and can proceed; in the former case, we must
release the lock on this bucket, lock the metapage, update our cache,
unlock the metapage, and retry.
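The staleness test described above boils down to a single comparison between two bucket counts. A hedged sketch (plain C, not the server code; the function and parameter names are invented here):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Decide whether the bucket we just locked can be trusted, given the
     * bucket count the primary bucket page stores in hasho_prevblkno and the
     * bucket count recorded in our cached copy of the metapage. */
    static bool cached_mapping_is_stale(uint32_t stored_bucket_count,
                                        uint32_t cached_bucket_count)
    {
        /* The page records how many buckets existed when this bucket last
         * split (or was created).  If that exceeds what our cache knows, at
         * least one split happened after the cache was taken: refresh and
         * recompute the target bucket, then retry. */
        return stored_bucket_count > cached_bucket_count;
    }

    int main(void)
    {
        /* cache taken when 4 buckets existed; the locked page says 8 existed
         * at its last split, so the cache is stale and we must retry */
        printf("stale? %s\n", cached_mapping_is_stale(8, 4) ? "yes" : "no");
        printf("stale? %s\n", cached_mapping_is_stale(4, 4) ? "yes" : "no");
        return 0;
    }
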
Processes must obtain read (share) buffer context lock on any hash index
page while reading it, and write (exclusive) lock while modifying it.
To prevent deadlock we enforce these coding rules: no buffer lock may be
held long term (across index AM calls), nor may any buffer lock be held
while waiting for an lmgr lock, nor may more than one buffer lock
be held at a time by any one process. (The third restriction is probably
stronger than necessary, but it makes the proof of no deadlock obvious.)
Needing to retry occasionally might seem expensive, but the number of times
any given bucket can be split is limited to a few dozen no matter how
many times the hash index is accessed, because the total number of
buckets is limited to less than 2^32. On the other hand, the number of
times we access a bucket is unbounded and will be several orders of
magnitude larger even in unsympathetic cases.

(The metapage cache is new in v10. Older hash indexes had the primary
bucket page's hasho_prevblkno initialized to InvalidBuffer.)

Pseudocode Algorithms
---------------------

Various flags that are used in hash index operations are described as below:

The bucket-being-split and bucket-being-populated flags indicate that a
split operation is in progress for a bucket. During split operation, a
bucket-being-split flag is set on the old bucket and bucket-being-populated
flag is set on new bucket. These flags are cleared once the split operation
is finished.

The split-cleanup flag indicates that a bucket which has been recently split
still contains tuples that were also copied to the new bucket; it essentially
marks the split as incomplete. Once we're certain that no scans which
started before the new bucket was fully populated are still in progress, we
can remove the copies from the old bucket and clear the flag. We insist that
this flag must be clear before splitting a bucket; thus, a bucket can't be
split again until the previous split is totally complete.

The moved-by-split flag on a tuple indicates that tuple is moved from old to
new bucket. Concurrent scans will skip such tuples until the split operation
is finished. Once the tuple is marked as moved-by-split, it will remain so
forever but that does no harm. We have intentionally not cleared it as that
can generate an additional I/O which is not necessary.

The operations we need to support are: readers scanning the index for
entries of a particular hash code (which by definition are all in the same
bucket); insertion of a new tuple into the correct bucket; enlarging the

@@ -195,57 +248,75 @@ track of available overflow pages.

The reader algorithm is:

share-lock page 0 (to prevent active split)
read/sharelock meta page
compute bucket number for target hash key
release meta page
share-lock bucket page (to prevent split/compact of this bucket)
release page 0 share-lock
lock the primary bucket page of the target bucket
if the target bucket is still being populated by a split:
release the buffer content lock on current bucket page
pin and acquire the buffer content lock on old bucket in shared mode
release the buffer content lock on old bucket, but not pin
retake the buffer content lock on new bucket
arrange to scan the old bucket normally and the new bucket for
tuples which are not moved-by-split
-- then, per read request:
read/sharelock current page of bucket
step to next page if necessary (no chaining of locks)
reacquire content lock on current page
step to next page if necessary (no chaining of content locks, but keep
the pin on the primary bucket throughout the scan; we also maintain
a pin on the page currently being scanned)
get tuple
release current page
release content lock
-- at scan shutdown:
release bucket share-lock
release all pins still held

By holding the page-zero lock until lock on the target bucket is obtained,
the reader ensures that the target bucket calculation is valid (otherwise
the bucket might be split before the reader arrives at it, and the target
entries might go into the new bucket). Holding the bucket sharelock for
the remainder of the scan prevents the reader's current-tuple pointer from
being invalidated by splits or compactions. Notice that the reader's lock
does not prevent other buckets from being split or compacted.
Holding the buffer pin on the primary bucket page for the whole scan prevents
the reader's current-tuple pointer from being invalidated by splits or
compactions. (Of course, other buckets can still be split or compacted.)

To keep concurrency reasonably good, we require readers to cope with
concurrent insertions, which means that they have to be able to re-find
their current scan position after re-acquiring the page sharelock. Since
deletion is not possible while a reader holds the bucket sharelock, and
we assume that heap tuple TIDs are unique, this can be implemented by
their current scan position after re-acquiring the buffer content lock on
page. Since deletion is not possible while a reader holds the pin on bucket,
and we assume that heap tuple TIDs are unique, this can be implemented by
searching for the same heap tuple TID previously returned. Insertion does
not move index entries across pages, so the previously-returned index entry
should always be on the same page, at the same or higher offset number,
as it was before.
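The re-find step can be pictured with a small self-contained sketch (plain C; the TID struct and refind_position are invented for this illustration): because concurrent insertions can only push the previously returned entry to the same or a higher offset on the same page, scanning forward from the old offset is enough.

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint32_t blkno; uint16_t offset; } TID;   /* heap tuple id */

    /* After re-taking the page content lock, resume where we left off by
     * locating the last TID we returned, searching only at >= old_offset. */
    static int refind_position(const TID *entries, int nentries,
                               int old_offset, TID last_returned)
    {
        for (int off = old_offset; off < nentries; off++) {
            if (entries[off].blkno == last_returned.blkno &&
                entries[off].offset == last_returned.offset)
                return off;            /* continue scanning just after this */
        }
        return -1;                     /* cannot happen under the rules above */
    }

    int main(void)
    {
        /* we had returned (7,3) from offset 1; an insertion shifted it to 2 */
        TID page[] = { {5, 1}, {6, 9}, {7, 3}, {8, 2} };
        TID last = {7, 3};
        printf("re-found at offset %d\n", refind_position(page, 4, 1, last));
        return 0;
    }
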
To allow for scans during a bucket split, if at the start of the scan, the
bucket is marked as bucket-being-populated, it scans all the tuples in that
bucket except for those that are marked as moved-by-split. Once it finishes
the scan of all the tuples in the current bucket, it scans the old bucket from
which this bucket is formed by split.

The insertion algorithm is rather similar:

share-lock page 0 (to prevent active split)
read/sharelock meta page
compute bucket number for target hash key
release meta page
share-lock bucket page (to prevent split/compact of this bucket)
release page 0 share-lock
-- (so far same as reader)
read/exclusive-lock current page of bucket
if full, release, read/exclusive-lock next page; repeat as needed
lock the primary bucket page of the target bucket
-- (so far same as reader, except for acquisition of buffer content lock in
exclusive mode on primary bucket page)
if the bucket-being-split flag is set for a bucket and pin count on it is
one, then finish the split
release the buffer content lock on current bucket
get the "new" bucket which was being populated by the split
scan the new bucket and form the hash table of TIDs
conditionally get the cleanup lock on old and new buckets
if we get the lock on both the buckets
finish the split using algorithm mentioned below for split
release the pin on old bucket and restart the insert from beginning.
if current page is full, first check if this page contains any dead tuples.
if yes, remove dead tuples from the current page and again check for the
availability of the space. If enough space found, insert the tuple else
release lock but not pin, read/exclusive-lock
next page; repeat as needed
>> see below if no space in any page of bucket
take buffer content lock in exclusive mode on metapage
insert tuple at appropriate place in page
write/release current page
release bucket share-lock
read/exclusive-lock meta page
mark current page dirty
increment tuple count, decide if split needed
write/release meta page
done if no split needed, else enter Split algorithm below
mark meta page dirty
write WAL for insertion of tuple
release the buffer content lock on metapage
release buffer content lock on current page
if current page is not a bucket page, release the pin on bucket page
if split is needed, enter Split algorithm below
release the pin on metapage

To speed searches, the index entries within any individual index page are
kept sorted by hash code; the insertion code must take care to insert new

@@ -254,11 +325,13 @@ bucket that is being actively scanned, because readers can cope with this
as explained above. We only need the short-term buffer locks to ensure
that readers do not see a partially-updated page.

It is clearly impossible for readers and inserters to deadlock, and in
fact this algorithm allows them a very high degree of concurrency.
(The exclusive metapage lock taken to update the tuple count is stronger
than necessary, since readers do not care about the tuple count, but the
lock is held for such a short time that this is probably not an issue.)
To avoid deadlock between readers and inserters, whenever there is a need
to lock multiple buckets, we always take in the order suggested in Lock
Definitions above. This algorithm allows them a very high degree of
concurrency. (The exclusive metapage lock taken to update the tuple count
is stronger than necessary, since readers do not care about the tuple count,
but the lock is held for such a short time that this is probably not an
issue.)
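A minimal sketch of that ordering rule, using plain C and pthreads purely for illustration (bucket_lock and the helper names are invented; the server uses buffer content locks and cleanup locks, not pthread mutexes):

    #include <pthread.h>
    #include <stdio.h>

    #define NBUCKETS 4
    static pthread_mutex_t bucket_lock[NBUCKETS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
    };

    /* Always acquire the lower-numbered bucket first (assumes a != b),
     * which is the deadlock-avoidance rule stated above. */
    static void lock_two_buckets(int a, int b)
    {
        int lo = a < b ? a : b;
        int hi = a < b ? b : a;
        pthread_mutex_lock(&bucket_lock[lo]);
        pthread_mutex_lock(&bucket_lock[hi]);
    }

    static void unlock_two_buckets(int a, int b)
    {
        pthread_mutex_unlock(&bucket_lock[a]);
        pthread_mutex_unlock(&bucket_lock[b]);
    }

    int main(void)
    {
        lock_two_buckets(3, 1);   /* old bucket 3, new bucket 1: lock 1 first */
        puts("both buckets locked in canonical order");
        unlock_two_buckets(3, 1);
        return 0;
    }
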
When an inserter cannot find space in any existing page of a bucket, it
must obtain an overflow page and add that page to the bucket's chain.

@@ -269,82 +342,95 @@ index is overfull (has a higher-than-wanted ratio of tuples to buckets).
The algorithm attempts, but does not necessarily succeed, to split one
existing bucket in two, thereby lowering the fill ratio:

exclusive-lock page 0 (assert the right to begin a split)
read/exclusive-lock meta page
pin meta page and take buffer content lock in exclusive mode
check split still needed
if split not needed anymore, drop locks and exit
if split not needed anymore, drop buffer content lock and pin and exit
decide which bucket to split
Attempt to X-lock old bucket number (definitely could fail)
Attempt to X-lock new bucket number (shouldn't fail, but...)
if above fail, drop locks and exit
update meta page to reflect new number of buckets
write/release meta page
release X-lock on page 0
-- now, accesses to all other buckets can proceed.
Perform actual split of bucket, moving tuples as needed
>> see below about acquiring needed extra space
Release X-locks of old and new buckets
try to take a cleanup lock on that bucket; if fail, give up
if that bucket is still being split or has split-cleanup work:
try to finish the split and the cleanup work
if that succeeds, start over; if it fails, give up
mark the old and new buckets indicating split is in progress
mark both old and new buckets as dirty
write WAL for allocation of new page for split
copy the tuples that belongs to new bucket from old bucket, marking
them as moved-by-split
write WAL record for moving tuples to new page once the new page is full
or all the pages of old bucket are finished
release lock but not pin for primary bucket page of old bucket,
read/shared-lock next page; repeat as needed
clear the bucket-being-split and bucket-being-populated flags
mark the old bucket indicating split-cleanup
write WAL for changing the flags on both old and new buckets

Note the page zero and metapage locks are not held while the actual tuple
rearrangement is performed, so accesses to other buckets can proceed in
parallel; in fact, it's possible for multiple bucket splits to proceed
in parallel.

Split's attempt to X-lock the old bucket number could fail if another
process holds S-lock on it. We do not want to wait if that happens, first
because we don't want to wait while holding the metapage exclusive-lock,
and second because it could very easily result in deadlock. (The other
process might be out of the hash AM altogether, and could do something
that blocks on another lock this process holds; so even if the hash
algorithm itself is deadlock-free, a user-induced deadlock could occur.)
So, this is a conditional LockAcquire operation, and if it fails we just
abandon the attempt to split. This is all right since the index is
overfull but perfectly functional. Every subsequent inserter will try to
split, and eventually one will succeed. If multiple inserters failed to
split, the index might still be overfull, but eventually, the index will
The split operation's attempt to acquire cleanup-lock on the old bucket number
could fail if another process holds any lock or pin on it. We do not want to
wait if that happens, because we don't want to wait while holding the metapage
exclusive-lock. So, this is a conditional LWLockAcquire operation, and if
it fails we just abandon the attempt to split. This is all right since the
index is overfull but perfectly functional. Every subsequent inserter will
try to split, and eventually one will succeed. If multiple inserters failed
to split, the index might still be overfull, but eventually, the index will
not be overfull and split attempts will stop. (We could make a successful
splitter loop to see if the index is still overfull, but it seems better to
distribute the split overhead across successive insertions.)
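The conditional acquisition can be illustrated with a tiny try-lock sketch (plain C with pthreads; try_split_bucket and old_bucket_cleanup are invented names, and a pthread mutex only stands in for the cleanup lock on the old primary bucket page):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t old_bucket_cleanup = PTHREAD_MUTEX_INITIALIZER;

    /* A would-be splitter tries the lock without blocking; on failure it
     * simply abandons the split, leaving the index overfull but functional. */
    static bool try_split_bucket(void)
    {
        if (pthread_mutex_trylock(&old_bucket_cleanup) != 0)
            return false;              /* someone holds a lock or pin: give up */

        /* ... move tuples, mark flags, write WAL (elided) ... */

        pthread_mutex_unlock(&old_bucket_cleanup);
        return true;
    }

    int main(void)
    {
        printf("split %s\n", try_split_bucket() ? "performed" : "skipped");
        return 0;
    }
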
A problem is that if a split fails partway through (eg due to insufficient
disk space) the index is left corrupt. The probability of that could be
made quite low if we grab a free page or two before we update the meta
page, but the only real solution is to treat a split as a WAL-loggable,
must-complete action. I'm not planning to teach hash about WAL in this
go-round.
If a split fails partway through (e.g. due to insufficient disk space or an
interrupt), the index will not be corrupted. Instead, we'll retry the split
every time a tuple is inserted into the old bucket prior to inserting the new
tuple; eventually, we should succeed. The fact that a split is left
unfinished doesn't prevent subsequent buckets from being split, but we won't
try to split the bucket again until the prior split is finished. In other
words, a bucket can be in the middle of being split for some time, but it can't
be in the middle of two splits at the same time.

The fourth operation is garbage collection (bulk deletion):

next bucket := 0
read/sharelock meta page
pin metapage and take buffer content lock in exclusive mode
fetch current max bucket number
release meta page
release meta page buffer content lock and pin
while next bucket <= max bucket do
Acquire X lock on target bucket
Scan and remove tuples, compact free space as needed
Release X lock
acquire cleanup lock on primary bucket page
loop:
scan and remove tuples
mark the target page dirty
write WAL for deleting tuples from target page
if this is the last bucket page, break out of loop
pin and x-lock next page
release prior lock and pin (except keep pin on primary bucket page)
if the page we have locked is not the primary bucket page:
release lock and take exclusive lock on primary bucket page
if there are no other pins on the primary bucket page:
squeeze the bucket to remove free space
release the pin on primary bucket page
next bucket ++
end loop
exclusive-lock meta page
pin metapage and take buffer content lock in exclusive mode
check if number of buckets changed
if so, release lock and return to for-each-bucket loop
if so, release content lock and pin and return to for-each-bucket loop
else update metapage tuple count
write/release meta page
mark meta page dirty and write WAL for update of metapage
release buffer content lock and pin

Note that this is designed to allow concurrent splits. If a split occurs,
tuples relocated into the new bucket will be visited twice by the scan,
but that does no harm. (We must however be careful about the statistics
Note that this is designed to allow concurrent splits and scans. If a split
occurs, tuples relocated into the new bucket will be visited twice by the
scan, but that does no harm. As we release the lock on bucket page during
cleanup scan of a bucket, it will allow concurrent scan to start on a bucket
and ensures that scan will always be behind cleanup. It is a must to keep scans
behind cleanup, else vacuum could decrease the TIDs that are required to
complete the scan. Now, as the scan that returns multiple tuples from the
same bucket page always expects next valid TID to be greater than or equal to
the current TID, it might miss the tuples. This holds true for backward scans
as well (backward scans first traverse each bucket starting from first bucket
to last overflow page in the chain). We must be careful about the statistics
reported by the VACUUM operation. What we can do is count the number of
tuples scanned, and believe this in preference to the stored tuple count
if the stored tuple count and number of buckets did *not* change at any
time during the scan. This provides a way of correcting the stored tuple
count if it gets out of sync for some reason. But if a split or insertion
does occur concurrently, the scan count is untrustworthy; instead,
subtract the number of tuples deleted from the stored tuple count and
use that.)

The exclusive lock request could deadlock in some strange scenarios, but
we can just error out without any great harm being done.
tuples scanned, and believe this in preference to the stored tuple count if
the stored tuple count and number of buckets did *not* change at any time
during the scan. This provides a way of correcting the stored tuple count if
it gets out of sync for some reason. But if a split or insertion does occur
concurrently, the scan count is untrustworthy; instead, subtract the number of
tuples deleted from the stored tuple count and use that.


Free Space Management

@@ -360,25 +446,23 @@ overflow page to the free pool.

Obtaining an overflow page:

read/exclusive-lock meta page
take metapage content lock in exclusive mode
determine next bitmap page number; if none, exit loop
release meta page lock
read/exclusive-lock bitmap page
release meta page content lock
pin bitmap page and take content lock in exclusive mode
search for a free page (zero bit in bitmap)
if found:
set bit in bitmap
write/release bitmap page
read/exclusive-lock meta page
mark bitmap page dirty
take metapage buffer content lock in exclusive mode
if first-free-bit value did not change,
update it and write meta page
release meta page
return page number
update it and mark meta page dirty
else (not found):
release bitmap page
release bitmap page buffer content lock
loop back to try next bitmap page, if any
-- here when we have checked all bitmap pages; we hold meta excl. lock
extend index to add another overflow page; update meta information
write/release meta page
mark meta page dirty
return page number
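The "search for a free page (zero bit in bitmap)" step amounts to scanning the bitmap words for the first clear bit. A self-contained sketch (plain C; find_free_bit and the 32-bit word layout are illustrative, not the server's bitmap page format):

    #include <stdint.h>
    #include <stdio.h>

    #define BITS_PER_WORD 32

    /* Return the bit number of the first clear bit, or -1 if every bit is
     * set; a clear bit means the corresponding overflow page is free. */
    static int find_free_bit(const uint32_t *words, int nwords)
    {
        for (int w = 0; w < nwords; w++) {
            if (words[w] == UINT32_MAX)
                continue;                          /* every page here in use */
            for (int b = 0; b < BITS_PER_WORD; b++) {
                if ((words[w] & ((uint32_t)1 << b)) == 0)
                    return w * BITS_PER_WORD + b;
            }
        }
        return -1;
    }

    int main(void)
    {
        uint32_t bitmap[2] = { UINT32_MAX, 0xFFFFFFF7 };   /* bit 35 is clear */
        printf("first free bit: %d\n", find_free_bit(bitmap, 2));
        return 0;
    }

On success the caller sets that bit and then reconsiders the metapage's first-free-bit hint, as in the steps above.
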
It is slightly annoying to release and reacquire the metapage lock

@@ -398,12 +482,17 @@ like this:

-- having determined that no space is free in the target bucket:
remember last page of bucket, drop write lock on it
call free-page-acquire routine
re-write-lock last page of bucket
if it is not last anymore, step to the last page
update (former) last page to point to new page
execute free-page-acquire (obtaining an overflow page) mechanism
described above
update (former) last page to point to the new page and mark buffer dirty
write-lock and initialize new page, with back link to former last page
write and release former last page
write WAL for addition of overflow page
release the locks on meta page and bitmap page acquired in
free-page-acquire algorithm
release the lock on former last page
release the lock on new overflow page
insert tuple into new page
-- etc.

@@ -418,27 +507,27 @@ free page; there can be no other process holding lock on it.

Bucket splitting uses a similar algorithm if it has to extend the new
bucket, but it need not worry about concurrent extension since it has
exclusive lock on the new bucket.
buffer content lock in exclusive mode on the new bucket.

Freeing an overflow page is done by garbage collection and by bucket
splitting (the old bucket may contain no-longer-needed overflow pages).
In both cases, the process holds exclusive lock on the containing bucket,
so need not worry about other accessors of pages in the bucket. The
algorithm is:
Freeing an overflow page requires the process to hold buffer content lock in
exclusive mode on the containing bucket, so need not worry about other
accessors of pages in the bucket. The algorithm is:

delink overflow page from bucket chain
(this requires read/update/write/release of fore and aft siblings)
read/share-lock meta page
pin meta page and take buffer content lock in shared mode
determine which bitmap page contains the free space bit for page
release meta page
read/exclusive-lock bitmap page
release meta page buffer content lock
pin bitmap page and take buffer content lock in exclusive mode
retake meta page buffer content lock in exclusive mode
move (insert) tuples that belong to the overflow page being freed
update bitmap bit
write/release bitmap page
if page number is less than what we saw as first-free-bit in meta:
read/exclusive-lock meta page
mark bitmap page dirty
if page number is still less than first-free-bit,
update first-free-bit field and write meta page
release meta page
update first-free-bit field and mark meta page dirty
write WAL for delinking overflow page operation
release buffer content lock and pin
release meta page buffer content lock and pin

We have to do it this way because we must clear the bitmap bit before
changing the first-free-bit field (hashm_firstfree). It is possible that

@@ -448,21 +537,96 @@ page acquirer will scan more bitmap bits than he needs to. What must be
avoided is having first-free-bit greater than the actual first free bit,
because then that free page would never be found by searchers.

All the freespace operations should be called while holding no buffer
locks. Since they need no lmgr locks, deadlock is not possible.
The reason for moving tuples from the overflow page while delinking the latter
is to make that an atomic operation. Not doing so could lead to spurious reads
on standby. Basically, the user might see the same tuple twice.


WAL Considerations
------------------

The hash index operations like create index, insert, delete, bucket split,
allocate overflow page, and squeeze in themselves don't guarantee hash index
consistency after a crash. To provide robustness, we write WAL for each of
these operations.

CREATE INDEX writes multiple WAL records. First, we write a record to cover
the initialization of the metapage, followed by one for each new bucket
created, followed by one for the initial bitmap page. It's not important for
index creation to appear atomic, because the index isn't yet visible to any
other transaction, and the creating transaction will roll back in the event of
a crash. It would be difficult to cover the whole operation with a single
write-ahead log record anyway, because we can log only a fixed number of
pages, as given by XLR_MAX_BLOCK_ID (32), with current XLog machinery.

Ordinary item insertions (that don't force a page split or need a new overflow
page) are single WAL entries. They touch a single bucket page and the
metapage. The metapage is updated during replay as it is updated during
original operation.

If an insertion causes the addition of an overflow page, there will be one
WAL entry for the new overflow page and second entry for insert itself.

If an insertion causes a bucket split, there will be one WAL entry for insert
itself, followed by a WAL entry for allocating a new bucket, followed by a WAL
entry for each overflow bucket page in the new bucket to which the tuples are
moved from old bucket, followed by a WAL entry to indicate that split is
complete for both old and new buckets. A split operation which requires
overflow pages to complete the operation will need to write a WAL record for
each new allocation of an overflow page.

As splitting involves multiple atomic actions, it's possible that the system
crashes between moving tuples from bucket pages of the old bucket to new
bucket. In such a case, after recovery, the old and new buckets will be
marked with bucket-being-split and bucket-being-populated flags respectively
which indicates that split is in progress for those buckets. The reader
algorithm works correctly, as it will scan both the old and new buckets when
the split is in progress as explained in the reader algorithm section above.

We finish the split at next insert or split operation on the old bucket as
explained in insert and split algorithm above. It could be done during
searches, too, but it seems best not to put any extra updates in what would
otherwise be a read-only operation (updating is not possible in hot standby
mode anyway). It would seem natural to complete the split in VACUUM, but since
splitting a bucket might require allocating a new page, it might fail if you
run out of disk space. That would be bad during VACUUM - the reason for
running VACUUM in the first place might be that you run out of disk space,
and now VACUUM won't finish because you're out of disk space. In contrast,
an insertion can require enlarging the physical file anyway.

Deletion of tuples from a bucket is performed for two reasons: to remove dead
tuples, and to remove tuples that were moved by a bucket split. A WAL entry
is made for each bucket page from which tuples are removed, and then another
WAL entry is made when we clear the needs-split-cleanup flag. If dead tuples
are removed, a separate WAL entry is made to update the metapage.

As deletion involves multiple atomic operations, it is quite possible that
system crashes after (a) removing tuples from some of the bucket pages, (b)
before clearing the garbage flag, or (c) before updating the metapage. If the
system crashes before completing (b), it will again try to clean the bucket
during next vacuum or insert after recovery which can have some performance
impact, but it will work fine. If the system crashes before completing (c),
after recovery there could be some additional splits until the next vacuum
updates the metapage, but the other operations like insert, delete and scan
will work correctly. We can fix this problem by actually updating the
metapage based on delete operation during replay, but it's not clear whether
it's worth the complication.

A squeeze operation moves tuples from one of the buckets later in the chain to
one of the bucket earlier in chain and writes WAL record when either the
bucket to which it is writing tuples is filled or bucket from which it
is removing the tuples becomes empty.

As a squeeze operation involves writing multiple atomic operations, it is
quite possible that the system crashes before completing the operation on
entire bucket. After recovery, the operations will work correctly, but
the index will remain bloated and this can impact performance of read and
insert operations until the next vacuum squeezes the bucket completely.


Other Notes
-----------

All the shenanigans with locking prevent a split occurring while *another*
process is stopped in a given bucket. They do not ensure that one of
our *own* backend's scans is not stopped in the bucket, because lmgr
doesn't consider a process's own locks to conflict. So the Split
algorithm must check for that case separately before deciding it can go
ahead with the split. VACUUM does not have this problem since nothing
else can be happening within the vacuuming backend.

Should we instead try to fix the state of any conflicting local scan?
Seems mighty ugly --- got to move the held bucket S-lock as well as lots
of other messiness. For now, just punt and don't split.
Cleanup locks prevent a split from occurring while *another* process is stopped
in a given bucket. It also ensures that one of our *own* backend's scans is not
stopped in the bucket.

@@ -3,8 +3,8 @@
* hash.cpp
* Implementation of Margo Seltzer's Hashing package for postgres.
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*

@@ -20,6 +20,8 @@
#include "knl/knl_variable.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/xloginsert.h"
#include "access/tableam.h"
#include "access/relscan.h"
#include "catalog/index.h"

@@ -34,6 +36,7 @@
typedef struct {
HSpool *spool; /* NULL if not using spooling */
double indtuples; /* # tuples accepted into index */
Relation heapRel; /* heap relation descriptor */
} HashBuildState;

static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, const bool *isnull, bool tupleIsAlive,

@@ -52,6 +55,7 @@ Datum hashbuild(PG_FUNCTION_ARGS)
double reltuples;
double allvisfrac;
uint32 num_buckets;
long sort_threshold;
HashBuildState buildstate;

/*

@@ -66,7 +70,7 @@ Datum hashbuild(PG_FUNCTION_ARGS)
estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac, NULL);

/* Initialize the hash index metadata page and initial buckets */
num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);
num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
/*
* If we just insert the tuples into the index in scan order, then
* (assuming their hash codes are pretty random) there will be no locality

@@ -74,25 +78,38 @@ Datum hashbuild(PG_FUNCTION_ARGS)
* then we'll thrash horribly. To prevent that scenario, we can sort the
* tuples by (expected) bucket number. However, such a sort is useless
* overhead when the index does fit in RAM. We choose to sort if the
* initial index size exceeds NBuffers.
* initial index size exceeds maintenance_work_mem, or the number of
* buffers usable for the index, whichever is less. (Limiting by the
* number of buffers should reduce thrashing between PG buffers and kernel
* buffers, which seems useful even if no physical I/O results. Limiting
* by maintenance_work_mem is useful to allow easy testing of the sort
* code path, and may be useful to DBAs as an additional control knob.)
*
* NOTE: this test will need adjustment if a bucket is ever different from
* one page.
* one page. Also, "initial index size" accounting does not include the
* metapage, nor the first bitmap page.
*/
if (num_buckets >= (uint32)g_instance.attr.attr_storage.NBuffers)
buildstate.spool = _h_spoolinit(index, num_buckets, &indexInfo->ii_desc);
sort_threshold = (u_sess->attr.attr_memory.maintenance_work_mem * 1024L) / BLCKSZ;
if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
sort_threshold = Min(sort_threshold, g_instance.attr.attr_storage.NBuffers);
else
sort_threshold = Min(sort_threshold, u_sess->storage_cxt.NLocBuffer);

if (num_buckets >= (uint32)sort_threshold)
buildstate.spool = _h_spoolinit(heap, index, num_buckets, &indexInfo->ii_desc);
else
buildstate.spool = NULL;

/* prepare to build the index */
buildstate.indtuples = 0;
buildstate.heapRel = heap;

/* do the heap scan */
reltuples = tableam_index_build_scan(heap, index, indexInfo, true, hashbuildCallback, (void*)&buildstate);

if (buildstate.spool != NULL) {
/* sort the tuples and insert them into the index */
_h_indexbuild(buildstate.spool);
_h_indexbuild(buildstate.spool, buildstate.heapRel);
_h_spooldestroy(buildstate.spool);
}
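With example numbers, the new threshold works out as follows; the sketch below (plain C, nothing read from a real server, and all constants are illustrative) mirrors the decision made above: a 64MB maintenance_work_mem yields 8192 pages at an 8KB BLCKSZ, so a build that needs 10000 buckets would take the sorted (spooled) path.

    #include <stdio.h>
    #include <stdint.h>

    #define BLCKSZ 8192

    /* Lesser of maintenance_work_mem (expressed in pages) and the number of
     * buffers usable for the index (shared or local, depending on persistence). */
    static long sort_threshold(long maintenance_work_mem_kb, long shared_buffers,
                               long local_buffers, int is_temp_index)
    {
        long threshold = (maintenance_work_mem_kb * 1024L) / BLCKSZ;
        long cap = is_temp_index ? local_buffers : shared_buffers;
        return threshold < cap ? threshold : cap;
    }

    int main(void)
    {
        /* maintenance_work_mem = 64MB -> 8192 pages; 16384 shared buffers */
        long t = sort_threshold(64 * 1024L, 16384, 1024, 0);
        uint32_t num_buckets = 10000;
        printf("threshold = %ld pages; sorted build: %s\n",
               t, num_buckets >= (uint32_t)t ? "yes" : "no");
        return 0;
    }
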
@@ -114,7 +131,7 @@ Datum hashbuildempty(PG_FUNCTION_ARGS)
{
Relation index = (Relation)PG_GETARG_POINTER(0);

_hash_metapinit(index, 0, INIT_FORKNUM);
_hash_init(index, 0, INIT_FORKNUM);

PG_RETURN_VOID();
}

@@ -126,21 +143,24 @@ static void hashbuildCallback(Relation index, HeapTuple htup, Datum *values, con
void *state)
{
HashBuildState *buildstate = (HashBuildState *)state;
Datum index_values[1];
bool index_isnull[1];
IndexTuple itup;

/* Hash indexes don't index nulls, see notes in hashinsert */
if (isnull[0]) {
/* convert data to a hash key; on failure, do not insert anything */
if (!_hash_convert_tuple(index,
values, isnull,
index_values, index_isnull))
return;
}

/* Either spool the tuple for sorting, or just put it into the index */
if (buildstate->spool != NULL) {
_h_spool(buildstate->spool, &htup->t_self, values, isnull);
_h_spool(buildstate->spool, &htup->t_self, index_values, index_isnull);
} else {
/* form an index tuple and point it at the heap tuple */
itup = _hash_form_tuple(index, values, isnull);
itup = index_form_tuple(RelationGetDescr(index), index_values, index_isnull);
itup->t_tid = htup->t_self;
_hash_doinsert(index, itup);
_hash_doinsert(index, itup, buildstate->heapRel);
pfree(itup);
}
@ -159,30 +179,22 @@ Datum hashinsert(PG_FUNCTION_ARGS)
|
|||
Datum *values = (Datum *)PG_GETARG_POINTER(1);
|
||||
bool *isnull = (bool *)PG_GETARG_POINTER(2);
|
||||
ItemPointer ht_ctid = (ItemPointer)PG_GETARG_POINTER(3);
|
||||
|
||||
#ifdef NOT_USED
|
||||
Relation heapRel = (Relation)PG_GETARG_POINTER(4);
|
||||
IndexUniqueCheck checkUnique = (IndexUniqueCheck)PG_GETARG_INT32(5);
|
||||
#endif
|
||||
Datum index_values[1];
|
||||
bool index_isnull[1];
|
||||
IndexTuple itup;
|
||||
|
||||
/*
|
||||
* If the single index key is null, we don't insert it into the index.
|
||||
* Hash tables support scans on '='. Relational algebra says that A = B
|
||||
* returns null if either A or B is null. This means that no
|
||||
* qualification used in an index scan could ever return true on a null
|
||||
* attribute. It also means that indices can't be used by ISNULL or
|
||||
* NOTNULL scans, but that's an artifact of the strategy map architecture
|
||||
* chosen in 1986, not of the way nulls are handled here.
|
||||
*/
|
||||
if (isnull[0])
|
||||
PG_RETURN_BOOL(false);
|
||||
/* convert data to a hash key; on failure, do not insert anything */
|
||||
if (!_hash_convert_tuple(rel,
|
||||
values, isnull,
|
||||
index_values, index_isnull))
|
||||
PG_RETURN_BOOL(false);
|
||||
|
||||
/* generate an index tuple */
|
||||
itup = _hash_form_tuple(rel, values, isnull);
|
||||
/* form an index tuple and point it at the heap tuple */
|
||||
itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
|
||||
itup->t_tid = *ht_ctid;
|
||||
|
||||
_hash_doinsert(rel, itup);
|
||||
_hash_doinsert(rel, itup, heapRel);
|
||||
|
||||
pfree(itup);
|
||||
|
||||
|
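The null handling shown here follows from the rule spelled out in the comment: a hash index can only answer '=' quals, and "key = NULL" is never true, so a NULL key produces no index entry at all. A minimal standalone sketch of that decision, where convert_to_hash_key and hash_any_sketch are stand-ins for the real _hash_convert_tuple path:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: a hash index stores nothing for a NULL key, because
 * "key = NULL" can never evaluate to true.  hash_any_sketch stands in for
 * the real hash function.
 */
static uint32_t hash_any_sketch(uint32_t datum) { return datum * 2654435761u; }

static bool convert_to_hash_key(uint32_t value, bool isnull, uint32_t *hashkey)
{
    if (isnull)
        return false;        /* caller skips the insert entirely */
    *hashkey = hash_any_sketch(value);
    return true;
}

int main(void)
{
    uint32_t key;
    printf("non-null: %d\n", convert_to_hash_key(42, false, &key)); /* 1 */
    printf("null:     %d\n", convert_to_hash_key(0, true, &key));   /* 0 */
    return 0;
}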
@ -212,7 +224,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
|
|||
* Reacquire the read lock here.
|
||||
*/
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
|
||||
|
||||
/*
|
||||
* If we've already initialized this scan, we can just advance it in the
|
||||
|
@ -224,16 +236,21 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
|
|||
/*
|
||||
* An insertion into the current index page could have happened while
|
||||
* we didn't have read lock on it. Re-find our position by looking
|
||||
* for the TID we previously returned. (Because we hold share lock on
|
||||
* the bucket, no deletions or splits could have occurred; therefore
|
||||
* we can expect that the TID still exists in the current index page,
|
||||
* at an offset >= where we were.)
|
||||
* for the TID we previously returned. (Because we hold a pin on the
|
||||
* primary bucket page, no deletions or splits could have occurred;
|
||||
* therefore we can expect that the TID still exists in the current
|
||||
* index page, at an offset >= where we were.)
|
||||
*/
|
||||
OffsetNumber maxoffnum;
|
||||
|
||||
buf = so->hashso_curbuf;
|
||||
Assert(BufferIsValid(buf));
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
/*
|
||||
* We don't need test for old snapshot here as the current buffer is
|
||||
* pinned, so vacuum can't clean the page.
|
||||
*/
|
||||
maxoffnum = PageGetMaxOffsetNumber(page);
|
||||
for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) {
|
||||
IndexTuple itup;
|
||||
|
@ -253,14 +270,22 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
|
|||
*/
|
||||
if (scan->kill_prior_tuple) {
|
||||
/*
|
||||
* Yes, so mark it by setting the LP_DEAD state in the item flags.
|
||||
* Yes, so remember it for later. (We'll deal with all such tuples
|
||||
* at once right after leaving the index page or at end of scan.)
|
||||
* If the caller reverses the indexscan direction, it is quite
|
||||
* possible that the same item might get entered multiple times.
|
||||
* But we don't detect that; instead, we just forget any excess
|
||||
* entries.
|
||||
*/
|
||||
ItemIdMarkDead(PageGetItemId(page, offnum));
|
||||
if (so->killedItems == NULL)
|
||||
so->killedItems = (HashScanPosItem *)palloc(MaxIndexTuplesPerPage * sizeof(HashScanPosItem));
|
||||
|
||||
/*
|
||||
* Since this can be redone later if needed, mark as a hint.
|
||||
*/
|
||||
MarkBufferDirtyHint(buf, true);
|
||||
if (so->numKilled < MaxIndexTuplesPerPage) {
|
||||
so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
|
||||
so->killedItems[so->numKilled].indexOffset =
|
||||
ItemPointerGetOffsetNumber(&(so->hashso_curpos));
|
||||
so->numKilled++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
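Instead of marking LP_DEAD bits immediately, the new code above buffers the dead tuples it sees and processes them later in _hash_kill_items. A toy version of that bookkeeping, with the struct names and page capacity chosen for illustration only:

#include <stdio.h>

/*
 * Sketch of the deferred "killed items" bookkeeping; the capacity constant
 * and item type are stand-ins, not the patch's definitions.
 */
#define MAX_ITEMS_PER_PAGE 407

typedef struct { unsigned block; unsigned offset; } KilledItem;

typedef struct {
    KilledItem items[MAX_ITEMS_PER_PAGE];
    int numKilled;
} ScanKillState;

/* Remember a dead tuple; excess entries are silently dropped, as in the comment. */
static void remember_killed(ScanKillState *st, unsigned block, unsigned offset)
{
    if (st->numKilled < MAX_ITEMS_PER_PAGE) {
        st->items[st->numKilled].block = block;
        st->items[st->numKilled].offset = offset;
        st->numKilled++;
    }
}

int main(void)
{
    ScanKillState st = { .numKilled = 0 };
    remember_killed(&st, 3, 17);
    printf("killed entries buffered: %d\n", st.numKilled);
    return 0;
}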
@ -285,7 +310,7 @@ Datum hashgettuple(PG_FUNCTION_ARGS)
|
|||
|
||||
/* Release read lock on current buffer, but keep it pinned */
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
/* Return current heap TID on success */
|
||||
scan->xs_ctup.t_self = so->hashso_heappos;
|
||||
|
@ -353,17 +378,20 @@ Datum hashbeginscan(PG_FUNCTION_ARGS)
|
|||
scan = RelationGetIndexScan(rel, nkeys, norderbys);
|
||||
|
||||
so = (HashScanOpaque)palloc(sizeof(HashScanOpaqueData));
|
||||
so->hashso_bucket_valid = false;
|
||||
so->hashso_bucket_blkno = 0;
|
||||
so->hashso_curbuf = InvalidBuffer;
|
||||
so->hashso_bucket_buf = InvalidBuffer;
|
||||
so->hashso_split_bucket_buf = InvalidBuffer;
|
||||
/* set position invalid (this will cause _hash_first call) */
|
||||
ItemPointerSetInvalid(&(so->hashso_curpos));
|
||||
ItemPointerSetInvalid(&(so->hashso_heappos));
|
||||
|
||||
scan->opaque = so;
|
||||
so->hashso_buc_populated = false;
|
||||
so->hashso_buc_split = false;
|
||||
|
||||
/* register scan in case we change pages it's using */
|
||||
_hash_regscan(scan);
|
||||
so->killedItems = NULL;
|
||||
so->numKilled = 0;
|
||||
|
||||
scan->opaque = so;
|
||||
|
||||
PG_RETURN_POINTER(scan);
|
||||
}
|
||||
|
@ -381,14 +409,13 @@ Datum hashrescan(PG_FUNCTION_ARGS)
|
|||
Relation rel = scan->indexRelation;
|
||||
|
||||
/* release any pin we still hold */
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_dropbuf(rel, so->hashso_curbuf);
|
||||
so->hashso_curbuf = InvalidBuffer;
|
||||
if (so->numKilled > 0) {
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
|
||||
_hash_kill_items(scan);
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
|
||||
}
|
||||
|
||||
/* release lock on bucket, too */
|
||||
if (so->hashso_bucket_blkno)
|
||||
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
|
||||
so->hashso_bucket_blkno = 0;
|
||||
_hash_dropscanbuf(rel, so);
|
||||
|
||||
/* set position invalid (this will cause _hash_first call) */
|
||||
ItemPointerSetInvalid(&(so->hashso_curpos));
|
||||
|
@ -400,10 +427,11 @@ Datum hashrescan(PG_FUNCTION_ARGS)
|
|||
rc = memmove_s(scan->keyData, (unsigned)scan->numberOfKeys * sizeof(ScanKeyData), scankey,
|
||||
(unsigned)scan->numberOfKeys * sizeof(ScanKeyData));
|
||||
securec_check(rc, "", "");
|
||||
|
||||
so->hashso_bucket_valid = false;
|
||||
}
|
||||
|
||||
so->hashso_buc_populated = false;
|
||||
so->hashso_buc_split = false;
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
|
@ -416,18 +444,20 @@ Datum hashendscan(PG_FUNCTION_ARGS)
|
|||
HashScanOpaque so = (HashScanOpaque)scan->opaque;
|
||||
Relation rel = scan->indexRelation;
|
||||
|
||||
/* don't need scan registered anymore */
|
||||
_hash_dropscan(scan);
|
||||
/*
|
||||
* Before leaving current page, deal with any killed items. Also, ensure
|
||||
* that we acquire lock on current page before calling _hash_kill_items.
|
||||
*/
|
||||
if (so->numKilled > 0) {
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
|
||||
_hash_kill_items(scan);
|
||||
LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
|
||||
}
|
||||
|
||||
/* release any pin we still hold */
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_dropbuf(rel, so->hashso_curbuf);
|
||||
so->hashso_curbuf = InvalidBuffer;
|
||||
_hash_dropscanbuf(rel, so);
|
||||
|
||||
/* release lock on bucket, too */
|
||||
if (so->hashso_bucket_blkno)
|
||||
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
|
||||
so->hashso_bucket_blkno = 0;
|
||||
if (so->killedItems != NULL)
|
||||
pfree(so->killedItems);
|
||||
|
||||
pfree(so);
|
||||
scan->opaque = NULL;
|
||||
|
@ -458,6 +488,9 @@ Datum hashrestrpos(PG_FUNCTION_ARGS)
|
|||
* The set of target tuples is specified via a callback routine that tells
|
||||
* whether any given heap tuple (identified by ItemPointer) is being deleted.
|
||||
*
|
||||
* This function also deletes the tuples that are moved by split to other
|
||||
* bucket.
|
||||
*
|
||||
* Result: a palloc'd struct containing statistical info for VACUUM displays.
|
||||
*/
|
||||
Datum hashbulkdelete(PG_FUNCTION_ARGS)
|
||||
|
@ -473,29 +506,24 @@ Datum hashbulkdelete(PG_FUNCTION_ARGS)
|
|||
Bucket orig_maxbucket;
|
||||
Bucket cur_maxbucket;
|
||||
Bucket cur_bucket;
|
||||
Buffer metabuf;
|
||||
Buffer metabuf = InvalidBuffer;
|
||||
HashMetaPage metap;
|
||||
HashMetaPageData local_metapage;
|
||||
errno_t rc;
|
||||
HashMetaPage cachedmetap;
|
||||
|
||||
tuples_removed = 0;
|
||||
num_index_tuples = 0;
|
||||
|
||||
/*
|
||||
* Read the metapage to fetch original bucket and tuple counts. Also, we
|
||||
* keep a copy of the last-seen metapage so that we can use its
|
||||
* hashm_spares[] values to compute bucket page addresses. This is a bit
|
||||
* hokey but perfectly safe, since the interesting entries in the spares
|
||||
* array cannot change under us; and it beats rereading the metapage for
|
||||
* each bucket.
|
||||
* We need a copy of the metapage so that we can use its hashm_spares[]
|
||||
* values to compute bucket page addresses, but a cached copy should be
|
||||
* good enough. (If not, we'll detect that further down and refresh the
|
||||
* cache as necessary.)
|
||||
*/
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
|
||||
metap = HashPageGetMeta(BufferGetPage(metabuf));
|
||||
orig_maxbucket = metap->hashm_maxbucket;
|
||||
orig_ntuples = metap->hashm_ntuples;
|
||||
rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage));
|
||||
securec_check(rc, "", "");
|
||||
_hash_relbuf(rel, metabuf);
|
||||
cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
|
||||
Assert(cachedmetap != NULL);
|
||||
|
||||
orig_maxbucket = cachedmetap->hashm_maxbucket;
|
||||
orig_ntuples = cachedmetap->hashm_ntuples;
|
||||
|
||||
/* Scan the buckets that we know exist */
|
||||
cur_bucket = 0;
|
||||
|
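The bulk-delete loop that follows works from a cached view of how many buckets exist, then re-checks after the pass and goes around again if a split added buckets in the meantime. Stripped of the buffer handling, the control flow looks roughly like this (fetch_current_maxbucket stands in for re-reading the metapage):

#include <stdio.h>

/*
 * Shape of the bulk-delete control flow, reduced to plain C: clean every
 * bucket known at the start, then re-read the (possibly grown) bucket count
 * and go around again if a split added buckets while we were working.
 */
static unsigned fetch_current_maxbucket(void)
{
    return 7;   /* pretend splits have raised maxbucket to 7 by now */
}

int main(void)
{
    unsigned cur_bucket = 0;
    unsigned cur_maxbucket = 3;     /* value cached when the scan started */

    for (;;) {
        while (cur_bucket <= cur_maxbucket) {
            printf("cleaning bucket %u\n", cur_bucket);
            cur_bucket++;
        }
        unsigned latest = fetch_current_maxbucket();
        if (latest == cur_maxbucket)
            break;                  /* no split since we last looked: done */
        cur_maxbucket = latest;     /* process the additional bucket(s) */
    }
    return 0;
}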
@ -505,90 +533,85 @@ loop_top:
|
|||
while (cur_bucket <= cur_maxbucket) {
|
||||
BlockNumber bucket_blkno;
|
||||
BlockNumber blkno;
|
||||
bool bucket_dirty = false;
|
||||
Buffer bucket_buf;
|
||||
Buffer buf;
|
||||
HashPageOpaque bucket_opaque;
|
||||
Page page;
|
||||
bool split_cleanup = false;
|
||||
|
||||
/* Get address of bucket's start page */
|
||||
bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
|
||||
bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
|
||||
|
||||
/* Exclusive-lock the bucket so we can shrink it */
|
||||
_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);
|
||||
|
||||
/* Shouldn't have any active scans locally, either */
|
||||
if (_hash_has_active_scan(rel, cur_bucket))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SQL_ROUTINE_EXCEPTION), (errmsg("hash index has active scan during VACUUM."))));
|
||||
|
||||
/* Scan each page in bucket */
|
||||
blkno = bucket_blkno;
|
||||
while (BlockNumberIsValid(blkno)) {
|
||||
Buffer buf;
|
||||
Page page;
|
||||
HashPageOpaque opaque;
|
||||
OffsetNumber offno;
|
||||
OffsetNumber maxoffno;
|
||||
OffsetNumber deletable[MaxOffsetNumber];
|
||||
int ndeletable = 0;
|
||||
|
||||
vacuum_delay_point();
|
||||
|
||||
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy);
|
||||
page = BufferGetPage(buf);
|
||||
opaque = (HashPageOpaque)PageGetSpecialPointer(page);
|
||||
Assert(opaque->hasho_bucket == cur_bucket);
|
||||
|
||||
/* Scan each tuple in page */
|
||||
maxoffno = PageGetMaxOffsetNumber(page);
|
||||
for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
|
||||
IndexTuple itup;
|
||||
ItemPointer htup;
|
||||
|
||||
itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offno));
|
||||
htup = &(itup->t_tid);
|
||||
if (callback(htup, callback_state, InvalidOid)) {
|
||||
/* mark the item for deletion */
|
||||
deletable[ndeletable++] = offno;
|
||||
tuples_removed += 1;
|
||||
} else
|
||||
num_index_tuples += 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Apply deletions and write page if needed, advance to next page.
|
||||
* We need to acquire a cleanup lock on the primary bucket page to wait
|
||||
* out concurrent scans before deleting the dead tuples.
|
||||
*/
|
||||
blkno = opaque->hasho_nextblkno;
|
||||
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
|
||||
LockBufferForCleanup(buf);
|
||||
_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
|
||||
|
||||
if (ndeletable > 0) {
|
||||
PageIndexMultiDelete(page, deletable, ndeletable);
|
||||
_hash_wrtbuf(rel, buf);
|
||||
bucket_dirty = true;
|
||||
} else
|
||||
_hash_relbuf(rel, buf);
|
||||
page = BufferGetPage(buf);
|
||||
bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/*
|
||||
* If the bucket contains tuples that are moved by split, then we need
|
||||
* to delete such tuples. We can't delete such tuples if the split
|
||||
* operation on bucket is not finished as those are needed by scans.
|
||||
*/
|
||||
if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) {
|
||||
split_cleanup = true;
|
||||
|
||||
/*
|
||||
* This bucket might have been split since we last held a lock on
|
||||
* the metapage. If so, hashm_maxbucket, hashm_highmask and
|
||||
* hashm_lowmask might be old enough to cause us to fail to remove
|
||||
* tuples left behind by the most recent split. To prevent that,
|
||||
* now that the primary page of the target bucket has been locked
|
||||
* (and thus can't be further split), check whether we need to
|
||||
* update our cached metapage data.
|
||||
*/
|
||||
Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
|
||||
if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) {
|
||||
cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
|
||||
Assert(cachedmetap != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
/* If we deleted anything, try to compact free space */
|
||||
if (bucket_dirty)
|
||||
_hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy);
|
||||
bucket_buf = buf;
|
||||
|
||||
/* Release bucket lock */
|
||||
_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);
|
||||
hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
|
||||
cachedmetap->hashm_maxbucket,
|
||||
cachedmetap->hashm_highmask,
|
||||
cachedmetap->hashm_lowmask, &tuples_removed,
|
||||
&num_index_tuples, split_cleanup,
|
||||
callback, callback_state);
|
||||
|
||||
_hash_dropbuf(rel, bucket_buf);
|
||||
|
||||
/* Advance to next bucket */
|
||||
cur_bucket++;
|
||||
}
|
||||
|
||||
if (BufferIsInvalid(metabuf))
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
|
||||
|
||||
/* Write-lock metapage and check for split since we started */
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
|
||||
LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
|
||||
metap = HashPageGetMeta(BufferGetPage(metabuf));
|
||||
|
||||
if (cur_maxbucket != metap->hashm_maxbucket) {
|
||||
/* There's been a split, so process the additional bucket(s) */
|
||||
cur_maxbucket = metap->hashm_maxbucket;
|
||||
rc = memcpy_s(&local_metapage, sizeof(local_metapage), metap, sizeof(local_metapage));
|
||||
securec_check(rc, "", "");
|
||||
_hash_relbuf(rel, metabuf);
|
||||
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
|
||||
cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
|
||||
Assert(cachedmetap != NULL);
|
||||
cur_maxbucket = cachedmetap->hashm_maxbucket;
|
||||
goto loop_top;
|
||||
}
|
||||
|
||||
/* Okay, we're really done. Update tuple count in metapage. */
|
||||
START_CRIT_SECTION();
|
||||
if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) {
|
||||
/*
|
||||
* No one has split or inserted anything since start of scan, so
|
||||
|
@ -609,7 +632,27 @@ loop_top:
|
|||
num_index_tuples = metap->hashm_ntuples;
|
||||
}
|
||||
|
||||
_hash_wrtbuf(rel, metabuf);
|
||||
MarkBufferDirty(metabuf);
|
||||
|
||||
/* XLOG stuff */
|
||||
if (RelationNeedsWAL(rel)) {
|
||||
xl_hash_update_meta_page xlrec;
|
||||
XLogRecPtr recptr;
|
||||
|
||||
xlrec.ntuples = metap->hashm_ntuples;
|
||||
|
||||
XLogBeginInsert();
|
||||
XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
|
||||
|
||||
XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
|
||||
|
||||
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
|
||||
PageSetLSN(BufferGetPage(metabuf), recptr);
|
||||
}
|
||||
|
||||
END_CRIT_SECTION();
|
||||
|
||||
_hash_relbuf(rel, metabuf);
|
||||
|
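The metapage update above follows the usual write-ahead-log ordering: change the page and mark it dirty inside a critical section, emit the WAL record, then stamp the page with the record's LSN so the buffer can never reach disk ahead of its log entry. A stripped-down sketch of that ordering (all helpers here are stand-ins, not openGauss APIs):

#include <stdio.h>
#include <stdint.h>

/*
 * The write-ahead-log pattern used above, reduced to its ordering rules.
 * xlog_insert and the "page" fields are illustrative stand-ins.
 */
static uint64_t next_lsn = 1000;

static uint64_t xlog_insert(const char *desc)
{
    printf("WAL: %s\n", desc);
    return next_lsn++;
}

static void update_meta_tuple_count(uint64_t *page_lsn, long *ntuples, long new_count)
{
    /* --- critical section starts: no ERROR allowed until WAL is written --- */
    *ntuples = new_count;                       /* modify the page */
    uint64_t lsn = xlog_insert("hash update meta page");
    *page_lsn = lsn;                            /* PageSetLSN equivalent */
    /* --- critical section ends --- */
}

int main(void)
{
    uint64_t page_lsn = 0;
    long ntuples = 0;
    update_meta_tuple_count(&page_lsn, &ntuples, 12345);
    printf("page LSN=%llu ntuples=%ld\n", (unsigned long long) page_lsn, ntuples);
    return 0;
}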
||||
/* return statistics */
|
||||
if (stats == NULL)
|
||||
|
@ -645,9 +688,244 @@ Datum hashvacuumcleanup(PG_FUNCTION_ARGS)
|
|||
PG_RETURN_POINTER(stats);
|
||||
}
|
||||
|
||||
void hash_redo(XLogReaderState *record)
|
||||
/*
|
||||
* Helper function to perform deletion of index entries from a bucket.
|
||||
*
|
||||
* This function expects that the caller has acquired a cleanup lock on the
|
||||
* primary bucket page, and will return with a write lock again held on the
|
||||
* primary bucket page. The lock won't necessarily be held continuously,
|
||||
* though, because we'll release it when visiting overflow pages.
|
||||
*
|
||||
* It would be very bad if this function cleaned a page while some other
|
||||
* backend was in the midst of scanning it, because hashgettuple assumes
|
||||
* that the next valid TID will be greater than or equal to the current
|
||||
* valid TID. There can't be any concurrent scans in progress when we first
|
||||
* enter this function because of the cleanup lock we hold on the primary
|
||||
* bucket page, but as soon as we release that lock, there might be. We
|
||||
* handle that by conspiring to prevent those scans from passing our cleanup
|
||||
* scan. To do that, we lock the next page in the bucket chain before
|
||||
* releasing the lock on the previous page. (This type of lock chaining is
|
||||
* not ideal, so we might want to look for a better solution at some point.)
|
||||
*
|
||||
* We need to retain a pin on the primary bucket to ensure that no concurrent
|
||||
* split can start.
|
||||
*/
|
||||
void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
|
||||
BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
|
||||
uint32 maxbucket, uint32 highmask, uint32 lowmask,
|
||||
double *tuples_removed, double *num_index_tuples,
|
||||
bool split_cleanup,
|
||||
IndexBulkDeleteCallback callback, void *callback_state)
|
||||
{
|
||||
ereport(PANIC, (errmsg("hash_redo: unimplemented")));
|
||||
BlockNumber blkno;
|
||||
Buffer buf;
|
||||
Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
|
||||
bool bucket_dirty = false;
|
||||
|
||||
blkno = bucket_blkno;
|
||||
buf = bucket_buf;
|
||||
|
||||
if (split_cleanup)
|
||||
new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
|
||||
lowmask, maxbucket);
|
||||
|
||||
/* Scan each page in bucket */
|
||||
for (;;) {
|
||||
HashPageOpaque opaque;
|
||||
OffsetNumber offno;
|
||||
OffsetNumber maxoffno;
|
||||
Buffer next_buf;
|
||||
Page page;
|
||||
OffsetNumber deletable[MaxOffsetNumber];
|
||||
int ndeletable = 0;
|
||||
bool retain_pin = false;
|
||||
bool clear_dead_marking = false;
|
||||
|
||||
vacuum_delay_point();
|
||||
|
||||
page = BufferGetPage(buf);
|
||||
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/* Scan each tuple in page */
|
||||
maxoffno = PageGetMaxOffsetNumber(page);
|
||||
for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) {
|
||||
ItemPointer htup;
|
||||
IndexTuple itup;
|
||||
Bucket bucket;
|
||||
bool kill_tuple = false;
|
||||
|
||||
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno));
|
||||
htup = &(itup->t_tid);
|
||||
|
||||
/*
|
||||
* To remove the dead tuples, we strictly want to rely on the results
|
||||
* of the callback function; see btvacuumpage for the detailed reason.
|
||||
*/
|
||||
if (callback && callback(htup, callback_state, InvalidOid)) {
|
||||
kill_tuple = true;
|
||||
if (tuples_removed)
|
||||
*tuples_removed += 1;
|
||||
} else if (split_cleanup) {
|
||||
/* delete the tuples that are moved by split. */
|
||||
bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
|
||||
maxbucket, highmask, lowmask);
|
||||
/* mark the item for deletion */
|
||||
if (bucket != cur_bucket) {
|
||||
/*
|
||||
* We expect tuples to either belong to current bucket or
|
||||
* new_bucket. This is ensured because we don't allow
|
||||
* further splits from bucket that contains garbage. See
|
||||
* comments in _hash_expandtable.
|
||||
*/
|
||||
Assert(bucket == new_bucket);
|
||||
kill_tuple = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (kill_tuple) {
|
||||
/* mark the item for deletion */
|
||||
deletable[ndeletable++] = offno;
|
||||
} else {
|
||||
/* we're keeping it, so count it */
|
||||
if (num_index_tuples)
|
||||
*num_index_tuples += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* retain the pin on primary bucket page till end of bucket scan */
|
||||
if (blkno == bucket_blkno)
|
||||
retain_pin = true;
|
||||
else
|
||||
retain_pin = false;
|
||||
|
||||
blkno = opaque->hasho_nextblkno;
|
||||
|
||||
/*
|
||||
* Apply deletions, advance to next page and write page if needed.
|
||||
*/
|
||||
if (ndeletable > 0) {
|
||||
/* No ereport(ERROR) until changes are logged */
|
||||
START_CRIT_SECTION();
|
||||
|
||||
PageIndexMultiDelete(page, deletable, ndeletable);
|
||||
bucket_dirty = true;
|
||||
|
||||
/*
|
||||
* Let us mark the page as clean if vacuum removes the DEAD tuples
|
||||
* from an index page. We do this by clearing
|
||||
* LH_PAGE_HAS_DEAD_TUPLES flag.
|
||||
*/
|
||||
if (tuples_removed && *tuples_removed > 0 && H_HAS_DEAD_TUPLES(opaque)) {
|
||||
opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
|
||||
clear_dead_marking = true;
|
||||
}
|
||||
|
||||
MarkBufferDirty(buf);
|
||||
|
||||
/* XLOG stuff */
|
||||
if (RelationNeedsWAL(rel)) {
|
||||
xl_hash_delete xlrec;
|
||||
XLogRecPtr recptr;
|
||||
|
||||
xlrec.clear_dead_marking = clear_dead_marking;
|
||||
xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;
|
||||
|
||||
XLogBeginInsert();
|
||||
XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
|
||||
|
||||
/*
|
||||
* bucket buffer needs to be registered to ensure that we can
|
||||
* acquire a cleanup lock on it during replay.
|
||||
*/
|
||||
if (!xlrec.is_primary_bucket_page) {
|
||||
XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
|
||||
}
|
||||
|
||||
XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
|
||||
XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber));
|
||||
|
||||
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
|
||||
if (!xlrec.is_primary_bucket_page) {
|
||||
PageSetLSN(BufferGetPage(bucket_buf), recptr);
|
||||
}
|
||||
PageSetLSN(BufferGetPage(buf), recptr);
|
||||
}
|
||||
|
||||
END_CRIT_SECTION();
|
||||
}
|
||||
|
||||
/* bail out if there are no more pages to scan. */
|
||||
if (!BlockNumberIsValid(blkno))
|
||||
break;
|
||||
|
||||
next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
|
||||
LH_OVERFLOW_PAGE,
|
||||
bstrategy);
|
||||
|
||||
/*
|
||||
* release the lock on previous page after acquiring the lock on next
|
||||
* page
|
||||
*/
|
||||
if (retain_pin)
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
else
|
||||
_hash_relbuf(rel, buf);
|
||||
|
||||
buf = next_buf;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lock the bucket page to clear the garbage flag and squeeze the bucket.
|
||||
* If the current buffer is the same as the bucket buffer, then we already
|
||||
* hold a lock on the bucket page.
|
||||
*/
|
||||
if (buf != bucket_buf) {
|
||||
_hash_relbuf(rel, buf);
|
||||
LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear the garbage flag from bucket after deleting the tuples that are
|
||||
* moved by split. We purposefully clear the flag before squeezing the bucket,
|
||||
* so that after a restart, vacuum doesn't again try to delete the
|
||||
* moved-by-split tuples.
|
||||
*/
|
||||
if (split_cleanup) {
|
||||
HashPageOpaque bucket_opaque;
|
||||
Page page;
|
||||
|
||||
page = BufferGetPage(bucket_buf);
|
||||
bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/* No ereport(ERROR) until changes are logged */
|
||||
START_CRIT_SECTION();
|
||||
|
||||
bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
|
||||
MarkBufferDirty(bucket_buf);
|
||||
|
||||
/* XLOG stuff */
|
||||
if (RelationNeedsWAL(rel)) {
|
||||
XLogRecPtr recptr;
|
||||
|
||||
XLogBeginInsert();
|
||||
XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
|
||||
|
||||
recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
|
||||
PageSetLSN(page, recptr);
|
||||
}
|
||||
|
||||
END_CRIT_SECTION();
|
||||
}
|
||||
|
||||
/*
|
||||
* If we have deleted anything, try to compact free space. For squeezing
|
||||
* the bucket, we must have a cleanup lock, else it can impact the
|
||||
* ordering of tuples for a scan that has started before it.
|
||||
*/
|
||||
if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
|
||||
_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy);
|
||||
else
|
||||
LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
|
||||
}
|
||||
|
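hashbucketcleanup's header comment describes lock chaining along the bucket's overflow chain: the next page is locked before the previous one is released, so a concurrent scan can never pass the cleanup. A toy linked-list version of that ordering, with lock_page/unlock_page standing in for the buffer-lock calls:

#include <stdio.h>

/*
 * "Lock chaining" reduced to a toy linked list: the next node is locked
 * before the previous one is released, so a reader walking the same chain
 * cannot overtake the cleaner.
 */
typedef struct Node { int blkno; struct Node *next; } Node;

static void lock_page(const Node *n)   { printf("lock   %d\n", n->blkno); }
static void unlock_page(const Node *n) { printf("unlock %d\n", n->blkno); }

static void walk_chain(Node *head)
{
    Node *cur = head;
    lock_page(cur);
    while (cur != NULL) {
        Node *next = cur->next;
        /* ... clean dead entries on cur here ... */
        if (next != NULL)
            lock_page(next);       /* acquire next before giving up cur */
        unlock_page(cur);
        cur = next;
    }
}

int main(void)
{
    Node c = {12, NULL}, b = {7, &c}, a = {3, &b};
    walk_chain(&a);
    return 0;
}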
||||
Datum hashmerge(PG_FUNCTION_ARGS)
|
||||
|
|
|
@ -0,0 +1,861 @@
|
|||
/* -------------------------------------------------------------------------
|
||||
*
|
||||
* hash_xlog.cpp
|
||||
* WAL replay logic for hash index.
|
||||
*
|
||||
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/gausskernel/storage/access/hash/hash_xlog.cpp
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "access/xlogproc.h"
|
||||
#include "access/hash.h"
|
||||
#include "access/hash_xlog.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/transam.h"
|
||||
#include "access/xlogproc.h"
|
||||
#include "storage/procarray.h"
|
||||
#include "miscadmin.h"
|
||||
|
||||
/*
|
||||
* replay a hash index meta page
|
||||
*/
|
||||
static void hash_xlog_init_meta_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo metabuf;
|
||||
ForkNumber forknum;
|
||||
|
||||
/* create the index's metapage */
|
||||
XLogInitBufferForRedo(record, 0, &metabuf);
|
||||
Assert(BufferIsValid(metabuf.buf));
|
||||
HashRedoInitMetaPageOperatorPage(&metabuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
|
||||
/*
|
||||
* Force the on-disk state of init forks to always be in sync with the
|
||||
* state in shared buffers. See XLogReadBufferForRedoExtended. We need
|
||||
* special handling for init forks as create index operations don't log a
|
||||
* full page image of the metapage.
|
||||
*/
|
||||
XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
|
||||
if (forknum == INIT_FORKNUM)
|
||||
FlushOneBuffer(metabuf.buf);
|
||||
|
||||
/* all done */
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
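The init-fork handling in hash_xlog_init_meta_page above (and repeated in the bitmap-page routine that follows) flushes the buffer immediately when it belongs to the INIT fork, because index creation never logs a later full-page image for init forks. The decision reduces to a one-line check, sketched here with an illustrative enum and flush helper:

#include <stdio.h>

/*
 * Illustrative only: pages of the INIT fork are flushed right after redo to
 * keep the on-disk state in sync with shared buffers.  The enum and flush
 * function are stand-ins, not openGauss definitions.
 */
typedef enum { MAIN_FORK, FSM_FORK, VM_FORK, INIT_FORK } ForkNum;

static void flush_one_buffer(int buf) { printf("flush buffer %d\n", buf); }

static void maybe_flush_after_redo(int buf, ForkNum fork)
{
    if (fork == INIT_FORK)
        flush_one_buffer(buf);   /* keep disk state in sync with shared buffers */
}

int main(void)
{
    maybe_flush_after_redo(11, INIT_FORK);  /* flushed */
    maybe_flush_after_redo(12, MAIN_FORK);  /* not flushed */
    return 0;
}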
||||
/*
|
||||
* replay a hash index bitmap page
|
||||
*/
|
||||
static void hash_xlog_init_bitmap_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo bitmapbuf;
|
||||
RedoBufferInfo metabuf;
|
||||
ForkNumber forknum;
|
||||
|
||||
/*
|
||||
* Initialize bitmap page
|
||||
*/
|
||||
XLogInitBufferForRedo(record, 0, &bitmapbuf);
|
||||
HashRedoInitBitmapPageOperatorBitmapPage(&bitmapbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(bitmapbuf.buf);
|
||||
|
||||
/*
|
||||
* Force the on-disk state of init forks to always be in sync with the
|
||||
* state in shared buffers. See XLogReadBufferForRedoExtended. We need
|
||||
* special handling for init forks as create index operations don't log a
|
||||
* full page image of the metapage.
|
||||
*/
|
||||
XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL);
|
||||
if (forknum == INIT_FORKNUM)
|
||||
FlushOneBuffer(bitmapbuf.buf);
|
||||
UnlockReleaseBuffer(bitmapbuf.buf);
|
||||
|
||||
/* add the new bitmap page to the metapage's list of bitmaps */
|
||||
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
|
||||
/*
|
||||
* Note: in normal operation, we'd update the metapage while still
|
||||
* holding lock on the bitmap page. But during replay it's not
|
||||
* necessary to hold that lock, since nobody can see it yet; the
|
||||
* creating transaction hasn't yet committed.
|
||||
*/
|
||||
HashRedoInitBitmapPageOperatorMetaPage(&metabuf);
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
|
||||
XLogRecGetBlockTag(record, 1, NULL, &forknum, NULL);
|
||||
if (forknum == INIT_FORKNUM)
|
||||
FlushOneBuffer(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay a hash index insert without split
|
||||
*/
|
||||
static void hash_xlog_insert(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo buffer;
|
||||
RedoBufferInfo metabuf;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) {
|
||||
Size datalen;
|
||||
char *datapos = XLogRecGetBlockData(record, 0, &datalen);
|
||||
|
||||
HashRedoInsertOperatorPage(&buffer, XLogRecGetData(record), datapos, datalen);
|
||||
MarkBufferDirty(buffer.buf);
|
||||
}
|
||||
if (BufferIsValid(buffer.buf))
|
||||
UnlockReleaseBuffer(buffer.buf);
|
||||
|
||||
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
|
||||
/*
|
||||
* Note: in normal operation, we'd update the metapage while still
|
||||
* holding lock on the page we inserted into. But during replay it's
|
||||
* not necessary to hold that lock, since no other index updates can
|
||||
* be happening concurrently.
|
||||
*/
|
||||
HashRedoInsertOperatorMetaPage(&metabuf);
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay addition of overflow page for hash index
|
||||
*/
|
||||
static void hash_xlog_add_ovfl_page(XLogReaderState* record)
|
||||
{
|
||||
RedoBufferInfo leftbuf;
|
||||
RedoBufferInfo ovflbuf;
|
||||
RedoBufferInfo metabuf;
|
||||
BlockNumber leftblk;
|
||||
BlockNumber rightblk;
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk);
|
||||
XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk);
|
||||
|
||||
XLogInitBufferForRedo(record, 0, &ovflbuf);
|
||||
Assert(BufferIsValid(ovflbuf.buf));
|
||||
|
||||
data = XLogRecGetBlockData(record, 0, &datalen);
|
||||
HashRedoAddOvflPageOperatorOvflPage(&ovflbuf, leftblk, data, datalen);
|
||||
MarkBufferDirty(ovflbuf.buf);
|
||||
|
||||
if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) {
|
||||
HashRedoAddOvflPageOperatorLeftPage(&leftbuf, rightblk);
|
||||
MarkBufferDirty(leftbuf.buf);
|
||||
}
|
||||
|
||||
if (BufferIsValid(leftbuf.buf))
|
||||
UnlockReleaseBuffer(leftbuf.buf);
|
||||
UnlockReleaseBuffer(ovflbuf.buf);
|
||||
|
||||
/*
|
||||
* Note: in normal operation, we'd update the bitmap and meta page while
|
||||
* still holding lock on the overflow pages. But during replay it's not
|
||||
* necessary to hold those locks, since no other index updates can be
|
||||
* happening concurrently.
|
||||
*/
|
||||
if (XLogRecHasBlockRef(record, 2)) {
|
||||
RedoBufferInfo mapbuffer;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) {
|
||||
data = XLogRecGetBlockData(record, 2, &datalen);
|
||||
|
||||
HashRedoAddOvflPageOperatorMapPage(&mapbuffer, data);
|
||||
MarkBufferDirty(mapbuffer.buf);
|
||||
}
|
||||
if (BufferIsValid(mapbuffer.buf))
|
||||
UnlockReleaseBuffer(mapbuffer.buf);
|
||||
}
|
||||
|
||||
if (XLogRecHasBlockRef(record, 3)) {
|
||||
RedoBufferInfo newmapbuf;
|
||||
|
||||
XLogInitBufferForRedo(record, 3, &newmapbuf);
|
||||
|
||||
HashRedoAddOvflPageOperatorNewmapPage(&newmapbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(newmapbuf.buf);
|
||||
|
||||
UnlockReleaseBuffer(newmapbuf.buf);
|
||||
}
|
||||
|
||||
if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) {
|
||||
data = XLogRecGetBlockData(record, 4, &datalen);
|
||||
|
||||
HashRedoAddOvflPageOperatorMetaPage(&metabuf, XLogRecGetData(record), data, datalen);
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay allocation of page for split operation
|
||||
*/
|
||||
static void hash_xlog_split_allocate_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo oldbuf;
|
||||
RedoBufferInfo newbuf;
|
||||
RedoBufferInfo metabuf;
|
||||
Size datalen PG_USED_FOR_ASSERTS_ONLY;
|
||||
char *data = NULL;
|
||||
XLogRedoAction action;
|
||||
|
||||
/*
|
||||
* To be consistent with normal operation, here we take cleanup locks on
|
||||
* both the old and new buckets even though there can't be any concurrent
|
||||
* inserts.
|
||||
*/
|
||||
|
||||
/* replay the record for old bucket */
|
||||
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf);
|
||||
|
||||
/*
|
||||
* Note that we still update the page even if it was restored from a full
|
||||
* page image, because the special space is not included in the image.
|
||||
*/
|
||||
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
|
||||
HashRedoSplitAllocatePageOperatorObukPage(&oldbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(oldbuf.buf);
|
||||
}
|
||||
|
||||
/* replay the record for new bucket */
|
||||
XLogInitBufferForRedo(record, 1, &newbuf);
|
||||
HashRedoSplitAllocatePageOperatorNbukPage(&newbuf, XLogRecGetData(record));
|
||||
if (!IsBufferCleanupOK(newbuf.buf))
|
||||
elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock");
|
||||
MarkBufferDirty(newbuf.buf);
|
||||
|
||||
/*
|
||||
* We could release the lock on the old bucket earlier as well, but we do it
|
||||
* here to stay consistent with normal operation.
|
||||
*/
|
||||
if (BufferIsValid(oldbuf.buf))
|
||||
UnlockReleaseBuffer(oldbuf.buf);
|
||||
if (BufferIsValid(newbuf.buf))
|
||||
UnlockReleaseBuffer(newbuf.buf);
|
||||
|
||||
/*
|
||||
* Note: in normal operation, we'd update the meta page while still
|
||||
* holding lock on the old and new bucket pages. But during replay it's
|
||||
* not necessary to hold those locks, since no other bucket splits can be
|
||||
* happening concurrently.
|
||||
*/
|
||||
|
||||
/* replay the record for metapage changes */
|
||||
if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) {
|
||||
data = XLogRecGetBlockData(record, 2, &datalen);
|
||||
|
||||
HashRedoSplitAllocatePageOperatorMetaPage(&metabuf, XLogRecGetData(record), data);
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay of split operation
|
||||
*/
|
||||
static void hash_xlog_split_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo buf;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED)
|
||||
elog(ERROR, "Hash split record did not contain a full-page image");
|
||||
|
||||
if (BufferIsValid(buf.buf))
|
||||
UnlockReleaseBuffer(buf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay completion of split operation
|
||||
*/
|
||||
static void hash_xlog_split_complete(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo oldbuf;
|
||||
RedoBufferInfo newbuf;
|
||||
XLogRedoAction action;
|
||||
|
||||
/* replay the record for old bucket */
|
||||
action = XLogReadBufferForRedo(record, 0, &oldbuf);
|
||||
|
||||
/*
|
||||
* Note that we still update the page even if it was restored from a full
|
||||
* page image, because the bucket flag is not included in the image.
|
||||
*/
|
||||
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
|
||||
HashRedoSplitCompleteOperatorObukPage(&oldbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(oldbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(oldbuf.buf))
|
||||
UnlockReleaseBuffer(oldbuf.buf);
|
||||
|
||||
/* replay the record for new bucket */
|
||||
action = XLogReadBufferForRedo(record, 1, &newbuf);
|
||||
|
||||
/*
|
||||
* Note that we still update the page even if it was restored from a full
|
||||
* page image, because the bucket flag is not included in the image.
|
||||
*/
|
||||
if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) {
|
||||
HashRedoSplitCompleteOperatorNbukPage(&newbuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(newbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(newbuf.buf))
|
||||
UnlockReleaseBuffer(newbuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay move of page contents for squeeze operation of hash index
|
||||
*/
|
||||
static void hash_xlog_move_page_contents(XLogReaderState *record)
|
||||
{
|
||||
XLogRecPtr lsn = record->EndRecPtr;
|
||||
xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record);
|
||||
RedoBufferInfo bucketbuf;
|
||||
RedoBufferInfo writebuf;
|
||||
RedoBufferInfo deletebuf;
|
||||
XLogRedoAction action;
|
||||
|
||||
bucketbuf.buf = InvalidBuffer;
|
||||
writebuf.buf = InvalidBuffer;
|
||||
deletebuf.buf = InvalidBuffer;
|
||||
|
||||
/*
|
||||
* Ensure we have a cleanup lock on primary bucket page before we start
|
||||
* with the actual replay operation. This is to ensure that neither a
|
||||
* scan can start nor a scan can be already-in-progress during the replay
|
||||
* of this operation. If we allow scans during this operation, then they
|
||||
* can miss some records or show the same record multiple times.
|
||||
*/
|
||||
if (xldata->is_prim_bucket_same_wrt) {
|
||||
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
|
||||
} else {
|
||||
/*
|
||||
* we don't care for return value as the purpose of reading bucketbuf
|
||||
* is to ensure a cleanup lock on primary bucket page.
|
||||
*/
|
||||
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
|
||||
|
||||
PageSetLSN(bucketbuf.pageinfo.page, lsn);
|
||||
|
||||
action = XLogReadBufferForRedo(record, 1, &writebuf);
|
||||
}
|
||||
|
||||
/* replay the record for adding entries in overflow buffer */
|
||||
if (action == BLK_NEEDS_REDO) {
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
data = XLogRecGetBlockData(record, 1, &datalen);
|
||||
|
||||
HashXlogMoveAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen);
|
||||
|
||||
MarkBufferDirty(writebuf.buf);
|
||||
}
|
||||
|
||||
/* replay the record for deleting entries from overflow buffer */
|
||||
if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) {
|
||||
char *ptr = NULL;
|
||||
Size len;
|
||||
|
||||
ptr = XLogRecGetBlockData(record, 2, &len);
|
||||
|
||||
HashXlogMoveDeleteOvflPageOperatorPage(&deletebuf, (void *)ptr, len);
|
||||
|
||||
MarkBufferDirty(deletebuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Replay is complete, now we can release the buffers. We release locks at
|
||||
* end of replay operation to ensure that we hold lock on primary bucket
|
||||
* page till end of operation. We could optimize by releasing the lock on the
|
||||
* write buffer as soon as its operation is complete, when it is not the same
|
||||
* as the primary bucket page, but that doesn't seem to be worth
|
||||
* complicating the code.
|
||||
*/
|
||||
if (BufferIsValid(deletebuf.buf))
|
||||
UnlockReleaseBuffer(deletebuf.buf);
|
||||
|
||||
if (BufferIsValid(writebuf.buf))
|
||||
UnlockReleaseBuffer(writebuf.buf);
|
||||
|
||||
if (BufferIsValid(bucketbuf.buf))
|
||||
UnlockReleaseBuffer(bucketbuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay squeeze page operation of hash index
|
||||
*/
|
||||
static void hash_xlog_squeeze_page(XLogReaderState *record)
|
||||
{
|
||||
XLogRecPtr lsn = record->EndRecPtr;
|
||||
xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record);
|
||||
RedoBufferInfo bucketbuf;
|
||||
RedoBufferInfo writebuf;
|
||||
RedoBufferInfo ovflbuf;
|
||||
RedoBufferInfo prevbuf;
|
||||
RedoBufferInfo mapbuf;
|
||||
XLogRedoAction action;
|
||||
|
||||
bucketbuf.buf = InvalidBuffer;
|
||||
prevbuf.buf = InvalidBuffer;
|
||||
|
||||
/*
|
||||
* Ensure we have a cleanup lock on primary bucket page before we start
|
||||
* with the actual replay operation. This is to ensure that neither a
|
||||
* scan can start nor a scan can be already-in-progress during the replay
|
||||
* of this operation. If we allow scans during this operation, then they
|
||||
* can miss some records or show the same record multiple times.
|
||||
*/
|
||||
if (xldata->is_prim_bucket_same_wrt) {
|
||||
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
|
||||
} else {
|
||||
/*
|
||||
* we don't care for return value as the purpose of reading bucketbuf
|
||||
* is to ensure a cleanup lock on primary bucket page.
|
||||
*/
|
||||
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
|
||||
|
||||
PageSetLSN(bucketbuf.pageinfo.page, lsn);
|
||||
|
||||
action = XLogReadBufferForRedo(record, 1, &writebuf);
|
||||
}
|
||||
|
||||
/* replay the record for adding entries in overflow buffer */
|
||||
if (action == BLK_NEEDS_REDO) {
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
data = XLogRecGetBlockData(record, 1, &datalen);
|
||||
|
||||
HashXlogSqueezeAddPageOperatorPage(&writebuf, XLogRecGetData(record), (void *)data, datalen);
|
||||
|
||||
MarkBufferDirty(writebuf.buf);
|
||||
}
|
||||
|
||||
/* replay the record for initializing overflow buffer */
|
||||
if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) {
|
||||
HashXlogSqueezeInitOvflbufOperatorPage(&ovflbuf, XLogRecGetData(record));
|
||||
|
||||
MarkBufferDirty(ovflbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(ovflbuf.buf))
|
||||
UnlockReleaseBuffer(ovflbuf.buf);
|
||||
|
||||
/* replay the record for page previous to the freed overflow page */
|
||||
if (!xldata->is_prev_bucket_same_wrt &&
|
||||
XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) {
|
||||
HashXlogSqueezeUpdatePrevPageOperatorPage(&prevbuf, XLogRecGetData(record));
|
||||
|
||||
MarkBufferDirty(prevbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(prevbuf.buf))
|
||||
UnlockReleaseBuffer(prevbuf.buf);
|
||||
|
||||
/* replay the record for page next to the freed overflow page */
|
||||
if (XLogRecHasBlockRef(record, 4)) {
|
||||
RedoBufferInfo nextbuf;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) {
|
||||
HashXlogSqueezeUpdateNextPageOperatorPage(&nextbuf, XLogRecGetData(record));
|
||||
|
||||
MarkBufferDirty(nextbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(nextbuf.buf))
|
||||
UnlockReleaseBuffer(nextbuf.buf);
|
||||
}
|
||||
|
||||
if (BufferIsValid(writebuf.buf))
|
||||
UnlockReleaseBuffer(writebuf.buf);
|
||||
|
||||
if (BufferIsValid(bucketbuf.buf))
|
||||
UnlockReleaseBuffer(bucketbuf.buf);
|
||||
|
||||
/*
|
||||
* Note: in normal operation, we'd update the bitmap and meta page while
|
||||
* still holding lock on the primary bucket page and overflow pages. But
|
||||
* during replay it's not necessary to hold those locks, since no other
|
||||
* index updates can be happening concurrently.
|
||||
*/
|
||||
/* replay the record for bitmap page */
|
||||
if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) {
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
data = XLogRecGetBlockData(record, 5, &datalen);
|
||||
HashXlogSqueezeUpdateBitmapOperatorPage(&mapbuf, (void *)data);
|
||||
|
||||
MarkBufferDirty(mapbuf.buf);
|
||||
}
|
||||
if (BufferIsValid(mapbuf.buf))
|
||||
UnlockReleaseBuffer(mapbuf.buf);
|
||||
|
||||
/* replay the record for meta page */
|
||||
if (XLogRecHasBlockRef(record, 6)) {
|
||||
RedoBufferInfo metabuf;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) {
|
||||
char *data = NULL;
|
||||
Size datalen;
|
||||
|
||||
data = XLogRecGetBlockData(record, 6, &datalen);
|
||||
HashXlogSqueezeUpdateMateOperatorPage(&metabuf, (void *)data);
|
||||
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* replay delete operation of hash index
|
||||
*/
|
||||
static void hash_xlog_delete(XLogReaderState *record)
|
||||
{
|
||||
XLogRecPtr lsn = record->EndRecPtr;
|
||||
xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record);
|
||||
RedoBufferInfo bucketbuf;
|
||||
RedoBufferInfo deletebuf;
|
||||
XLogRedoAction action;
|
||||
|
||||
bucketbuf.buf = InvalidBuffer;
|
||||
|
||||
/*
|
||||
* Ensure we have a cleanup lock on primary bucket page before we start
|
||||
* with the actual replay operation. This is to ensure that neither a
|
||||
* scan can start nor a scan can be already-in-progress during the replay
|
||||
* of this operation. If we allow scans during this operation, then they
|
||||
* can miss some records or show the same record multiple times.
|
||||
*/
|
||||
if (xldata->is_primary_bucket_page) {
|
||||
action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf);
|
||||
} else {
|
||||
/*
|
||||
* we don't care for return value as the purpose of reading bucketbuf
|
||||
* is to ensure a cleanup lock on primary bucket page.
|
||||
*/
|
||||
(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);
|
||||
|
||||
PageSetLSN(bucketbuf.pageinfo.page, lsn);
|
||||
|
||||
action = XLogReadBufferForRedo(record, 1, &deletebuf);
|
||||
}
|
||||
|
||||
/* replay the record for deleting entries in bucket page */
|
||||
if (action == BLK_NEEDS_REDO) {
|
||||
char *ptr = NULL;
|
||||
Size len;
|
||||
|
||||
ptr = XLogRecGetBlockData(record, 1, &len);
|
||||
|
||||
HashXlogDeleteBlockOperatorPage(&deletebuf, XLogRecGetData(record), (void *)ptr, len);
|
||||
|
||||
MarkBufferDirty(deletebuf.buf);
|
||||
}
|
||||
if (BufferIsValid(deletebuf.buf))
|
||||
UnlockReleaseBuffer(deletebuf.buf);
|
||||
|
||||
if (BufferIsValid(bucketbuf.buf))
|
||||
UnlockReleaseBuffer(bucketbuf.buf);
|
||||
}
|
||||
|
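The delete, move and squeeze redo routines above all make the same locking choice: if the modified page is the primary bucket page, one cleanup lock on it suffices; otherwise the separately registered primary bucket page is cleanup-locked first. A reduced sketch of that branch, with the buffer machinery replaced by prints:

#include <stdbool.h>
#include <stdio.h>

/*
 * Shape of the replay-side locking choice; all functions are stand-ins.
 */
static void cleanup_lock(const char *what) { printf("cleanup-lock %s\n", what); }
static void normal_lock(const char *what)  { printf("lock %s\n", what); }

static void replay_delete(bool is_primary_bucket_page)
{
    if (is_primary_bucket_page) {
        cleanup_lock("target page (= primary bucket page)");
    } else {
        cleanup_lock("primary bucket page");
        normal_lock("target overflow page");
    }
    printf("apply deletions\n");
}

int main(void)
{
    replay_delete(true);
    replay_delete(false);
    return 0;
}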
||||
/*
|
||||
* replay split cleanup flag operation for primary bucket page.
|
||||
*/
|
||||
static void hash_xlog_split_cleanup(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo buffer;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) {
|
||||
HashXlogSplitCleanupOperatorPage(&buffer);
|
||||
|
||||
MarkBufferDirty(buffer.buf);
|
||||
}
|
||||
if (BufferIsValid(buffer.buf))
|
||||
UnlockReleaseBuffer(buffer.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* replay for update meta page
|
||||
*/
|
||||
static void hash_xlog_update_meta_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo metabuf;
|
||||
|
||||
if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) {
|
||||
HashXlogUpdateMetaOperatorPage(&metabuf, XLogRecGetData(record));
|
||||
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the latestRemovedXid from the heap pages pointed at by the index
|
||||
* tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid,
|
||||
* on which this function is based.
|
||||
*/
|
||||
static TransactionId hash_xlog_vacuum_get_latestRemovedXid(XLogReaderState *record)
|
||||
{
|
||||
xl_hash_vacuum_one_page *xlrec;
|
||||
OffsetNumber *unused = NULL;
|
||||
Buffer ibuffer;
|
||||
Buffer hbuffer;
|
||||
Page ipage;
|
||||
Page hpage;
|
||||
RelFileNode rnode;
|
||||
BlockNumber blkno;
|
||||
ItemId iitemid;
|
||||
ItemId hitemid;
|
||||
IndexTuple itup;
|
||||
BlockNumber hblkno;
|
||||
OffsetNumber hoffnum;
|
||||
TransactionId latestRemovedXid = InvalidTransactionId;
|
||||
int i;
|
||||
|
||||
xlrec = (xl_hash_vacuum_one_page *) XLogRecGetData(record);
|
||||
|
||||
/*
|
||||
* If there's nothing running on the standby we don't need to derive a
|
||||
* full latestRemovedXid value, so use a fast path out of here. This
|
||||
* returns InvalidTransactionId, and so will conflict with all HS
|
||||
* transactions; but since we just worked out that that's zero people,
|
||||
* it's OK.
|
||||
*
|
||||
* XXX There is a race condition here, which is that a new backend might
|
||||
* start just after we look. If so, it cannot need to conflict, but this
|
||||
* coding will result in throwing a conflict anyway.
|
||||
*/
|
||||
if (CountDBBackends(InvalidOid) == 0)
|
||||
return latestRemovedXid;
|
||||
|
||||
/*
|
||||
* Check if WAL replay has reached a consistent database state. If not, we
|
||||
* must PANIC. See the definition of
|
||||
* btree_xlog_delete_get_latestRemovedXid for more details.
|
||||
*/
|
||||
if (!t_thrd.xlog_cxt.reachedConsistency)
|
||||
elog(PANIC, "hash_xlog_vacuum_get_latestRemovedXid: cannot operate with inconsistent data");
|
||||
|
||||
/*
|
||||
* Get index page. If the DB is consistent, this should not fail, nor
|
||||
* should any of the heap page fetches below. If one does, we return
|
||||
* InvalidTransactionId to cancel all HS transactions. That's probably
|
||||
* overkill, but it's safe, and certainly better than panicking here.
|
||||
*/
|
||||
XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno);
|
||||
ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL);
|
||||
|
||||
if (!BufferIsValid(ibuffer))
|
||||
return InvalidTransactionId;
|
||||
LockBuffer(ibuffer, HASH_READ);
|
||||
ipage = (Page) BufferGetPage(ibuffer);
|
||||
|
||||
/*
|
||||
* Loop through the deleted index items to obtain the TransactionId from
|
||||
* the heap items they point to.
|
||||
*/
|
||||
unused = (OffsetNumber *) ((char *) xlrec + SizeOfHashVacuumOnePage);
|
||||
|
||||
for (i = 0; i < xlrec->ntuples; i++) {
|
||||
/*
|
||||
* Identify the index tuple about to be deleted.
|
||||
*/
|
||||
iitemid = PageGetItemId(ipage, unused[i]);
|
||||
itup = (IndexTuple) PageGetItem(ipage, iitemid);
|
||||
|
||||
/*
|
||||
* Locate the heap page that the index tuple points at
|
||||
*/
|
||||
hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
|
||||
hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL);
|
||||
|
||||
if (!BufferIsValid(hbuffer)) {
|
||||
UnlockReleaseBuffer(ibuffer);
|
||||
return InvalidTransactionId;
|
||||
}
|
||||
LockBuffer(hbuffer, HASH_READ);
|
||||
hpage = (Page) BufferGetPage(hbuffer);
|
||||
|
||||
/*
|
||||
* Look up the heap tuple header that the index tuple points at by
|
||||
* using the heap node supplied with the xlrec. We can't use
|
||||
* heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
|
||||
* Note that we are not looking at tuple data here, just headers.
|
||||
*/
|
||||
hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
|
||||
hitemid = PageGetItemId(hpage, hoffnum);
|
||||
|
||||
/*
|
||||
* Follow any redirections until we find something useful.
|
||||
*/
|
||||
while (ItemIdIsRedirected(hitemid)) {
|
||||
hoffnum = ItemIdGetRedirect(hitemid);
|
||||
hitemid = PageGetItemId(hpage, hoffnum);
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
}
|
||||
|
||||
/*
|
||||
* If the heap item has storage, then read the header and use that to
|
||||
* set latestRemovedXid.
|
||||
*
|
||||
* Some LP_DEAD items may not be accessible, so we ignore them.
|
||||
*/
|
||||
if (ItemIdHasStorage(hitemid)) {
|
||||
HeapTupleData tuple;
|
||||
tuple.t_data = (HeapTupleHeader) PageGetItem(hpage, hitemid);
|
||||
HeapTupleCopyBaseFromPage(&tuple, &hpage);
|
||||
HeapTupleHeaderAdvanceLatestRemovedXid(&tuple, &latestRemovedXid);
|
||||
} else if (ItemIdIsDead(hitemid)) {
|
||||
/*
|
||||
* Conjecture: if hitemid is dead then it had xids before the xids
|
||||
* marked on LP_NORMAL items. So we just ignore this item and move
|
||||
* onto the next, for the purposes of calculating
|
||||
* latestRemovedxids.
|
||||
*/
|
||||
} else
|
||||
Assert(!ItemIdIsUsed(hitemid));
|
||||
|
||||
UnlockReleaseBuffer(hbuffer);
|
||||
}
|
||||
|
||||
UnlockReleaseBuffer(ibuffer);
|
||||
|
||||
/*
|
||||
* If all heap tuples were LP_DEAD then we will be returning
|
||||
* InvalidTransactionId here, which avoids conflicts. This matches
|
||||
* existing logic which assumes that LP_DEAD tuples must already be older
|
||||
* than the latestRemovedXid on the cleanup record that set them as
|
||||
* LP_DEAD, hence must already have generated a conflict.
|
||||
*/
|
||||
return latestRemovedXid;
|
||||
}
|
||||
|
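The function above derives a latestRemovedXid by visiting the heap tuples behind each index entry being removed and keeping the newest xid it finds, so recovery cancels only the standby queries that could still see those tuples. A toy version of the accumulation step, using plain 64-bit ids and ignoring wraparound, which the real TransactionId logic must handle:

#include <stdint.h>
#include <stdio.h>

/*
 * Toy latestRemovedXid accumulation: real code reads these xids from heap
 * tuple headers and uses wraparound-aware comparisons.
 */
typedef uint64_t TxnId;
#define INVALID_TXN 0

static void advance_latest_removed(TxnId candidate, TxnId *latest)
{
    if (candidate != INVALID_TXN && candidate > *latest)
        *latest = candidate;
}

int main(void)
{
    TxnId xids[] = {904, INVALID_TXN, 1201, 977};
    TxnId latest = INVALID_TXN;
    for (unsigned i = 0; i < sizeof(xids) / sizeof(xids[0]); i++)
        advance_latest_removed(xids[i], &latest);
    printf("latestRemovedXid = %llu\n", (unsigned long long) latest);
    return 0;
}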
||||
/*
|
||||
* replay delete operation in hash index to remove
|
||||
* tuples marked as DEAD during index tuple insertion.
|
||||
*/
|
||||
static void hash_xlog_vacuum_one_page(XLogReaderState *record)
|
||||
{
|
||||
RedoBufferInfo buffer;
|
||||
RedoBufferInfo metabuf;
|
||||
XLogRedoAction action;
|
||||
|
||||
/*
|
||||
* If we have any conflict processing to do, it must happen before we
|
||||
* update the page.
|
||||
*
|
||||
* Hash index records that are marked as LP_DEAD and being removed during
|
||||
* hash index tuple insertion can conflict with standby queries. You might
|
||||
* think that vacuum records would conflict as well, but we've handled
|
||||
* that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
|
||||
* cleaned by the vacuum of the heap and so we can resolve any conflicts
|
||||
* just once when that arrives. After that we know that no conflicts
|
||||
* exist from individual hash index vacuum records on that index.
|
||||
*/
|
||||
if (InHotStandby) {
|
||||
TransactionId latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record);
|
||||
RelFileNode rnode;
|
||||
|
||||
XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
|
||||
ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
|
||||
}
|
||||
|
||||
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer);
|
||||
|
||||
if (action == BLK_NEEDS_REDO) {
|
||||
Size len;
|
||||
|
||||
len = XLogRecGetDataLen(record);
|
||||
HashXlogVacuumOnePageOperatorPage(&buffer, XLogRecGetData(record), len);
|
||||
|
||||
MarkBufferDirty(buffer.buf);
|
||||
}
|
||||
if (BufferIsValid(buffer.buf))
|
||||
UnlockReleaseBuffer(buffer.buf);
|
||||
|
||||
if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) {
|
||||
HashXlogVacuumMateOperatorPage(&metabuf, XLogRecGetData(record));
|
||||
MarkBufferDirty(metabuf.buf);
|
||||
}
|
||||
if (BufferIsValid(metabuf.buf))
|
||||
UnlockReleaseBuffer(metabuf.buf);
|
||||
}
|
||||
|
||||
void hash_redo(XLogReaderState *record)
|
||||
{
|
||||
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
|
||||
|
||||
switch (info) {
|
||||
case XLOG_HASH_INIT_META_PAGE:
|
||||
hash_xlog_init_meta_page(record);
|
||||
break;
|
||||
case XLOG_HASH_INIT_BITMAP_PAGE:
|
||||
hash_xlog_init_bitmap_page(record);
|
||||
break;
|
||||
case XLOG_HASH_INSERT:
|
||||
hash_xlog_insert(record);
|
||||
break;
|
||||
case XLOG_HASH_ADD_OVFL_PAGE:
|
||||
hash_xlog_add_ovfl_page(record);
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
|
||||
hash_xlog_split_allocate_page(record);
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_PAGE:
|
||||
hash_xlog_split_page(record);
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_COMPLETE:
|
||||
hash_xlog_split_complete(record);
|
||||
break;
|
||||
case XLOG_HASH_MOVE_PAGE_CONTENTS:
|
||||
hash_xlog_move_page_contents(record);
|
||||
break;
|
||||
case XLOG_HASH_SQUEEZE_PAGE:
|
||||
hash_xlog_squeeze_page(record);
|
||||
break;
|
||||
case XLOG_HASH_DELETE:
|
||||
hash_xlog_delete(record);
|
||||
break;
|
||||
case XLOG_HASH_SPLIT_CLEANUP:
|
||||
hash_xlog_split_cleanup(record);
|
||||
break;
|
||||
case XLOG_HASH_UPDATE_META_PAGE:
|
||||
hash_xlog_update_meta_page(record);
|
||||
break;
|
||||
case XLOG_HASH_VACUUM_ONE_PAGE:
|
||||
hash_xlog_vacuum_one_page(record);
|
||||
break;
|
||||
default:
|
||||
elog(PANIC, "hash_redo: unknown op code %u", info);
|
||||
}
|
||||
}
|
||||
|
||||
bool IsHashVacuumPages(XLogReaderState *record)
|
||||
{
|
||||
uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
|
||||
|
||||
if (XLogRecGetRmid(record) == RM_HASH_ID) {
|
||||
if (info == XLOG_HASH_DELETE) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|

@ -3,8 +3,8 @@
 * hashinsert.cpp
 *   Item insertion in hash tables for Postgres.
 *
 * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
@ -17,8 +17,16 @@
#include "knl/knl_variable.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/heapam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "storage/lock/lwlock.h"
#include "storage/buf/buf_internals.h"

static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode);

/*
 * _hash_doinsert() -- Handle insertion of a single index tuple.
@ -26,12 +34,13 @@
 * This routine is called by the public interface routines, hashbuild
 * and hashinsert. By here, itup is completely filled in.
 */
void _hash_doinsert(Relation rel, IndexTuple itup)
void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel)
{
    Buffer buf;
    Buffer bucket_buf;
    Buffer metabuf;
    HashMetaPage metap;
    BlockNumber blkno;
    HashMetaPage usedmetap = NULL;
    Page metapage;
    Page page;
    HashPageOpaque pageopaque;
@ -39,7 +48,7 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
    bool do_expand = false;
    uint32 hashkey;
    Bucket bucket;

    OffsetNumber itup_off;
    /*
     * Get the hash key for the item (it's stored in the index tuple itself).
     */
@ -49,16 +58,16 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
    itemsz = IndexTupleDSize(*itup);
    itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
                                * need to be consistent */
    /*
     * Acquire shared split lock so we can compute the target bucket safely
     * (see README).
     */
    _hash_getlock(rel, 0, HASH_SHARE);

    /* Read the metapage */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
restart_insert:

    /*
     * Read the metapage. We don't lock it yet; HashMaxItemSize() will
     * examine pd_pagesize_version, but that can't change so we can examine it
     * without a lock.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
    metapage = BufferGetPage(metabuf);
    metap = HashPageGetMeta(metapage);

    /*
     * Check whether the item can fit on a hash page at all. (Eventually, we
@ -73,87 +82,154 @@ void _hash_doinsert(Relation rel, IndexTuple itup)
                (unsigned long)HashMaxItemSize(metapage)),
            errhint("Values larger than a buffer page cannot be indexed.")));

    /*
     * Compute the target bucket number, and convert to block number.
     */
    bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask);
    /* Lock the primary bucket page for the target bucket. */
    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_WRITE, &usedmetap);
    Assert(usedmetap != NULL);

    blkno = BUCKET_TO_BLKNO(metap, bucket);
    /* remember the primary bucket buffer to release the pin on it at end. */
    bucket_buf = buf;

    /* release lock on metapage, but keep pin since we'll need it again */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /*
     * Acquire share lock on target bucket; then we can release split lock.
     */
    _hash_getlock(rel, blkno, HASH_SHARE);

    _hash_droplock(rel, 0, HASH_SHARE);

    /* Fetch the primary bucket page for the bucket */
    buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
    page = BufferGetPage(buf);
    pageopaque = (HashPageOpaque)PageGetSpecialPointer(page);
    Assert(pageopaque->hasho_bucket == bucket);
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    bucket = pageopaque->hasho_bucket;

    /*
     * If this bucket is in the process of being split, try to finish the
     * split before inserting, because that might create room for the
     * insertion to proceed without allocating an additional overflow page.
     * It's only interesting to finish the split if we're trying to insert
     * into the bucket from which we're removing tuples (the "old" bucket),
     * not if we're trying to insert into the bucket into which tuples are
     * being moved (the "new" bucket).
     */
    if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf)) {
        /* release the lock on bucket buffer, before completing the split. */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        _hash_finish_split(rel, metabuf, buf, bucket,
                           usedmetap->hashm_maxbucket,
                           usedmetap->hashm_highmask,
                           usedmetap->hashm_lowmask);

        /* release the pin on old and meta buffer. retry for insert. */
        _hash_dropbuf(rel, buf);
        _hash_dropbuf(rel, metabuf);
        goto restart_insert;
    }

    /* Do the insertion */
    while (PageGetFreeSpace(page) < itemsz) {
        BlockNumber nextblkno;

        /*
         * Check if current page has any DEAD tuples. If yes, delete these
         * tuples and see if we can get a space for the new item to be
         * inserted before moving to the next page in the bucket chain.
         */
        if (H_HAS_DEAD_TUPLES(pageopaque)) {
            if (IsBufferCleanupOK(buf)) {
                _hash_vacuum_one_page(rel, metabuf, buf, heapRel->rd_node);

                if (PageGetFreeSpace(page) >= itemsz)
                    break; /* OK, now we have enough space */
            }
        }

        /*
         * no space on this page; check for an overflow page
         */
        BlockNumber nextblkno = pageopaque->hasho_nextblkno;
        nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno)) {
            /*
             * ovfl page exists; go get it. if it doesn't have room, we'll
             * find out next pass through the loop test above.
             * find out next pass through the loop test above. we always
             * release both the lock and pin if this is an overflow page, but
             * only the lock if this is the primary bucket page, since the pin
             * on the primary bucket must be retained throughout the scan.
             */
            if (buf != bucket_buf)
                _hash_relbuf(rel, buf);
            else
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
        } else {
            /*
             * we're at the end of the bucket chain and we haven't found a
             * page with enough room. allocate a new overflow page.
             *
             * release our write lock without modifying buffer
             */
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

            /* release our write lock without modifying buffer */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf);
            buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        pageopaque = (HashPageOpaque)PageGetSpecialPointer(page);
        Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /* found page with enough space, so add the item here */
    (void)_hash_pgaddtup(rel, buf, itemsz, itup);

    /* write and release the modified page */
    _hash_wrtbuf(rel, buf);

    /* We can drop the bucket lock now */
    _hash_droplock(rel, blkno, HASH_SHARE);

    /*
     * Write-lock the metapage so we can increment the tuple count. After
     * incrementing it, check to see if it's time for a split.
     */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* Do the update. No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* found page with enough space, so add the item here */
    itup_off = _hash_pgaddtup(rel, buf, itemsz, itup);
    MarkBufferDirty(buf);

    /* metapage operations */
    metap = HashPageGetMeta(metapage);
    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples > (double)metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel)) {
        xl_hash_insert xlrec;
        XLogRecPtr recptr;

        xlrec.offnum = itup_off;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInsert);

        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

        PageSetLSN(BufferGetPage(buf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* drop lock on metapage, but keep pin */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /*
     * Release the modified page and ensure to release the pin on primary
     * page.
     */
    _hash_relbuf(rel, buf);
    if (buf != bucket_buf)
        _hash_dropbuf(rel, bucket_buf);

    /* Attempt to split if a split is needed */
    if (do_expand)

@ -192,3 +268,130 @@ OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple

    return itup_off;
}

/*
 * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the index.
 *
 * This routine has same requirements for locking and tuple ordering as
 * _hash_pgaddtup().
 *
 * Returns the offset number array at which the tuples were inserted.
 */
void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, OffsetNumber *itup_offsets, uint16 nitups)
{
    OffsetNumber itup_off;
    Page page;
    uint32 hashkey;
    int i;

    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    page = BufferGetPage(buf);

    for (i = 0; i < nitups; i++) {
        Size itemsize;

        itemsize = IndexTupleDSize(*itups[i]);
        itemsize = MAXALIGN(itemsize);

        /* Find where to insert the tuple (preserving page's hashkey ordering) */
        hashkey = _hash_get_indextuple_hashkey(itups[i]);
        itup_off = _hash_binsearch(page, hashkey);

        itup_offsets[i] = itup_off;

        if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) == InvalidOffsetNumber)
            elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel));
    }
}

/*
 * _hash_vacuum_one_page - vacuum just one index page.
 *
 * Try to remove LP_DEAD items from the given page. We must acquire cleanup
 * lock on the page being modified before calling this function.
 */

static void _hash_vacuum_one_page(Relation rel, Buffer metabuf, Buffer buf, RelFileNode hnode)
{
    OffsetNumber deletable[MaxOffsetNumber];
    int ndeletable = 0;
    OffsetNumber offnum;
    OffsetNumber maxoff;
    Page page = BufferGetPage(buf);
    HashPageOpaque pageopaque;
    HashMetaPage metap;

    /* Scan each tuple in page to see if it is marked as LP_DEAD */
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) {
        ItemId itemId = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemId))
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0) {
        /*
         * Write-lock the meta page so that we can decrement tuple count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        PageIndexMultiDelete(page, deletable, ndeletable);

        /*
         * Mark the page as not containing any LP_DEAD items. This is not
         * certainly true (there might be some that have recently been marked,
         * but weren't included in our target-item list), but it will almost
         * always be true and it doesn't seem worth an additional page scan to
         * check it. Remember that LH_PAGE_HAS_DEAD_TUPLES is only a hint
         * anyway.
         */
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

        metap = HashPageGetMeta(BufferGetPage(metabuf));
        metap->hashm_ntuples -= ndeletable;

        MarkBufferDirty(buf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel)) {
            xl_hash_vacuum_one_page xlrec;
            XLogRecPtr recptr;

            xlrec.hnode = hnode;
            xlrec.ntuples = ndeletable;

            XLogBeginInsert();
            XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
            XLogRegisterData((char *) &xlrec, SizeOfHashVacuumOnePage);

            /*
             * We need the target-offsets array whether or not we store the
             * whole buffer, to allow us to find the latestRemovedXid on a
             * standby server.
             */
            XLogRegisterData((char *) deletable,
                             ndeletable * sizeof(OffsetNumber));

            XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_VACUUM_ONE_PAGE);

            PageSetLSN(BufferGetPage(buf), recptr);
            PageSetLSN(BufferGetPage(metabuf), recptr);
        }

        END_CRIT_SECTION();

        /*
         * Releasing write lock on meta page as we have updated the tuple
         * count.
         */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    }
}

[File diff suppressed because it is too large]
[File diff suppressed because it is too large]

@ -1,138 +0,0 @@
/* -------------------------------------------------------------------------
 *
 * hashscan.cpp
 *   manage scans on hash tables
 *
 * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *   src/gausskernel/storage/access/hash/hashscan.cpp
 *
 * -------------------------------------------------------------------------
 */
#include "postgres.h"
#include "knl/knl_variable.h"

#include "access/hash.h"
#include "access/relscan.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "utils/resowner.h"

/*
 * We track all of a backend's active scans on hash indexes using a list
 * of HashScanListData structs, which are allocated in t_thrd.top_mem_cxt.
 * It's okay to use a long-lived context because we rely on the ResourceOwner
 * mechanism to clean up unused entries after transaction or subtransaction
 * abort. We can't safely keep the entries in the executor's per-query
 * context, because that might be already freed before we get a chance to
 * clean up the list. (XXX seems like there should be a better way to
 * manage this...)
 */
typedef struct HashScanListData {
    IndexScanDesc hashsl_scan;
    ResourceOwner hashsl_owner;
    struct HashScanListData *hashsl_next;
} HashScanListData;

typedef HashScanListData *HashScanList;

/*
 * ReleaseResources_hash() --- clean up hash subsystem resources.
 *
 * This is here because it needs to touch this module's static var HashScans.
 */
void ReleaseResources_hash(void)
{
    HashScanList l = NULL;
    HashScanList prev = NULL;
    HashScanList next = NULL;

    /*
     * Release all HashScanList items belonging to the current ResourceOwner.
     * Note that we do not release the underlying IndexScanDesc; that's in
     * executor memory and will go away on its own (in fact quite possibly has
     * gone away already, so we mustn't try to touch it here).
     *
     * Note: this should be a no-op during normal query shutdown. However, in
     * an abort situation ExecutorEnd is not called and so there may be open
     * index scans to clean up.
     */
    prev = NULL;

    for (l = u_sess->exec_cxt.HashScans; l != NULL; l = next) {
        next = l->hashsl_next;
        if (l->hashsl_owner == t_thrd.utils_cxt.CurrentResourceOwner) {
            if (prev == NULL)
                u_sess->exec_cxt.HashScans = next;
            else
                prev->hashsl_next = next;

            pfree(l);
            /* prev does not change */
        } else
            prev = l;
    }
}

/*
 * _hash_regscan() -- register a new scan.
 */
void _hash_regscan(IndexScanDesc scan)
{
    HashScanList new_el;

    new_el = (HashScanList)MemoryContextAlloc(
        SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE), sizeof(HashScanListData));
    new_el->hashsl_scan = scan;
    new_el->hashsl_owner = t_thrd.utils_cxt.CurrentResourceOwner;
    new_el->hashsl_next = u_sess->exec_cxt.HashScans;
    u_sess->exec_cxt.HashScans = new_el;
}

/*
 * _hash_dropscan() -- drop a scan from the scan list
 */
void _hash_dropscan(IndexScanDesc scan)
{
    HashScanList chk = NULL;
    HashScanList last = NULL;

    last = NULL;
    for (chk = u_sess->exec_cxt.HashScans; chk != NULL && chk->hashsl_scan != scan; chk = chk->hashsl_next)
        last = chk;

    if (chk == NULL)
        ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("hash scan list trashed")));

    if (last == NULL)
        u_sess->exec_cxt.HashScans = chk->hashsl_next;
    else
        last->hashsl_next = chk->hashsl_next;

    pfree(chk);
}

/*
 * Is there an active scan in this bucket?
 */
bool _hash_has_active_scan(Relation rel, Bucket bucket)
{
    Oid relid = RelationGetRelid(rel);
    HashScanList l = NULL;

    for (l = u_sess->exec_cxt.HashScans; l != NULL; l = l->hashsl_next) {
        if (relid == l->hashsl_scan->indexRelation->rd_id) {
            HashScanOpaque so = (HashScanOpaque)l->hashsl_scan->opaque;

            if (so->hashso_bucket_valid && so->hashso_bucket == bucket)
                return true;
        }
    }

    return false;
}

@ -3,8 +3,8 @@
 * hashsearch.cpp
 *   search code for postgres hash tables
 *
 * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
@ -64,40 +64,131 @@ bool _hash_next(IndexScanDesc scan, ScanDirection dir)
}

/*
 * Advance to next page in a bucket, if any.
 * Advance to next page in a bucket, if any. If we are scanning the bucket
 * being populated during split operation then this function advances to the
 * bucket being split after the last bucket page of bucket being populated.
 */
static void _hash_readnext(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
static void _hash_readnext(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep)
{
    BlockNumber blkno;
    Relation rel = scan->indexRelation;
    HashScanOpaque so = (HashScanOpaque)scan->opaque;
    bool block_found = false;

    blkno = (*opaquep)->hasho_nextblkno;

    /*
     * Retain the pin on primary bucket page till the end of scan. Refer the
     * comments in _hash_first to know the reason of retaining pin.
     */
    if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
        LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
    else
        _hash_relbuf(rel, *bufp);

    *bufp = InvalidBuffer;
    /* check for interrupts while we're not holding any buffer lock */
    CHECK_FOR_INTERRUPTS();
    if (BlockNumberIsValid(blkno)) {
        *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
        block_found = true;
    } else if (so->hashso_buc_populated && !so->hashso_buc_split) {
        /*
         * end of bucket, scan bucket being split if there was a split in
         * progress at the start of scan.
         */
        *bufp = so->hashso_split_bucket_buf;

        /*
         * buffer for bucket being split must be valid as we acquire the pin
         * on it before the start of scan and retain it till end of scan.
         */
        Assert(BufferIsValid(*bufp));

        LockBuffer(*bufp, BUFFER_LOCK_SHARE);

        /*
         * setting hashso_buc_split to true indicates that we are scanning
         * bucket being split.
         */
        so->hashso_buc_split = true;

        block_found = true;
    }

    if (block_found) {
        *pagep = BufferGetPage(*bufp);
        *opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep);
        *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
    }
}

/*
 * Advance to previous page in a bucket, if any.
 * Advance to previous page in a bucket, if any. If the current scan has
 * started during split operation then this function advances to bucket
 * being populated after the first bucket page of bucket being split.
 */
static void _hash_readprev(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
static void _hash_readprev(IndexScanDesc scan, Buffer* bufp, Page* pagep, HashPageOpaque* opaquep)
{
    BlockNumber blkno;

    Relation rel = scan->indexRelation;
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool haveprevblk;

    blkno = (*opaquep)->hasho_prevblkno;
    /*
     * Retain the pin on primary bucket page till the end of scan. Refer the
     * comments in _hash_first to know the reason of retaining pin.
     */
    if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf) {
        LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
        haveprevblk = false;
    } else {
        _hash_relbuf(rel, *bufp);
        haveprevblk = true;
    }
    *bufp = InvalidBuffer;
    /* check for interrupts while we're not holding any buffer lock */
    CHECK_FOR_INTERRUPTS();
    if (BlockNumberIsValid(blkno)) {
        *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
    if (haveprevblk) {
        Assert(BlockNumberIsValid(blkno));
        *bufp = _hash_getbuf(rel, blkno, HASH_READ,
                             LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
        *pagep = BufferGetPage(*bufp);
        *opaquep = (HashPageOpaque)PageGetSpecialPointer(*pagep);
        *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);

        /*
         * We always maintain the pin on bucket page for whole scan operation,
         * so releasing the additional pin we have acquired here.
         */
        if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
            _hash_dropbuf(rel, *bufp);
    } else if (so->hashso_buc_populated && so->hashso_buc_split) {
        /*
         * end of bucket, scan bucket being populated if there was a split in
         * progress at the start of scan.
         */
        *bufp = so->hashso_bucket_buf;

        /*
         * buffer for bucket being populated must be valid as we acquire the
         * pin on it before the start of scan and retain it till end of scan.
         */
        Assert(BufferIsValid(*bufp));

        LockBuffer(*bufp, BUFFER_LOCK_SHARE);
        *pagep = BufferGetPage(*bufp);
        *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);

        /* move to the end of bucket chain */
        while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
            _hash_readnext(scan, bufp, pagep, opaquep);

        /*
         * setting hashso_buc_split to false indicates that we are scanning
         * bucket being populated.
         */
        so->hashso_buc_split = false;
    }
}

@ -117,12 +208,9 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
    ScanKey cur;
    uint32 hashkey;
    Bucket bucket;
    BlockNumber blkno;
    Buffer buf;
    Buffer metabuf;
    Page page;
    HashPageOpaque opaque;
    HashMetaPage metap;
    IndexTuple itup;
    ItemPointer current;
    OffsetNumber offnum;
@ -174,48 +262,71 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)

    so->hashso_sk_hash = hashkey;

    /*
     * Acquire shared split lock so we can compute the target bucket safely
     * (see README).
     */
    _hash_getlock(rel, 0, HASH_SHARE);

    /* Read the metapage */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /*
     * Compute the target bucket number, and convert to block number.
     */
    bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask);

    blkno = BUCKET_TO_BLKNO(metap, bucket);

    /* done with the metapage */
    _hash_relbuf(rel, metabuf);

    /*
     * Acquire share lock on target bucket; then we can release split lock.
     */
    _hash_getlock(rel, blkno, HASH_SHARE);

    _hash_droplock(rel, 0, HASH_SHARE);

    /* Update scan opaque state to show we have lock on the bucket */
    so->hashso_bucket = bucket;
    so->hashso_bucket_valid = true;
    so->hashso_bucket_blkno = blkno;

    /* Fetch the primary bucket page for the bucket */
    buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
    buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ, NULL);
    page = BufferGetPage(buf);
    opaque = (HashPageOpaque)PageGetSpecialPointer(page);
    opaque = (HashPageOpaque) PageGetSpecialPointer(page);
    bucket = opaque->hasho_bucket;

    so->hashso_bucket_buf = buf;
    /*
     * If a bucket split is in progress, then while scanning the bucket being
     * populated, we need to skip tuples that were copied from bucket being
     * split. We also need to maintain a pin on the bucket being split to
     * ensure that split-cleanup work done by vacuum doesn't remove tuples
     * from it till this scan is done. We need to maintain a pin on the
     * bucket being populated to ensure that vacuum doesn't squeeze that
     * bucket till this scan is complete; otherwise, the ordering of tuples
     * can't be maintained during forward and backward scans. Here, we have
     * to be cautious about locking order: first, acquire the lock on bucket
     * being split; then, release the lock on it but not the pin; then,
     * acquire a lock on bucket being populated and again re-verify whether
     * the bucket split is still in progress. Acquiring the lock on bucket
     * being split first ensures that the vacuum waits for this scan to
     * finish.
     */
    if (H_BUCKET_BEING_POPULATED(opaque)) {
        BlockNumber old_blkno;
        Buffer old_buf;

        old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);

        /*
         * release the lock on new bucket and re-acquire it after acquiring
         * the lock on old bucket.
         */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);

        /*
         * remember the split bucket buffer so as to use it later for
         * scanning.
         */
        so->hashso_split_bucket_buf = old_buf;
        LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);

        LockBuffer(buf, BUFFER_LOCK_SHARE);
        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->hasho_bucket == bucket);

        if (H_BUCKET_BEING_POPULATED(opaque)) {
            so->hashso_buc_populated = true;
        } else {
            _hash_dropbuf(rel, so->hashso_split_bucket_buf);
            so->hashso_split_bucket_buf = InvalidBuffer;
        }
    }

    /* If a backwards scan is requested, move to the end of the chain */
    if (ScanDirectionIsBackward(dir)) {
        while (BlockNumberIsValid(opaque->hasho_nextblkno))
            _hash_readnext(rel, &buf, &page, &opaque);
        /*
         * Backward scans that start during split needs to start from end of
         * bucket being split.
         */
        while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
               (so->hashso_buc_populated && !so->hashso_buc_split))
            _hash_readnext(scan, &buf, &page, &opaque);
    }

    /* Now find the first tuple satisfying the qualification */
@ -239,6 +350,12 @@ bool _hash_first(IndexScanDesc scan, ScanDirection dir)
 * false. Else, return true and set the hashso_curpos for the
 * scan to the right thing.
 *
 * Here we need to ensure that if the scan has started during split, then
 * skip the tuples that are moved by split while scanning bucket being
 * populated and then scan the bucket being split to cover all such
 * tuples. This is done to ensure that we don't miss tuples in the scans
 * that are started during split.
 *
 * 'bufP' points to the current buffer, which is pinned and read-locked.
 * On success exit, we have pin and read-lock on whichever page
 * contains the right item; on failure, we have released all buffers.
@ -283,9 +400,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
    do {
        switch (dir) {
            case ForwardScanDirection:
                if (offnum != InvalidOffsetNumber)
                if (offnum != InvalidOffsetNumber) {
                    offnum = OffsetNumberNext(offnum); /* move forward */
                else {
                } else {
                    /* new page, locate starting position by binary search */
                    offnum = _hash_binsearch(page, so->hashso_sk_hash);
                }
@ -298,14 +415,27 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                    if (offnum <= maxoff) {
                        Assert(offnum >= FirstOffsetNumber);
                        itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum));
                        /*
                         * skip the tuples that are moved by split operation
                         * for the scan that has started when split was in
                         * progress
                         */
                        if (so->hashso_buc_populated && !so->hashso_buc_split &&
                            (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) {
                            offnum = OffsetNumberNext(offnum); /* move forward */
                            continue;
                        }

                        if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
                            break; /* yes, so exit for-loop */
                    }

                    /* Before leaving current page, deal with any killed items */
                    if (so->numKilled > 0)
                        _hash_kill_items(scan);
                    /*
                     * ran off the end of this page, try the next
                     */
                    _hash_readnext(rel, &buf, &page, &opaque);
                    _hash_readnext(scan, &buf, &page, &opaque);
                    if (BufferIsValid(buf)) {
                        maxoff = PageGetMaxOffsetNumber(page);
                        offnum = _hash_binsearch(page, so->hashso_sk_hash);
@ -318,9 +448,9 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                break;

            case BackwardScanDirection:
                if (offnum != InvalidOffsetNumber)
                if (offnum != InvalidOffsetNumber) {
                    offnum = OffsetNumberPrev(offnum); /* move back */
                else {
                } else {
                    /* new page, locate starting position by binary search */
                    offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
                }
@ -333,14 +463,26 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
                    if (offnum >= FirstOffsetNumber) {
                        Assert(offnum <= maxoff);
                        itup = (IndexTuple)PageGetItem(page, PageGetItemId(page, offnum));
                        /*
                         * skip the tuples that are moved by split operation
                         * for the scan that has started when split was in
                         * progress
                         */
                        if (so->hashso_buc_populated && !so->hashso_buc_split &&
                            (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) {
                            offnum = OffsetNumberPrev(offnum); /* move back */
                            continue;
                        }
                        if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
                            break; /* yes, so exit for-loop */
                    }

                    /* Before leaving current page, deal with any killed items */
                    if (so->numKilled > 0)
                        _hash_kill_items(scan);
                    /*
                     * ran off the end of this page, try the next
                     */
                    _hash_readprev(rel, &buf, &page, &opaque);
                    _hash_readprev(scan, &buf, &page, &opaque);
                    if (BufferIsValid(buf)) {
                        maxoff = PageGetMaxOffsetNumber(page);
                        offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
@ -360,9 +502,16 @@ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
    }

    if (itup == NULL) {
        /* we ran off the end of the bucket without finding a match */
        /*
         * We ran off the end of the bucket without finding a match.
         * Release the pin on bucket buffers. Normally, such pins are
         * released at end of scan, however scrolling cursors can
         * reacquire the bucket lock and pin in the same scan multiple
         * times.
         */
        *bufP = so->hashso_curbuf = InvalidBuffer;
        ItemPointerSetInvalid(current);
        _hash_dropscanbuf(rel, so);
        return false;
    }


@ -14,8 +14,8 @@
 * plenty of locality of access.
 *
 *
 * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
@ -37,15 +37,23 @@
struct HSpool {
    Tuplesortstate *sortstate; /* state data for tuplesort.c */
    Relation index;
    /*
     * We sort the hash keys based on the buckets they belong to. Below masks
     * are used in _hash_hashkey2bucket to determine the bucket of given hash
     * key.
     */
    uint32 high_mask;
    uint32 low_mask;
    uint32 max_buckets;
};


/*
 * create and initialize a spool structure
 */
HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo)
HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void *meminfo)
{
    HSpool *hspool = (HSpool *)palloc0(sizeof(HSpool));
    uint32 hash_mask;
    UtilityDesc *desc = (UtilityDesc *)meminfo;
    int work_mem = (desc->query_mem[0] > 0) ? desc->query_mem[0] : u_sess->attr.attr_memory.maintenance_work_mem;
    int max_mem = (desc->query_mem[1] > 0) ? desc->query_mem[1] : 0;

@ -57,18 +65,26 @@ HSpool *_h_spoolinit(Relation index, uint32 num_buckets, void *meminfo)
     * num_buckets buckets in the index, the appropriate mask can be computed
     * as follows.
     *
     * Note: at present, the passed-in num_buckets is always a power of 2, so
     * we could just compute num_buckets - 1. We prefer not to assume that
     * here, though.
     * NOTE : This hash mask calculation should be in sync with similar
     * calculation in _hash_init_metabuffer.
     */
    hash_mask = (((uint32)1) << _hash_log2(num_buckets)) - 1;
    hspool->high_mask = (((uint32) 1) << _hash_log2(num_buckets + 1)) - 1;
    hspool->low_mask = (hspool->high_mask >> 1);
    hspool->max_buckets = num_buckets - 1;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation. This should be OK since a single backend can't
     * run multiple index creations in parallel.
     */
    hspool->sortstate = tuplesort_begin_index_hash(index, hash_mask, work_mem, false, max_mem);
    hspool->sortstate = tuplesort_begin_index_hash(heap,
                                                   index,
                                                   hspool->high_mask,
                                                   hspool->low_mask,
                                                   hspool->max_buckets,
                                                   work_mem,
                                                   false,
                                                   max_mem);

    return hspool;
}
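
The mask setup above is easiest to see with a concrete bucket count. A minimal illustrative sketch follows (not part of this patch; num_buckets = 1000 and the key value are made-up examples, and the bucket choice mirrors what _hash_hashkey2bucket() does with these masks):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* num_buckets = 1000, so _hash_log2(1001) = 10 */
    uint32_t high_mask = (1u << 10) - 1;   /* 1023 */
    uint32_t low_mask = high_mask >> 1;    /*  511 */
    uint32_t max_buckets = 1000 - 1;       /*  999 */
    uint32_t hashkey = 0x9E3779B9u;        /* arbitrary example hash key */

    uint32_t bucket = hashkey & high_mask;
    if (bucket > max_buckets)
        bucket &= low_mask;                /* fall back to the lower table half */

    printf("bucket = %u\n", bucket);
    return 0;
}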

@ -94,7 +110,7 @@ void _h_spool(HSpool *hspool, ItemPointer self, Datum *values, const bool *isnul
 * given a spool loaded by successive calls to _h_spool,
 * create an entire index.
 */
void _h_indexbuild(HSpool *hspool)
void _h_indexbuild(HSpool *hspool, Relation heapRel)
{
    IndexTuple itup;
    bool should_free = false;

@ -102,7 +118,7 @@ void _h_indexbuild(HSpool *hspool)
    tuplesort_performsort(hspool->sortstate);

    while ((itup = tuplesort_getindextuple(hspool->sortstate, true, &should_free)) != NULL) {
        _hash_doinsert(hspool->index, itup);
        _hash_doinsert(hspool->index, itup, heapRel);
        if (should_free)
            pfree(itup);
    }

@ -3,8 +3,8 @@
 * hashutil.cpp
 *   Utility code for Postgres hash implementation.
 *
 * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 * Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
@ -22,7 +22,9 @@
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/rel_gs.h"
#include "storage/buf/buf_internals.h"

#define CALC_NEW_BUCKET(old_bucket, lowmask) ((old_bucket) | ((lowmask) + 1))
/*
 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
 */
@ -133,6 +135,70 @@ uint32 _hash_log2(uint32 num)
    return i;
}

/*
 * _hash_spareindex -- returns spare index / global splitpoint phase of the bucket
 */
uint32 _hash_spareindex(uint32 num_bucket)
{
    uint32 splitpoint_group;
    uint32 splitpoint_phases;

    splitpoint_group = _hash_log2(num_bucket);

    if (splitpoint_group < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
        return splitpoint_group;

    /* account for single-phase groups */
    splitpoint_phases = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;

    /* account for multi-phase groups before splitpoint_group */
    splitpoint_phases +=
        ((splitpoint_group - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) <<
         HASH_SPLITPOINT_PHASE_BITS);

    /* account for phases within current group */
    splitpoint_phases +=
        (((num_bucket - 1) >>
          (splitpoint_group - (HASH_SPLITPOINT_PHASE_BITS + 1))) &
         HASH_SPLITPOINT_PHASE_MASK); /* to 0-based value. */

    return splitpoint_phases;
}

/*
 * _hash_get_totalbuckets -- returns total number of buckets allocated till
 * the given splitpoint phase.
 */
uint32 _hash_get_totalbuckets(uint32 splitpoint_phase)
{
    uint32 splitpoint_group;
    uint32 total_buckets;
    uint32 phases_within_splitpoint_group;

    if (splitpoint_phase < HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
        return (1 << splitpoint_phase);

    /* get splitpoint's group */
    splitpoint_group = HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE;
    splitpoint_group +=
        ((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) >>
         HASH_SPLITPOINT_PHASE_BITS);

    /* account for buckets before splitpoint_group */
    total_buckets = (1 << (splitpoint_group - 1));

    /* account for buckets within splitpoint_group */
    phases_within_splitpoint_group =
        (((splitpoint_phase - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) &
          HASH_SPLITPOINT_PHASE_MASK) + 1); /* from 0-based to 1-based */
    total_buckets +=
        (((1 << (splitpoint_group - 1)) >> HASH_SPLITPOINT_PHASE_BITS) *
         phases_within_splitpoint_group);

    return total_buckets;
}
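
The splitpoint arithmetic above is easier to follow with one worked value. A minimal standalone sketch (not part of this patch), assuming the upstream PostgreSQL constants (10 single-phase groups, 2 phase bits, phase mask 3), which is an assumption about this tree's definitions:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* num_bucket = 1536 lies in splitpoint group 11 (2^10 < 1536 <= 2^11). */
    uint32_t group = 11;
    uint32_t phases = 10;                       /* groups 0..9 have one phase each   */
    phases += (group - 10) << 2;                /* 4 phases per group from group 10  */
    phases += ((1536 - 1) >> (group - 3)) & 3;  /* phase of bucket 1536 inside group */
    assert(phases == 15);                       /* _hash_spareindex(1536) == 15      */

    /* _hash_get_totalbuckets(15) should give 1536 back. */
    uint32_t total = 1u << (group - 1);         /* buckets before group 11: 1024     */
    total += ((1u << (group - 1)) >> 2) * (((15 - 10) & 3) + 1); /* 256 * 2 phases   */
    assert(total == 1536);
    return 0;
}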

/*
 * _hash_checkpage -- sanity checks on the format of all hash pages
 *
@ -216,25 +282,36 @@ uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
}

/*
 * _hash_form_tuple - form an index tuple containing hash code only
 * _hash_convert_tuple - convert raw index data to hash key
 *
 * Inputs: values and isnull arrays for the user data column(s)
 * Outputs: values and isnull arrays for the index tuple, suitable for
 * passing to index_form_tuple().
 *
 * Returns true if successful, false if not (because there are null values).
 * On a false result, the given data need not be indexed.
 *
 * Note: callers know that the index-column arrays are always of length 1.
 * In principle, there could be more than one input column, though we do not
 * currently support that.
 */
IndexTuple _hash_form_tuple(Relation index, Datum *values, const bool *isnull)
bool _hash_convert_tuple(Relation index,
                         Datum *user_values, const bool *user_isnull,
                         Datum *index_values, bool *index_isnull)
{
    IndexTuple itup;
    uint32 hashkey;
    Datum hashkeydatum;
    TupleDesc hashdesc;

    if (isnull[0]) {
        hashkeydatum = (Datum)0;
    } else {
        hashkey = _hash_datum2hashkey(index, values[0]);
        hashkeydatum = UInt32GetDatum(hashkey);
    }
    hashdesc = RelationGetDescr(index);
    Assert(hashdesc->natts == 1);
    itup = index_form_tuple(hashdesc, &hashkeydatum, isnull);
    return itup;
    /*
     * We do not insert null values into hash indexes. This is okay because
     * the only supported search operator is '=', and we assume it is strict.
     */
    if (user_isnull[0])
        return false;

    hashkey = _hash_datum2hashkey(index, user_values[0]);
    index_values[0] = UInt32GetDatum(hashkey);
    index_isnull[0] = false;
    return true;
}
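
A minimal sketch of the intended call pattern (not part of this patch; hash_form_index_tuple_sketch is a hypothetical helper written only for illustration, not a function in this tree):

static IndexTuple hash_form_index_tuple_sketch(Relation index, Datum *values, const bool *isnull)
{
    Datum index_values[1];
    bool index_isnull[1];

    /* A NULL key is simply not indexed; the caller skips the insertion. */
    if (!_hash_convert_tuple(index, values, isnull, index_values, index_isnull))
        return NULL;

    /* The resulting index tuple stores only the 32-bit hash code. */
    return index_form_tuple(RelationGetDescr(index), index_values, index_isnull);
}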

/*
@ -312,3 +389,154 @@ OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value)

    return lower;
}

/*
 * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
 * from which current (new) bucket is being split.
 */
BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
{
    Bucket old_bucket;
    uint32 mask;
    Buffer metabuf;
    HashMetaPage metap;
    BlockNumber blkno;

    /*
     * To get the old bucket from the current bucket, we need a mask to modulo
     * into lower half of table. This mask is stored in meta page as
     * hashm_lowmask, but here we can't rely on the same, because we need a
     * value of lowmask that was prevalent at the time when bucket split was
     * started. Masking the most significant bit of new bucket would give us
     * old bucket.
     */
    mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1;
    old_bucket = new_bucket & mask;

    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    blkno = BUCKET_TO_BLKNO(metap, old_bucket);

    _hash_relbuf(rel, metabuf);

    return blkno;
}
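
A worked example of the fls() masking above, as a standalone sketch (not part of this patch; the fls() equivalent is open-coded here so the snippet compiles anywhere):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t new_bucket = 11; /* binary 1011 */
    uint32_t msb = 0;
    for (uint32_t v = new_bucket; v > 1; v >>= 1)
        msb++;                /* fls(11) - 1 == 3 */

    uint32_t mask = (1u << msb) - 1;         /* 0b0111 */
    uint32_t old_bucket = new_bucket & mask; /* strip the most significant bit */
    assert(old_bucket == 3);                 /* bucket 11 was split out of bucket 3 */
    return 0;
}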

/*
 * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
 * that will be generated after split from old bucket.
 *
 * This is used to find the new bucket from old bucket based on current table
 * half. It is mainly required to finish the incomplete splits where we are
 * sure that not more than one bucket could have split in progress from old
 * bucket.
 */
BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
    Bucket new_bucket;
    Buffer metabuf;
    HashMetaPage metap;
    BlockNumber blkno;

    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
                                                    metap->hashm_lowmask,
                                                    metap->hashm_maxbucket);
    blkno = BUCKET_TO_BLKNO(metap, new_bucket);

    _hash_relbuf(rel, metabuf);

    return blkno;
}

/*
 * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
 * generated after split from current (old) bucket.
 *
 * This is used to find the new bucket from old bucket. New bucket can be
 * obtained by OR'ing old bucket with most significant bit of current table
 * half (lowmask passed in this function can be used to identify msb of
 * current table half). There could be multiple buckets that could have
 * been split from current bucket. We need the first such bucket that exists.
 * Caller must ensure that no more than one split has happened from old
 * bucket.
 */
Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
                                          uint32 lowmask, uint32 maxbucket)
{
    Bucket new_bucket;

    new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
    if (new_bucket > maxbucket) {
        lowmask = lowmask >> 1;
        new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);
    }

    return new_bucket;
}
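
A worked example of CALC_NEW_BUCKET with the fallback to the previous table half, as a standalone sketch (not part of this patch; the old_bucket/lowmask/maxbucket values are made up):

#include <assert.h>
#include <stdint.h>

#define CALC_NEW_BUCKET(old_bucket, lowmask) ((old_bucket) | ((lowmask) + 1))

int main(void)
{
    uint32_t old_bucket = 3, lowmask = 7, maxbucket = 9;

    uint32_t new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);  /* 3 | 8 = 11 */
    if (new_bucket > maxbucket) {
        lowmask >>= 1;                                           /* previous table half */
        new_bucket = CALC_NEW_BUCKET(old_bucket, lowmask);       /* 3 | 4 = 7  */
    }
    assert(new_bucket == 7);    /* bucket 7 is the first existing bucket split from 3 */
    return 0;
}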

/*
 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
 * told us were killed.
 *
 * scan->opaque, referenced locally through so, contains information about the
 * current page and killed tuples thereon (generally, this should only be
 * called if so->numKilled > 0).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.
 */
void _hash_kill_items(IndexScanDesc scan)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Page page;
    HashPageOpaque opaque;
    OffsetNumber offnum;
    OffsetNumber maxoff;
    int numKilled = so->numKilled;
    int i;
    bool killedsomething = false;

    Assert(so->numKilled > 0);
    Assert(so->killedItems != NULL);

    /*
     * Always reset the scan state, so we don't look for same items on other
     * pages.
     */
    so->numKilled = 0;

    page = BufferGetPage(so->hashso_curbuf);
    opaque = (HashPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    for (i = 0; i < numKilled; i++) {
        offnum = so->killedItems[i].indexOffset;

        while (offnum <= maxoff) {
            ItemId iid = PageGetItemId(page, offnum);
            IndexTuple ituple = (IndexTuple)PageGetItem(page, iid);

            if (ItemPointerEquals(&ituple->t_tid, &so->killedItems[i].heapTid)) {
                /* found the item */
                ItemIdMarkDead(iid);
                killedsomething = true;
                break; /* out of inner search loop */
            }
            offnum = OffsetNumberNext(offnum);
        }
    }

    /*
     * Since this can be redone later if needed, mark as dirty hint. Whenever
     * we mark anything LP_DEAD, we also set the page's
     * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
     */
    if (killedsomething) {
        opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
        MarkBufferDirtyHint(so->hashso_curbuf, true);
    }
}

[File diff suppressed because it is too large]
@ -843,6 +843,9 @@ void XLogBlockDataCommonRedo(XLogBlockHead *blockhead, void *blockrecbody, RedoB
|
|||
case RM_BTREE_ID:
|
||||
BtreeRedoDataBlock(blockhead, blockdatarec, bufferinfo);
|
||||
break;
|
||||
case RM_HASH_ID:
|
||||
HashRedoDataBlock(blockhead, blockdatarec, bufferinfo);
|
||||
break;
|
||||
case RM_XLOG_ID:
|
||||
xlog_redo_data_block(blockhead, blockdatarec, bufferinfo);
|
||||
break;
|
||||
|
|
|
@ -16,9 +16,155 @@
#include "postgres.h"
#include "knl/knl_variable.h"

#include "access/hash.h"
#include "access/rmgr.h"
#include "access/hash_xlog.h"

void hash_desc(StringInfo buf, XLogReaderState *record)
{
    /* nothing to do */
    char *rec = XLogRecGetData(record);
    uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

    switch (info) {
        case XLOG_HASH_INIT_META_PAGE:
        {
            xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec;

            appendStringInfo(buf, "num_tuples %g, fillfactor %d",
                             xlrec->num_tuples, xlrec->ffactor);
            break;
        }
        case XLOG_HASH_INIT_BITMAP_PAGE:
        {
            xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec;

            appendStringInfo(buf, "bmsize %d", xlrec->bmsize);
            break;
        }
        case XLOG_HASH_INSERT:
        {
            xl_hash_insert *xlrec = (xl_hash_insert *) rec;

            appendStringInfo(buf, "off %u", xlrec->offnum);
            break;
        }
        case XLOG_HASH_ADD_OVFL_PAGE:
        {
            xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) rec;

            appendStringInfo(buf, "bmsize %d, bmpage_found %c",
                             xlrec->bmsize, (xlrec->bmpage_found) ? 'T' : 'F');
            break;
        }
        case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
        {
            xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) rec;

            appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c",
                             xlrec->new_bucket,
                             (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F',
                             (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F');
            break;
        }
        case XLOG_HASH_SPLIT_COMPLETE:
        {
            xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec;

            appendStringInfo(buf, "old_bucket_flag %u, new_bucket_flag %u",
                             xlrec->old_bucket_flag, xlrec->new_bucket_flag);
            break;
        }
        case XLOG_HASH_MOVE_PAGE_CONTENTS:
        {
            xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec;

            appendStringInfo(buf, "ntups %d, is_primary %c",
                             xlrec->ntups,
                             xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
            break;
        }
        case XLOG_HASH_SQUEEZE_PAGE:
        {
            xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec;

            appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c",
                             xlrec->prevblkno,
                             xlrec->nextblkno,
                             xlrec->ntups,
                             xlrec->is_prim_bucket_same_wrt ? 'T' : 'F');
            break;
        }
        case XLOG_HASH_DELETE:
        {
            xl_hash_delete *xlrec = (xl_hash_delete *) rec;

            appendStringInfo(buf, "clear_dead_marking %c, is_primary %c",
                             xlrec->clear_dead_marking ? 'T' : 'F',
                             xlrec->is_primary_bucket_page ? 'T' : 'F');
            break;
        }
        case XLOG_HASH_UPDATE_META_PAGE:
        {
            xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec;

            appendStringInfo(buf, "ntuples %g",
                             xlrec->ntuples);
            break;
        }
        case XLOG_HASH_VACUUM_ONE_PAGE:
        {
            xl_hash_vacuum_one_page *xlrec = (xl_hash_vacuum_one_page *) rec;

            appendStringInfo(buf, "ntuples %d",
                             xlrec->ntuples);
            break;
        }
    }
}

const char *hash_identify(uint8 info)
{
    const char *id = NULL;

    switch (info & ~XLR_INFO_MASK) {
        case XLOG_HASH_INIT_META_PAGE:
            id = "INIT_META_PAGE";
            break;
        case XLOG_HASH_INIT_BITMAP_PAGE:
            id = "INIT_BITMAP_PAGE";
            break;
        case XLOG_HASH_INSERT:
            id = "INSERT";
            break;
        case XLOG_HASH_ADD_OVFL_PAGE:
            id = "ADD_OVFL_PAGE";
            break;
        case XLOG_HASH_SPLIT_ALLOCATE_PAGE:
            id = "SPLIT_ALLOCATE_PAGE";
            break;
        case XLOG_HASH_SPLIT_PAGE:
            id = "SPLIT_PAGE";
            break;
        case XLOG_HASH_SPLIT_COMPLETE:
            id = "SPLIT_COMPLETE";
            break;
        case XLOG_HASH_MOVE_PAGE_CONTENTS:
            id = "MOVE_PAGE_CONTENTS";
            break;
        case XLOG_HASH_SQUEEZE_PAGE:
            id = "SQUEEZE_PAGE";
            break;
        case XLOG_HASH_DELETE:
            id = "DELETE";
            break;
        case XLOG_HASH_SPLIT_CLEANUP:
            id = "SPLIT_CLEANUP";
            break;
        case XLOG_HASH_UPDATE_META_PAGE:
            id = "UPDATE_META_PAGE";
            break;
        case XLOG_HASH_VACUUM_ONE_PAGE:
            id = "VACUUM_ONE_PAGE";
    }

    return id;
}

@ -32,6 +32,7 @@
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/nbtree.h"
#include "access/hash_xlog.h"
#include "access/xlogreader.h"
#include "access/gist_private.h"
#include "access/multixact.h"
@ -165,7 +166,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = {
    { DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE },
    { DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE },
    { DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE },
    { DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 },
    { DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE },
    { DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE },
    /* XLOG_GIST_PAGE_DELETE is not used and info isn't continus */
    { DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 },
@ -1031,8 +1032,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time
/* Run from the dispatcher thread. */
static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
{
    DispatchTxnRecord(record, expectedTLIs, recordXTime, false, true);
    return true;
    bool isNeedFullSync = false;

    /* index not support mvcc, so we need to sync with trx thread when the record is vacuum */
    if (IsHashVacuumPages(record) && g_supportHotStandby) {
        GetSlotIds(record, ANY_WORKER, true);
        /* sync with trxn thread */
        /* only need to process in pageworker thread, wait trxn sync */
        /* pageworker exe, trxn don't need exe */
        DispatchToSpecPageWorker(record, expectedTLIs, true);
    } else {
        DispatchRecordWithPages(record, expectedTLIs, true);
    }

    return isNeedFullSync;
}

/* Run from the dispatcher thread. */
|
@ -31,6 +31,7 @@
|
|||
#include "access/xact.h"
|
||||
#include "access/xlog_internal.h"
|
||||
#include "access/nbtree.h"
|
||||
#include "access/hash_xlog.h"
|
||||
#include "access/xlogreader.h"
|
||||
#include "access/gist_private.h"
|
||||
#include "access/multixact.h"
|
||||
|
@ -165,7 +166,7 @@ static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = {
|
|||
{ DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE },
|
||||
{ DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE },
|
||||
{ DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE },
|
||||
{ DispatchHashRecord, NULL, RM_HASH_ID, 0, 0 },
|
||||
{ DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE },
|
||||
{ DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE },
|
||||
/* XLOG_GIST_PAGE_DELETE is not used and info isn't continuous */
|
||||
{ DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 },
|
||||
|
@ -912,8 +913,20 @@ static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, Time
|
|||
/* Run from the dispatcher thread. */
|
||||
static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
|
||||
{
|
||||
DispatchTxnRecord(record, expectedTLIs, recordXTime, false);
|
||||
return true;
|
||||
bool isNeedFullSync = false;
|
||||
|
||||
/* hash index does not support MVCC, so we need to sync with the trxn thread when the record is a vacuum record */
|
||||
if (IsHashVacuumPages(record) && g_supportHotStandby) {
|
||||
GetWorkerIds(record, ANY_WORKER, true);
|
||||
/* sync with the trxn thread */
|
||||
/* only needs to be processed in the page worker thread, which waits for the trxn thread to sync */
|
||||
/* the page worker executes it; the trxn thread does not need to */
|
||||
DispatchToSpecPageWorker(record, expectedTLIs, true);
|
||||
} else {
|
||||
DispatchRecordWithPages(record, expectedTLIs, true);
|
||||
}
|
||||
|
||||
return isNeedFullSync;
|
||||
}
|
||||
|
||||
static bool DispatchBtreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "access/gin.h"
|
||||
#include "access/gist_private.h"
|
||||
#include "access/hash.h"
|
||||
#include "access/hash_xlog.h"
|
||||
#include "access/heapam.h"
|
||||
#include "access/multixact.h"
|
||||
#include "access/nbtree.h"
|
||||
|
|
|
@ -5264,6 +5264,51 @@ bool ConditionalLockBufferForCleanup(Buffer buffer)
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* IsBufferCleanupOK - as above, but we already have the lock
|
||||
*
|
||||
* Check whether it's OK to perform cleanup on a buffer we've already
|
||||
* locked. If we observe that the pin count is 1, our exclusive lock
|
||||
* happens to be a cleanup lock, and we can proceed with anything that
|
||||
* would have been allowable had we sought a cleanup lock originally.
|
||||
*/
|
||||
bool IsBufferCleanupOK(Buffer buffer)
|
||||
{
|
||||
BufferDesc *bufHdr;
|
||||
uint32 buf_state;
|
||||
|
||||
Assert(BufferIsValid(buffer));
|
||||
|
||||
if (BufferIsLocal(buffer)) {
|
||||
/* There should be exactly one pin */
|
||||
if (u_sess->storage_cxt.LocalRefCount[-buffer - 1] != 1)
|
||||
return false;
|
||||
/* Nobody else to wait for */
|
||||
return true;
|
||||
}
|
||||
|
||||
/* There should be exactly one local pin */
|
||||
if (GetPrivateRefCount(buffer) != 1)
|
||||
return false;
|
||||
|
||||
bufHdr = GetBufferDescriptor(buffer - 1);
|
||||
|
||||
/* caller must hold exclusive lock on buffer */
|
||||
Assert(LWLockHeldByMeInMode(bufHdr->content_lock, LW_EXCLUSIVE));
|
||||
|
||||
buf_state = LockBufHdr(bufHdr);
|
||||
|
||||
Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
|
||||
if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) {
|
||||
/* pincount is OK. */
|
||||
UnlockBufHdr(bufHdr, buf_state);
|
||||
return true;
|
||||
}
|
||||
|
||||
UnlockBufHdr(bufHdr, buf_state);
|
||||
return false;
|
||||
}
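/*
 * A minimal usage sketch (illustrative only, assuming a hypothetical helper
 * remove_dead_tuples_from_page; this is not code from this commit): a caller
 * that already holds an exclusive content lock can check whether that lock
 * happens to be a cleanup lock before removing dead index tuples, as hash
 * index page vacuum does.
 */
#ifdef IS_BUFFER_CLEANUP_OK_EXAMPLE
static void maybe_cleanup_page(Relation rel, Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    if (IsBufferCleanupOK(buf)) {
        /* nobody else holds a pin, so dead tuples can be removed safely */
        remove_dead_tuples_from_page(rel, buf);     /* hypothetical helper */
    }
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
#endif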
|
||||
|
||||
/*
|
||||
* Functions for buffer I/O handling
|
||||
*
|
||||
|
|
|
@ -400,3 +400,28 @@ void PageSetChecksumInplace(Page page, BlockNumber blkno)
|
|||
|
||||
((PageHeader)page)->pd_checksum = pg_checksum_page((char*)page, blkno);
|
||||
}
|
||||
|
||||
/*
|
||||
* PageGetFreeSpaceForMultipleTuples
|
||||
* Returns the size of the free (allocatable) space on a page,
|
||||
* reduced by the space needed for multiple new line pointers.
|
||||
*
|
||||
* Note: this should usually only be used on index pages. Use
|
||||
* PageGetHeapFreeSpace on heap pages.
|
||||
*/
|
||||
Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups)
|
||||
{
|
||||
int space;
|
||||
|
||||
/*
|
||||
* Use signed arithmetic here so that we behave sensibly if pd_lower >
|
||||
* pd_upper.
|
||||
*/
|
||||
space = (int)((PageHeader)page)->pd_upper - (int)((PageHeader)page)->pd_lower;
|
||||
|
||||
if (space < (int)(ntups * sizeof(ItemIdData)))
|
||||
return 0;
|
||||
space -= ntups * sizeof(ItemIdData);
|
||||
|
||||
return (Size) space;
|
||||
}
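/*
 * Illustrative sketch (not part of this commit): before moving a batch of
 * index tuples onto a page, a caller can check that the page has room for
 * both the tuple data and the new line pointers. The caller-supplied
 * tups_size[] array of tuple sizes is an assumption of this example.
 */
#ifdef PAGE_FREESPACE_EXAMPLE
static bool page_has_room_for_tuples(Page page, const Size *tups_size, int ntups)
{
    Size needed = 0;
    int i;

    for (i = 0; i < ntups; i++)
        needed += MAXALIGN(tups_size[i]);

    /* PageGetFreeSpaceForMultipleTuples already reserves the ntups line pointers */
    return PageGetFreeSpaceForMultipleTuples(page, ntups) >= needed;
}
#endif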
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
* header file for postgres hash access method implementation
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* src/include/access/hash.h
|
||||
|
@ -33,36 +33,59 @@
|
|||
*/
|
||||
typedef uint32 Bucket;
|
||||
|
||||
#define INVALID_BUCKET_NUM (0xFFFFFFFF)
|
||||
#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B) + 1) - 1] : 0)) + 1)
|
||||
#define InvalidBucket ((Bucket) 0xFFFFFFFF)
|
||||
#define BUCKET_TO_BLKNO(metap, B) ((BlockNumber)((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B) + 1) - 1] : 0)) + 1)
|
||||
|
||||
/*
|
||||
* Special space for hash index pages.
|
||||
*
|
||||
* hasho_flag tells us which type of page we're looking at. For
|
||||
* example, knowing overflow pages from bucket pages is necessary
|
||||
* information when you're deleting tuples from a page. If all the
|
||||
* tuples are deleted from an overflow page, the overflow is made
|
||||
* available to other buckets by calling _hash_freeovflpage(). If all
|
||||
* the tuples are deleted from a bucket page, no additional action is
|
||||
* necessary.
|
||||
* hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
|
||||
* Additional bits in the flag word are used for more transient purposes.
|
||||
*
|
||||
* To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
|
||||
* However, we ensure that each used page type has a distinct bit so that
|
||||
* we can OR together page types for uses such as the allowable-page-types
|
||||
* argument of _hash_checkpage().
|
||||
*/
|
||||
#define LH_UNUSED_PAGE (0)
|
||||
#define LH_OVERFLOW_PAGE (1 << 0)
|
||||
#define LH_BUCKET_PAGE (1 << 1)
|
||||
#define LH_BITMAP_PAGE (1 << 2)
|
||||
#define LH_META_PAGE (1 << 3)
|
||||
#define LH_BUCKET_BEING_POPULATED (1 << 4)
|
||||
#define LH_BUCKET_BEING_SPLIT (1 << 5)
|
||||
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6)
|
||||
#define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
|
||||
|
||||
#define LH_PAGE_TYPE \
|
||||
(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
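/*
 * Example of the page-type test described above (illustrative only):
 *
 *     HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 *     if ((opaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE)
 *         ... the page is an overflow page ...
 *
 * The remaining flag bits (LH_BUCKET_BEING_SPLIT and friends) are transient
 * and can be set or cleared independently of the page type.
 */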
|
||||
|
||||
/*
|
||||
* In an overflow page, hasho_prevblkno stores the block number of the previous
|
||||
* page in the bucket chain; in a bucket page, hasho_prevblkno stores the
|
||||
* hashm_maxbucket value as of the last time the bucket was split, or
|
||||
* else as of the time the bucket was created. The latter convention is used
|
||||
* to determine whether a cached copy of the metapage is too stale to be used
|
||||
* without needing to lock or pin the metapage.
|
||||
*
|
||||
* hasho_nextblkno is always the block number of the next page in the
|
||||
* bucket chain, or InvalidBlockNumber if there are no more such pages.
|
||||
*/
|
||||
typedef struct HashPageOpaqueData {
|
||||
BlockNumber hasho_prevblkno; /* previous ovfl (or bucket) blkno */
|
||||
BlockNumber hasho_nextblkno; /* next ovfl blkno */
|
||||
BlockNumber hasho_prevblkno; /* see above */
|
||||
BlockNumber hasho_nextblkno; /* see above */
|
||||
Bucket hasho_bucket; /* bucket number this pg belongs to */
|
||||
uint16 hasho_flag; /* page type code, see above */
|
||||
uint16 hasho_flag; /* page type code + flag bits, see above */
|
||||
uint16 hasho_page_id; /* for identification of hash indexes */
|
||||
} HashPageOpaqueData;
|
||||
|
||||
typedef HashPageOpaqueData* HashPageOpaque;
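/*
 * Added note (an assumption about how the cached-metapage check is meant to
 * work, not code from this commit): a scan that located a bucket page using
 * a cached metapage copy can compare the page's hasho_prevblkno with the
 * cached hashm_maxbucket; if hasho_prevblkno is larger, the bucket has been
 * split since the copy was taken and the cache should be refreshed via
 * _hash_getcachedmetap() before retrying.
 */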
|
||||
|
||||
#define H_NEEDS_SPLIT_CLEANUP(opaque) (((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
|
||||
#define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
|
||||
#define H_BUCKET_BEING_POPULATED(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
|
||||
#define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
|
||||
|
||||
/*
|
||||
* The page ID is for the convenience of pg_filedump and similar utilities,
|
||||
* which otherwise would have a hard time telling pages of different index
|
||||
|
@ -71,6 +94,12 @@ typedef HashPageOpaqueData* HashPageOpaque;
|
|||
*/
|
||||
#define HASHO_PAGE_ID 0xFF80
|
||||
|
||||
typedef struct HashScanPosItem {
|
||||
ItemPointerData heapTid; /* TID of referenced heap item */
|
||||
OffsetNumber indexOffset; /* index item's location within page */
|
||||
} HashScanPosItem;
|
||||
|
||||
|
||||
/*
|
||||
* HashScanOpaqueData is private state for a hash index scan.
|
||||
*/
|
||||
|
@ -78,19 +107,6 @@ typedef struct HashScanOpaqueData {
|
|||
/* Hash value of the scan key, ie, the hash key we seek */
|
||||
uint32 hashso_sk_hash;
|
||||
|
||||
/*
|
||||
* By definition, a hash scan should be examining only one bucket. We
|
||||
* record the bucket number here as soon as it is known.
|
||||
*/
|
||||
Bucket hashso_bucket;
|
||||
bool hashso_bucket_valid;
|
||||
|
||||
/*
|
||||
* If we have a share lock on the bucket, we record it here. When
|
||||
* hashso_bucket_blkno is zero, we have no such lock.
|
||||
*/
|
||||
BlockNumber hashso_bucket_blkno;
|
||||
|
||||
/*
|
||||
* We also want to remember which buffer we're currently examining in the
|
||||
* scan. We keep the buffer pinned (but not locked) across hashgettuple
|
||||
|
@ -99,11 +115,33 @@ typedef struct HashScanOpaqueData {
|
|||
*/
|
||||
Buffer hashso_curbuf;
|
||||
|
||||
/* remember the buffer associated with primary bucket */
|
||||
Buffer hashso_bucket_buf;
|
||||
|
||||
/*
|
||||
* remember the buffer associated with primary bucket page of bucket being
|
||||
* split. it is required during the scan of the bucket which is being
|
||||
* populated during split operation.
|
||||
*/
|
||||
Buffer hashso_split_bucket_buf;
|
||||
|
||||
/* Current position of the scan, as an index TID */
|
||||
ItemPointerData hashso_curpos;
|
||||
|
||||
/* Current position of the scan, as a heap TID */
|
||||
ItemPointerData hashso_heappos;
|
||||
|
||||
/* Whether scan starts on bucket being populated due to split */
|
||||
bool hashso_buc_populated;
|
||||
|
||||
/*
|
||||
* Whether scanning bucket being split? The value of this parameter is
|
||||
* referred only when hashso_buc_populated is true.
|
||||
*/
|
||||
bool hashso_buc_split;
|
||||
/* info about killed items if any (killedItems is NULL if never used) */
|
||||
HashScanPosItem *killedItems; /* tids and offset numbers of killed items */
|
||||
int numKilled; /* number of currently stored items */
|
||||
} HashScanOpaqueData;
|
||||
|
||||
typedef HashScanOpaqueData* HashScanOpaque;
|
||||
|
@ -115,7 +153,7 @@ typedef HashScanOpaqueData* HashScanOpaque;
|
|||
#define HASH_METAPAGE 0 /* metapage is always block 0 */
|
||||
|
||||
#define HASH_MAGIC 0x6440640
|
||||
#define HASH_VERSION 2 /* 2 signifies only hash key value is stored */
|
||||
#define HASH_VERSION 4
|
||||
|
||||
/*
|
||||
* Spares[] holds the number of overflow pages currently allocated at or
|
||||
|
@ -128,17 +166,32 @@ typedef HashScanOpaqueData* HashScanOpaque;
|
|||
*
|
||||
* ovflpages that have been recycled for reuse can be found by looking at
|
||||
* bitmaps that are stored within ovflpages dedicated for the purpose.
|
||||
* The blknos of these bitmap pages are kept in bitmaps[]; nmaps is the
|
||||
* The blknos of these bitmap pages are kept in mapp[]; nmaps is the
|
||||
* number of currently existing bitmaps.
|
||||
*
|
||||
* The limitation on the size of spares[] comes from the fact that there's
|
||||
* no point in having more than 2^32 buckets with only uint32 hashcodes.
|
||||
* (Note: The value of HASH_MAX_SPLITPOINTS which is the size of spares[] is
|
||||
* adjusted in such a way to accommodate multi phased allocation of buckets
|
||||
* after HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE).
|
||||
*
|
||||
* There is no particular upper limit on the size of mapp[], other than
|
||||
* needing to fit into the metapage. (With 8K block size, 128 bitmaps
|
||||
* limit us to 64 Gb of overflow space...)
|
||||
* needing to fit into the metapage. (With 8K block size, 1024 bitmaps
|
||||
* limit us to 256 GB of overflow space...)
|
||||
*/
|
||||
#define HASH_MAX_SPLITPOINTS 32
|
||||
#define HASH_MAX_BITMAPS 128
|
||||
#define HASH_MAX_BITMAPS 1024
|
||||
|
||||
#define HASH_SPLITPOINT_PHASE_BITS 2
|
||||
#define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS)
|
||||
#define HASH_SPLITPOINT_PHASE_MASK (HASH_SPLITPOINT_PHASES_PER_GRP - 1)
|
||||
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE 10
|
||||
|
||||
/* defines the max number of splitpoint phases a hash index can have */
|
||||
#define HASH_MAX_SPLITPOINT_GROUP 32
|
||||
#define HASH_MAX_SPLITPOINTS \
|
||||
(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
|
||||
HASH_SPLITPOINT_PHASES_PER_GRP) + \
|
||||
HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
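/*
 * Worked out from the values above (added note): HASH_MAX_SPLITPOINTS =
 * ((32 - 10) * 4) + 10 = 98, i.e. 10 single-phase splitpoint groups plus
 * 22 groups of 4 phases each, which bounds the size of hashm_spares[] in
 * the metapage.
 */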
|
||||
|
||||
typedef struct HashMetaPageData {
|
||||
uint32 hashm_magic; /* magic no. for hash tables */
|
||||
|
@ -280,37 +333,40 @@ extern Datum hash_new_uint32(uint32 k);
|
|||
/* private routines */
|
||||
|
||||
/* hashinsert.c */
|
||||
extern void _hash_doinsert(Relation rel, IndexTuple itup);
|
||||
extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel);
|
||||
extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup);
|
||||
extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
|
||||
OffsetNumber *itup_offsets, uint16 nitups);
|
||||
|
||||
/* hashovfl.c */
|
||||
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf);
|
||||
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy);
|
||||
extern void _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, ForkNumber forkNum);
|
||||
extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy);
|
||||
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin);
|
||||
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
|
||||
Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
|
||||
Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy);
|
||||
extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
|
||||
extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy);
|
||||
|
||||
/* hashpage.c */
|
||||
extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
|
||||
extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
|
||||
extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
|
||||
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags);
|
||||
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
|
||||
BlockNumber blkno, int flags);
|
||||
extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh);
|
||||
extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
|
||||
int access, HashMetaPage *cachedmetap);
|
||||
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
|
||||
extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, bool initpage);
|
||||
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum);
|
||||
extern Buffer _hash_getbuf_with_strategy(
|
||||
Relation rel, BlockNumber blkno, int access, int flags, BufferAccessStrategy bstrategy);
|
||||
extern void _hash_relbuf(Relation rel, Buffer buf);
|
||||
extern void _hash_dropbuf(Relation rel, Buffer buf);
|
||||
extern void _hash_wrtbuf(Relation rel, Buffer buf);
|
||||
extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access);
|
||||
extern uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum);
|
||||
extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
|
||||
extern uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum);
|
||||
extern void _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, uint16 ffactor, bool initpage);
|
||||
extern void _hash_pageinit(Page page, Size size);
|
||||
extern void _hash_expandtable(Relation rel, Buffer metabuf);
|
||||
|
||||
/* hashscan.c */
|
||||
extern void _hash_regscan(IndexScanDesc scan);
|
||||
extern void _hash_dropscan(IndexScanDesc scan);
|
||||
extern bool _hash_has_active_scan(Relation rel, Bucket bucket);
|
||||
extern void ReleaseResources_hash(void);
|
||||
extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
|
||||
uint32 maxbucket, uint32 highmask, uint32 lowmask);
|
||||
|
||||
/* hashsearch.c */
|
||||
extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
|
||||
|
@ -320,10 +376,10 @@ extern bool _hash_step(IndexScanDesc scan, Buffer* bufP, ScanDirection dir);
|
|||
/* hashsort.c */
|
||||
typedef struct HSpool HSpool; /* opaque struct in hashsort.c */
|
||||
|
||||
extern HSpool* _h_spoolinit(Relation index, uint32 num_buckets, void* meminfo);
|
||||
extern HSpool* _h_spoolinit(Relation heap, Relation index, uint32 num_buckets, void* meminfo);
|
||||
extern void _h_spooldestroy(HSpool* hspool);
|
||||
extern void _h_spool(HSpool* hspool, ItemPointer self, Datum* values, const bool* isnull);
|
||||
extern void _h_indexbuild(HSpool* hspool);
|
||||
extern void _h_indexbuild(HSpool* hspool, Relation heapRel);
|
||||
|
||||
/* hashutil.c */
|
||||
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
|
||||
|
@ -331,15 +387,28 @@ extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
|
|||
extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
|
||||
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask);
|
||||
extern uint32 _hash_log2(uint32 num);
|
||||
extern uint32 _hash_spareindex(uint32 num_bucket);
|
||||
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
|
||||
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
|
||||
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
|
||||
extern IndexTuple _hash_form_tuple(Relation index, Datum* values, const bool* isnull);
|
||||
extern bool _hash_convert_tuple(Relation index, Datum *user_values, const bool *user_isnull,
|
||||
Datum *index_values, bool *index_isnull);
|
||||
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
|
||||
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
|
||||
extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket);
|
||||
extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket);
|
||||
extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket,
|
||||
uint32 lowmask, uint32 maxbucket);
|
||||
extern void _hash_kill_items(IndexScanDesc scan);
|
||||
|
||||
/* hash.c */
|
||||
extern void hash_redo(XLogReaderState* record);
|
||||
extern void hash_desc(StringInfo buf, XLogReaderState* record);
|
||||
extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
|
||||
Buffer bucket_buf, BlockNumber bucket_blkno,
|
||||
BufferAccessStrategy bstrategy,
|
||||
uint32 maxbucket, uint32 highmask, uint32 lowmask,
|
||||
double *tuples_removed, double *num_index_tuples,
|
||||
bool bucket_has_garbage,
|
||||
IndexBulkDeleteCallback callback, void *callback_state);
|
||||
|
||||
#ifdef PGXC
|
||||
extern Datum compute_hash(Oid type, Datum value, char locator);
|
||||
|
|
|
@ -0,0 +1,352 @@
|
|||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* hash_xlog.h
|
||||
* header file for Postgres hash AM implementation
|
||||
*
|
||||
* Portions Copyright (c) 2021 Huawei Technologies Co.,Ltd.
|
||||
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* src/include/access/hash_xlog.h
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef HASH_XLOG_H
|
||||
#define HASH_XLOG_H
|
||||
|
||||
#include "access/xlogreader.h"
|
||||
#include "lib/stringinfo.h"
|
||||
#include "storage/off.h"
|
||||
|
||||
/* Number of buffers required for XLOG_HASH_SQUEEZE_PAGE operation */
|
||||
#define HASH_XLOG_FREE_OVFL_BUFS 6
|
||||
|
||||
/*
|
||||
* XLOG records for hash operations
|
||||
*/
|
||||
#define XLOG_HASH_INIT_META_PAGE 0x00 /* initialize the meta page */
|
||||
#define XLOG_HASH_INIT_BITMAP_PAGE 0x10 /* initialize the bitmap page */
|
||||
#define XLOG_HASH_INSERT 0x20 /* add index tuple without split */
|
||||
#define XLOG_HASH_ADD_OVFL_PAGE 0x30 /* add overflow page */
|
||||
#define XLOG_HASH_SPLIT_ALLOCATE_PAGE 0x40 /* allocate new page for split */
|
||||
#define XLOG_HASH_SPLIT_PAGE 0x50 /* split page */
|
||||
#define XLOG_HASH_SPLIT_COMPLETE 0x60 /* completion of split operation */
|
||||
#define XLOG_HASH_MOVE_PAGE_CONTENTS 0x70 /* remove tuples from one page
|
||||
* and add to another page */
|
||||
#define XLOG_HASH_SQUEEZE_PAGE 0x80 /* add tuples to one of the previous
|
||||
* pages in chain and free the ovfl
|
||||
* page */
|
||||
#define XLOG_HASH_DELETE 0x90 /* delete index tuples from a page */
|
||||
#define XLOG_HASH_SPLIT_CLEANUP 0xA0 /* clear split-cleanup flag in primary
|
||||
* bucket page after deleting tuples
|
||||
* that are moved due to split */
|
||||
#define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after vacuum */
|
||||
#define XLOG_HASH_VACUUM_ONE_PAGE 0xC0 /* remove dead tuples from index page */
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_INIT_META_PAGE_NUM = 0,
|
||||
}XLogHashInitMetaPageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_INIT_BITMAP_PAGE_BITMAP_NUM = 0,
|
||||
XLOG_HASH_INIT_BITMAP_PAGE_META_NUM,
|
||||
}XLogHashInitBitmapPageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_INSERT_PAGE_NUM = 0,
|
||||
XLOG_HASH_INSERT_META_NUM,
|
||||
}XLogHashInsertEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_ADD_OVFL_PAGE_OVFL_NUM = 0,
|
||||
XLOG_HASH_ADD_OVFL_PAGE_LEFT_NUM,
|
||||
XLOG_HASH_ADD_OVFL_PAGE_MAP_NUM,
|
||||
XLOG_HASH_ADD_OVFL_PAGE_NEWMAP_NUM,
|
||||
XLOG_HASH_ADD_OVFL_PAGE_META_NUM,
|
||||
}XLogHashAddOvflPageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_SPLIT_ALLOCATE_PAGE_OBUK_NUM = 0,
|
||||
XLOG_HASH_SPLIT_ALLOCATE_PAGE_NBUK_NUM,
|
||||
XLOG_HASH_SPLIT_ALLOCATE_PAGE_META_NUM,
|
||||
}XLogHashSplitAllocatePageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_SPLIT_PAGE_NUM = 0,
|
||||
}XLogHashSplitPageEnum;
|
||||
|
||||
typedef enum {
|
||||
XLOG_HASH_SPLIT_COMPLETE_OBUK_NUM = 0,
|
||||
XLOG_HASH_SPLIT_COMPLETE_NBUK_NUM,
|
||||
}XLogHashSplitCompleteEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_MOVE_BUK_BLOCK_NUM = 0,
|
||||
HASH_MOVE_ADD_BLOCK_NUM,
|
||||
HASH_MOVE_DELETE_OVFL_BLOCK_NUM,
|
||||
}XLogHashMovePageEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_SQUEEZE_BUK_BLOCK_NUM = 0,
|
||||
HASH_SQUEEZE_ADD_BLOCK_NUM,
|
||||
HASH_SQUEEZE_INIT_OVFLBUF_BLOCK_NUM,
|
||||
HASH_SQUEEZE_UPDATE_PREV_BLOCK_NUM,
|
||||
HASH_SQUEEZE_UPDATE_NEXT_BLOCK_NUM,
|
||||
HASH_SQUEEZE_UPDATE_BITMAP_BLOCK_NUM,
|
||||
HASH_SQUEEZE_UPDATE_META_BLOCK_NUM,
|
||||
}XLogHashSqueezePageEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_DELETE_BUK_BLOCK_NUM = 0,
|
||||
HASH_DELETE_OVFL_BLOCK_NUM,
|
||||
}XLogHashDeleteEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_SPLIT_CLEANUP_BLOCK_NUM,
|
||||
}XLogHashSplitCleanupEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_UPDATE_META_BLOCK_NUM,
|
||||
} XLogHashUpdateMateEnum;
|
||||
|
||||
typedef enum {
|
||||
HASH_VACUUM_PAGE_BLOCK_NUM = 0,
|
||||
HASH_VACUUM_META_BLOCK_NUM,
|
||||
} XLogHashVacuumPageEnum;
|
||||
|
||||
/*
|
||||
* xl_hash_split_allocate_page flag values, 8 bits are available.
|
||||
*/
|
||||
#define XLH_SPLIT_META_UPDATE_MASKS (1<<0)
|
||||
#define XLH_SPLIT_META_UPDATE_SPLITPOINT (1<<1)
|
||||
|
||||
/*
|
||||
* This is what we need to know about a HASH index create.
|
||||
*
|
||||
* Backup block 0: metapage
|
||||
*/
|
||||
typedef struct xl_hash_createidx
|
||||
{
|
||||
double num_tuples;
|
||||
RegProcedure procid;
|
||||
uint16 ffactor;
|
||||
} xl_hash_createidx;
|
||||
|
||||
#define SizeOfHashCreateIdx (offsetof(xl_hash_createidx, ffactor) + sizeof(uint16))
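/*
 * Added note: like the other SizeOf* macros in this file, the size is
 * computed as offsetof(last member) + sizeof(last member) rather than
 * sizeof(struct), so that any trailing struct padding is not written into
 * the WAL record payload.
 */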
|
||||
|
||||
/*
|
||||
* This is what we need to know about simple (without split) insert.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_INSERT
|
||||
*
|
||||
* Backup Blk 0: original page (data contains the inserted tuple)
|
||||
* Backup Blk 1: metapage (HashMetaPageData)
|
||||
*/
|
||||
typedef struct xl_hash_insert
|
||||
{
|
||||
OffsetNumber offnum;
|
||||
} xl_hash_insert;
|
||||
|
||||
#define SizeOfHashInsert (offsetof(xl_hash_insert, offnum) + sizeof(OffsetNumber))
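/*
 * A minimal sketch of how an XLOG_HASH_INSERT record is assembled from the
 * layout documented above (an illustration only, not the exact code of
 * _hash_doinsert in this commit; buf, metabuf, itup and itup_off are assumed
 * inputs):
 */
#ifdef HASH_XLOG_INSERT_EXAMPLE
static XLogRecPtr log_hash_insert_example(Buffer buf, Buffer metabuf, IndexTuple itup, OffsetNumber itup_off)
{
    xl_hash_insert xlrec;
    XLogRecPtr recptr;

    xlrec.offnum = itup_off;                            /* offset the tuple was placed at */

    XLogBeginInsert();
    XLogRegisterData((char *) &xlrec, SizeOfHashInsert);
    XLogRegisterBuffer(0, buf, REGBUF_STANDARD);        /* Backup Blk 0: data page */
    XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup));
    XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);    /* Backup Blk 1: metapage */
    recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT);

    PageSetLSN(BufferGetPage(buf), recptr);
    PageSetLSN(BufferGetPage(metabuf), recptr);
    return recptr;
}
#endif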
|
||||
|
||||
/*
|
||||
* This is what we need to know about addition of overflow page.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_ADD_OVFL_PAGE
|
||||
*
|
||||
* Backup Blk 0: newly allocated overflow page
|
||||
* Backup Blk 1: page before new overflow page in the bucket chain
|
||||
* Backup Blk 2: bitmap page
|
||||
* Backup Blk 3: new bitmap page
|
||||
* Backup Blk 4: metapage
|
||||
*/
|
||||
typedef struct xl_hash_add_ovfl_page
|
||||
{
|
||||
uint16 bmsize;
|
||||
bool bmpage_found;
|
||||
} xl_hash_add_ovfl_page;
|
||||
|
||||
#define SizeOfHashAddOvflPage \
|
||||
(offsetof(xl_hash_add_ovfl_page, bmpage_found) + sizeof(bool))
|
||||
|
||||
/*
|
||||
* This is what we need to know about allocating a page for split.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_SPLIT_ALLOCATE_PAGE
|
||||
*
|
||||
* Backup Blk 0: page for old bucket
|
||||
* Backup Blk 1: page for new bucket
|
||||
* Backup Blk 2: metapage
|
||||
*/
|
||||
typedef struct xl_hash_split_allocate_page
|
||||
{
|
||||
uint32 new_bucket;
|
||||
uint16 old_bucket_flag;
|
||||
uint16 new_bucket_flag;
|
||||
uint8 flags;
|
||||
} xl_hash_split_allocate_page;
|
||||
|
||||
#define SizeOfHashSplitAllocPage \
|
||||
(offsetof(xl_hash_split_allocate_page, flags) + sizeof(uint8))
|
||||
|
||||
/*
|
||||
* This is what we need to know about completing the split operation.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_SPLIT_COMPLETE
|
||||
*
|
||||
* Backup Blk 0: page for old bucket
|
||||
* Backup Blk 1: page for new bucket
|
||||
*/
|
||||
typedef struct xl_hash_split_complete
|
||||
{
|
||||
uint16 old_bucket_flag;
|
||||
uint16 new_bucket_flag;
|
||||
} xl_hash_split_complete;
|
||||
|
||||
#define SizeOfHashSplitComplete \
|
||||
(offsetof(xl_hash_split_complete, new_bucket_flag) + sizeof(uint16))
|
||||
|
||||
/*
|
||||
* This is what we need to know about move page contents required during
|
||||
* squeeze operation.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS
|
||||
*
|
||||
* Backup Blk 0: bucket page
|
||||
* Backup Blk 1: page containing moved tuples
|
||||
* Backup Blk 2: page from which tuples will be removed
|
||||
*/
|
||||
typedef struct xl_hash_move_page_contents
|
||||
{
|
||||
uint16 ntups;
|
||||
bool is_prim_bucket_same_wrt; /* true if the page to which
|
||||
* tuples are moved is same as
|
||||
* primary bucket page */
|
||||
} xl_hash_move_page_contents;
|
||||
|
||||
#define SizeOfHashMovePageContents \
|
||||
(offsetof(xl_hash_move_page_contents, is_prim_bucket_same_wrt) + sizeof(bool))
|
||||
|
||||
/*
|
||||
* This is what we need to know about the squeeze page operation.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_SQUEEZE_PAGE
|
||||
*
|
||||
* Backup Blk 0: page containing tuples moved from freed overflow page
|
||||
* Backup Blk 1: freed overflow page
|
||||
* Backup Blk 2: page previous to the freed overflow page
|
||||
* Backup Blk 3: page next to the freed overflow page
|
||||
* Backup Blk 4: bitmap page containing info of freed overflow page
|
||||
* Backup Blk 5: meta page
|
||||
*/
|
||||
typedef struct xl_hash_squeeze_page
|
||||
{
|
||||
BlockNumber prevblkno;
|
||||
BlockNumber nextblkno;
|
||||
uint16 ntups;
|
||||
bool is_prim_bucket_same_wrt; /* true if the page to which
|
||||
* tuples are moved is same as
|
||||
* primary bucket page */
|
||||
bool is_prev_bucket_same_wrt; /* true if the page to which
|
||||
* tuples are moved is the page
|
||||
* previous to the freed overflow
|
||||
* page */
|
||||
} xl_hash_squeeze_page;
|
||||
|
||||
#define SizeOfHashSqueezePage \
|
||||
(offsetof(xl_hash_squeeze_page, is_prev_bucket_same_wrt) + sizeof(bool))
|
||||
|
||||
/*
|
||||
* This is what we need to know about the deletion of index tuples from a page.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_DELETE
|
||||
*
|
||||
* Backup Blk 0: primary bucket page
|
||||
* Backup Blk 1: page from which tuples are deleted
|
||||
*/
|
||||
typedef struct xl_hash_delete
|
||||
{
|
||||
bool clear_dead_marking; /* true if this operation clears
|
||||
* LH_PAGE_HAS_DEAD_TUPLES flag */
|
||||
bool is_primary_bucket_page; /* true if the operation is for
|
||||
* primary bucket page */
|
||||
} xl_hash_delete;
|
||||
|
||||
#define SizeOfHashDelete \
|
||||
(offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool))
|
||||
|
||||
/*
|
||||
* This is what we need for metapage update operation.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_UPDATE_META_PAGE
|
||||
*
|
||||
* Backup Blk 0: meta page
|
||||
*/
|
||||
typedef struct xl_hash_update_meta_page
|
||||
{
|
||||
double ntuples;
|
||||
} xl_hash_update_meta_page;
|
||||
|
||||
#define SizeOfHashUpdateMetaPage \
|
||||
(offsetof(xl_hash_update_meta_page, ntuples) + sizeof(double))
|
||||
|
||||
/*
|
||||
* This is what we need to initialize metapage.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_INIT_META_PAGE
|
||||
*
|
||||
* Backup Blk 0: meta page
|
||||
*/
|
||||
typedef struct xl_hash_init_meta_page
|
||||
{
|
||||
double num_tuples;
|
||||
RegProcedure procid;
|
||||
uint16 ffactor;
|
||||
} xl_hash_init_meta_page;
|
||||
|
||||
#define SizeOfHashInitMetaPage \
|
||||
(offsetof(xl_hash_init_meta_page, ffactor) + sizeof(uint16))
|
||||
|
||||
/*
|
||||
* This is what we need to initialize bitmap page.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_INIT_BITMAP_PAGE
|
||||
*
|
||||
* Backup Blk 0: bitmap page
|
||||
* Backup Blk 1: meta page
|
||||
*/
|
||||
typedef struct xl_hash_init_bitmap_page
|
||||
{
|
||||
uint16 bmsize;
|
||||
} xl_hash_init_bitmap_page;
|
||||
|
||||
#define SizeOfHashInitBitmapPage \
|
||||
(offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16))
|
||||
|
||||
/*
|
||||
* This is what we need for index tuple deletion and to
|
||||
* update the meta page.
|
||||
*
|
||||
* This data record is used for XLOG_HASH_VACUUM_ONE_PAGE
|
||||
*
|
||||
* Backup Blk 0: bucket page
|
||||
* Backup Blk 1: meta page
|
||||
*/
|
||||
typedef struct xl_hash_vacuum_one_page
|
||||
{
|
||||
RelFileNode hnode;
|
||||
int ntuples;
|
||||
|
||||
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
|
||||
} xl_hash_vacuum_one_page;
|
||||
|
||||
#define SizeOfHashVacuumOnePage \
|
||||
(offsetof(xl_hash_vacuum_one_page, ntuples) + sizeof(int))
|
||||
|
||||
extern void hash_redo(XLogReaderState *record);
|
||||
extern void hash_desc(StringInfo buf, XLogReaderState *record);
|
||||
extern const char *hash_identify(uint8 info);
|
||||
extern bool IsHashVacuumPages(XLogReaderState *record);
|
||||
|
||||
#endif /* HASH_XLOG_H */
|
|
@ -754,6 +754,47 @@ void BtreeXlogUnlinkPageOperatorChildpage(RedoBufferInfo* cbuf, void* recorddata
|
|||
|
||||
void BtreeXlogClearIncompleteSplit(RedoBufferInfo* buffer);
|
||||
|
||||
void HashRedoInitMetaPageOperatorPage(RedoBufferInfo *metabuf, void *recorddata);
|
||||
|
||||
void HashRedoInitBitmapPageOperatorBitmapPage(RedoBufferInfo *bitmapbuf, void *recorddata);
|
||||
void HashRedoInitBitmapPageOperatorMetaPage(RedoBufferInfo *metabuf);
|
||||
|
||||
void HashRedoInsertOperatorPage(RedoBufferInfo *buffer, void *recorddata, void *data, Size datalen);
|
||||
void HashRedoInsertOperatorMetaPage(RedoBufferInfo *metabuf);
|
||||
|
||||
void HashRedoAddOvflPageOperatorOvflPage(RedoBufferInfo *ovflbuf, BlockNumber leftblk, void *data, Size datalen);
|
||||
void HashRedoAddOvflPageOperatorLeftPage(RedoBufferInfo *ovflbuf, BlockNumber rightblk);
|
||||
void HashRedoAddOvflPageOperatorMapPage(RedoBufferInfo *mapbuf, void *data);
|
||||
void HashRedoAddOvflPageOperatorNewmapPage(RedoBufferInfo *newmapbuf, void *recorddata);
|
||||
void HashRedoAddOvflPageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *data, Size datalen);
|
||||
|
||||
void HashRedoSplitAllocatePageOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata);
|
||||
void HashRedoSplitAllocatePageOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata);
|
||||
void HashRedoSplitAllocatePageOperatorMetaPage(RedoBufferInfo *metabuf, void *recorddata, void *blkdata);
|
||||
|
||||
void HashRedoSplitCompleteOperatorObukPage(RedoBufferInfo *oldbukbuf, void *recorddata);
|
||||
void HashRedoSplitCompleteOperatorNbukPage(RedoBufferInfo *newbukbuf, void *recorddata);
|
||||
|
||||
void HashXlogMoveAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
|
||||
void HashXlogMoveDeleteOvflPageOperatorPage(RedoBufferInfo *redobuffer, void *blkdata, Size len);
|
||||
|
||||
void HashXlogSqueezeAddPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
|
||||
void HashXlogSqueezeInitOvflbufOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
|
||||
void HashXlogSqueezeUpdatePrevPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
|
||||
void HashXlogSqueezeUpdateNextPageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
|
||||
void HashXlogSqueezeUpdateBitmapOperatorPage(RedoBufferInfo *redobuffer, void *blkdata);
|
||||
void HashXlogSqueezeUpdateMateOperatorPage(RedoBufferInfo *redobuffer, void *blkdata);
|
||||
|
||||
void HashXlogDeleteBlockOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, void *blkdata, Size len);
|
||||
|
||||
void HashXlogSplitCleanupOperatorPage(RedoBufferInfo *redobuffer);
|
||||
|
||||
void HashXlogUpdateMetaOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
|
||||
|
||||
void HashXlogVacuumOnePageOperatorPage(RedoBufferInfo *redobuffer, void *recorddata, Size len);
|
||||
|
||||
void HashXlogVacuumMateOperatorPage(RedoBufferInfo *redobuffer, void *recorddata);
|
||||
|
||||
void XLogRecSetBlockCommonState(XLogReaderState* record, XLogBlockParseEnum blockvalid, ForkNumber forknum,
|
||||
BlockNumber blockknum, RelFileNode* relnode, XLogRecParseState* recordblockstate);
|
||||
|
||||
|
@ -787,6 +828,7 @@ extern void XLogRecSetBlockDdlState(XLogBlockDdlParse* blockddlstate, uint32 blo
|
|||
char *mainData, Oid ownerid = InvalidOid);
|
||||
XLogRedoAction XLogCheckBlockDataRedoAction(XLogBlockDataParse* datadecode, RedoBufferInfo* bufferinfo);
|
||||
void BtreeRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
|
||||
extern void HashRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
|
||||
XLogRecParseState* XactXlogCsnlogParseToBlock(XLogReaderState* record, uint32* blocknum, TransactionId xid,
|
||||
int nsubxids, TransactionId* subxids, CommitSeqNo csn, XLogRecParseState* recordstatehead);
|
||||
extern void XLogRecSetVmBlockState(XLogReaderState* record, uint32 blockid, XLogRecParseState* recordblockstate);
|
||||
|
@ -914,5 +956,4 @@ extern void XLogBlockDdlDoSmgrAction(XLogBlockHead* blockhead, void* blockrecbod
|
|||
extern void GinRedoDataBlock(XLogBlockHead* blockhead, XLogBlockDataParse* blockdatarec, RedoBufferInfo* bufferinfo);
|
||||
extern void GistRedoDataBlock(XLogBlockHead *blockhead, XLogBlockDataParse *blockdatarec, RedoBufferInfo *bufferinfo);
|
||||
extern bool IsCheckPoint(const XLogRecParseState *parseState);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include "utils/tuplesort.h"
|
||||
|
||||
#define DEFAULT_INDEX_TYPE "btree"
|
||||
#define DEFAULT_HASH_INDEX_TYPE "hash"
|
||||
#define DEFAULT_CSTORE_INDEX_TYPE "psort"
|
||||
#define DEFAULT_GIST_INDEX_TYPE "gist"
|
||||
#define CSTORE_BTREE_INDEX_TYPE "cbtree"
|
||||
|
|
|
@ -55,6 +55,7 @@ extern const uint32 RANGE_LIST_DISTRIBUTION_VERSION_NUM;
|
|||
extern const uint32 FIX_SQL_ADD_RELATION_REF_COUNT;
|
||||
extern const uint32 GENERATED_COL_VERSION_NUM;
|
||||
extern const uint32 ANALYZER_HOOK_VERSION_NUM;
|
||||
extern const uint32 SUPPORT_HASH_XLOG_VERSION_NUM;
|
||||
|
||||
#define INPLACE_UPGRADE_PRECOMMIT_VERSION 1
|
||||
|
||||
|
|
|
@ -268,6 +268,7 @@ extern void LockBuffer(Buffer buffer, int mode);
|
|||
extern bool ConditionalLockBuffer(Buffer buffer);
|
||||
extern void LockBufferForCleanup(Buffer buffer);
|
||||
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
|
||||
extern bool IsBufferCleanupOK(Buffer buffer);
|
||||
extern bool HoldingBufferPinThatDelaysRecovery(void);
|
||||
extern void AsyncUnpinBuffer(volatile void* bufHdr, bool forgetBuffer);
|
||||
extern void AsyncCompltrPinBuffer(volatile void* bufHdr);
|
||||
|
|
|
@ -472,6 +472,7 @@ extern Page PageGetTempPageCopySpecial(Page page, bool isbtree);
|
|||
extern void PageRestoreTempPage(Page tempPage, Page oldPage);
|
||||
extern void PageRepairFragmentation(Page page);
|
||||
extern Size PageGetFreeSpace(Page page);
|
||||
extern Size PageGetFreeSpaceForMultipleTuples(Page page, int ntups);
|
||||
extern Size PageGetExactFreeSpace(Page page);
|
||||
extern Size PageGetHeapFreeSpace(Page page);
|
||||
extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
|
||||
|
|
|
@ -66,7 +66,8 @@ extern Tuplesortstate* tuplesort_begin_cluster(
|
|||
extern Tuplesortstate* tuplesort_begin_index_btree(
|
||||
Relation indexRel, bool enforceUnique, int workMem, bool randomAccess, int maxMem);
|
||||
extern Tuplesortstate* tuplesort_begin_index_hash(
|
||||
Relation indexRel, uint32 hash_mask, int workMem, bool randomAccess, int maxMem);
|
||||
Relation heapRel, Relation indexRel, uint32 high_mask, uint32 low_mask, uint32 max_buckets,
|
||||
int workMem, bool randomAccess, int maxMem);
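/*
 * Illustrative note (an assumption mirroring _hash_hashkey2bucket, not text
 * from this commit): during the sorted index build the bucket for each tuple
 * is derived from its hash value roughly as
 *
 *     bucket = hash_value & high_mask;
 *     if (bucket > max_buckets)
 *         bucket = bucket & low_mask;
 *
 * which is why the three values replace the old single hash_mask argument.
 */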
|
||||
extern Tuplesortstate* tuplesort_begin_datum(
|
||||
Oid datumType, Oid sortOperator, Oid sortCollation, bool nullsFirstFlag, int workMem, bool randomAccess);
|
||||
#ifdef PGXC
|
||||
|
|
|
@ -354,7 +354,6 @@ NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "mytable_pkey" fo
|
|||
-- ok
|
||||
create index idx_gtt1_1 on gtt1 using btree (a);
|
||||
create index idx_gtt1_2 on gtt1 using hash (a);
|
||||
ERROR: access method "hash" does not support row store
|
||||
create global temp table tmp_t0(c0 tsvector,c1 varchar(100));
|
||||
create index idx_tmp_t0_1 on tmp_t0 using gin (c0);
|
||||
create index idx_tmp_t0_2 on tmp_t0 using gist (c0);
|
||||
|
|
|
@ -0,0 +1,213 @@
|
|||
--------------------------------
|
||||
---------- hash index ----------
|
||||
--------------------------------
|
||||
set enable_seqscan = off;
|
||||
set enable_indexscan = off;
|
||||
------------------
|
||||
-- hash_table_1 --
|
||||
------------------
|
||||
drop table if exists hash_table_1 cascade;
|
||||
NOTICE: table "hash_table_1" does not exist, skipping
|
||||
create table hash_table_1 (id int, name varchar, sex varchar default 'male');
|
||||
insert into hash_table_1 values (1, 'Smith');
|
||||
insert into hash_table_1 values (2, 'Jones');
|
||||
insert into hash_table_1 values (3, 'Williams', 'female');
|
||||
insert into hash_table_1 values (4, 'Taylor');
|
||||
insert into hash_table_1 values (5, 'Brown');
|
||||
insert into hash_table_1 values (6, 'Davies');
|
||||
drop index if exists hash_t1_id1;
|
||||
NOTICE: index "hash_t1_id1" does not exist, skipping
|
||||
create index hash_t1_id1 on hash_table_1 using hash (id);
|
||||
-- error, does not support multicolumn indexes
|
||||
drop index if exists hash_t1_id2;
|
||||
NOTICE: index "hash_t1_id2" does not exist, skipping
|
||||
create index hash_t1_id2 on hash_table_1 using hash (id, sex);
|
||||
ERROR: access method "hash" does not support multicolumn indexes
|
||||
-- compare with hash_t1_id1 and hash_t1_id3: a hash index can be created on the same column
|
||||
drop index if exists hash_t1_id3;
|
||||
NOTICE: index "hash_t1_id3" does not exist, skipping
|
||||
drop index if exists hash_t1_id4;
|
||||
NOTICE: index "hash_t1_id4" does not exist, skipping
|
||||
create index hash_t1_id3 on hash_table_1 using btree (id);
|
||||
create index hash_t1_id4 on hash_table_1 using hash (id);
|
||||
-- drop superfluous index now
|
||||
drop index hash_t1_id3, hash_t1_id4;
|
||||
-- insert a large volume of data into hash_table_1
|
||||
insert into hash_table_1 select 4, 'XXX', 'XXX' from generate_series(1,50000);
|
||||
insert into hash_table_1 select 6, 'XXX', 'XXX' from generate_series(1,50000);
|
||||
analyse hash_table_1;
|
||||
-- after the inserts, hash_t1_id1 still works
|
||||
explain(costs off) select * from hash_table_1 where id = 4;
|
||||
QUERY PLAN
|
||||
----------------------------------------
|
||||
Bitmap Heap Scan on hash_table_1
|
||||
Recheck Cond: (id = 4)
|
||||
-> Bitmap Index Scan on hash_t1_id1
|
||||
Index Cond: (id = 4)
|
||||
(4 rows)
|
||||
|
||||
select count(*) from hash_table_1 where id = 6; --50001
|
||||
count
|
||||
-------
|
||||
50001
|
||||
(1 row)
|
||||
|
||||
-- do other DML actions, then check hash_t1_id1 again
|
||||
insert into hash_table_1 select random()*100, 'XXX', 'XXX' from generate_series(1,50000);
|
||||
update hash_table_1 set id = 101, sex = 'male' where id = 60;
|
||||
delete from hash_table_1 where id = 80;
|
||||
explain(costs off) select * from hash_table_1 where id = 101;
|
||||
QUERY PLAN
|
||||
----------------------------------------
|
||||
Bitmap Heap Scan on hash_table_1
|
||||
Recheck Cond: (id = 101)
|
||||
-> Bitmap Index Scan on hash_t1_id1
|
||||
Index Cond: (id = 101)
|
||||
(4 rows)
|
||||
|
||||
-- cleanup env
|
||||
drop table hash_table_1 cascade;
|
||||
------------------
|
||||
-- hash_table_2 --
|
||||
------------------
|
||||
drop table if exists hash_table_2 cascade;
|
||||
NOTICE: table "hash_table_2" does not exist, skipping
|
||||
create table hash_table_2 (id int, name varchar, sex varchar default 'male');
|
||||
insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100000);
|
||||
-- create index concurrently
|
||||
-- In this fastcheck we only verify that it runs properly. However, in a real
|
||||
-- situation, you would run this SQL in connection a first, then perform some DML
|
||||
-- (insert, delete, update) operations on this table in connection b as soon
|
||||
-- as possible. We expect that creating the index does not block DML operations.
|
||||
-- connection a
|
||||
create index concurrently hash_t2_id1 on hash_table_2 using hash (id);
|
||||
-- connection b
|
||||
insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100);
|
||||
explain(costs off) select * from hash_table_2 where id = 40;
|
||||
QUERY PLAN
|
||||
----------------------------------------
|
||||
Bitmap Heap Scan on hash_table_2
|
||||
Recheck Cond: (id = 40)
|
||||
-> Bitmap Index Scan on hash_t2_id1
|
||||
Index Cond: (id = 40)
|
||||
(4 rows)
|
||||
|
||||
-- error, does not support unique indexes
|
||||
create unique index hash_t2_id2 on hash_table_2 using hash (sex);
|
||||
ERROR: access method "hash" does not support unique indexes
|
||||
-- hash_t2_id2 (lower fillfactor) occupies more disk space than hash_t2_id3
|
||||
create index hash_t2_id2 on hash_table_2 using hash (id) with (fillfactor=25);
|
||||
create index hash_t2_id3 on hash_table_2 using hash (id) with (fillfactor=75);
|
||||
select count(*) from hash_table_2; --100100
|
||||
count
|
||||
--------
|
||||
100100
|
||||
(1 row)
|
||||
|
||||
-- cleanup env
|
||||
drop table hash_table_2 cascade;
|
||||
------------------
|
||||
-- hash_table_3 --
|
||||
------------------
|
||||
drop schema if exists hash_sc_3 cascade;
|
||||
NOTICE: schema "hash_sc_3" does not exist, skipping
|
||||
drop tablespace if exists hash_sp_3;
|
||||
NOTICE: Tablespace "hash_sp_3" does not exist, skipping.
|
||||
create schema hash_sc_3;
|
||||
create tablespace hash_sp_3 relative location 'tablespace/tablespace_1';
|
||||
create table hash_sc_3.hash_table_3
|
||||
(
|
||||
id int, name varchar,
|
||||
sex varchar default 'male'
|
||||
)
|
||||
tablespace hash_sp_3;
|
||||
-- create index, specifying schema and tablespace
|
||||
create index concurrently hash_sc_3.hash_t3_id1 on hash_sc_3.hash_table_3 using hash (id);
|
||||
create index hash_sc_3.hash_t3_id2 on hash_sc_3.hash_table_3 using hash (id) tablespace hash_sp_3;
|
||||
drop table hash_sc_3.hash_table_3 cascade;
|
||||
drop schema hash_sc_3 cascade;
|
||||
drop tablespace hash_sp_3;
|
||||
------------------
|
||||
-- hash_table_4 --
|
||||
------------------
|
||||
drop table if exists hash_table_4 cascade;
|
||||
NOTICE: table "hash_table_4" does not exist, skipping
|
||||
create table hash_table_4
|
||||
(
|
||||
id int,
|
||||
name varchar,
|
||||
sex varchar default 'male'
|
||||
)
|
||||
partition by range(id)
|
||||
(
|
||||
partition p1 values less than (1000),
|
||||
partition p2 values less than (2000),
|
||||
partition p3 values less than (3000),
|
||||
partition p4 values less than (maxvalue)
|
||||
);
|
||||
-- hash indexes only support local indexes on partitioned tables
|
||||
drop index if exists hash_t4_id1;
|
||||
NOTICE: index "hash_t4_id1" does not exist, skipping
|
||||
drop index if exists hash_t4_id2;
|
||||
NOTICE: index "hash_t4_id2" does not exist, skipping
|
||||
drop index if exists hash_t4_id2_new;
|
||||
NOTICE: index "hash_t4_id2_new" does not exist, skipping
|
||||
create index hash_t4_id1 on hash_table_4 using hash(id) global;
|
||||
ERROR: Global partition index only support btree.
|
||||
create index hash_t4_id2 on hash_table_4 using hash(id) local
|
||||
(
|
||||
partition index_t4_p1,
|
||||
partition index_t4_p2,
|
||||
partition index_t4_p3,
|
||||
partition index_t4_p4
|
||||
);
|
||||
-- alter index: rename it, mark a partition unusable, then reindex the partition
|
||||
insert into hash_table_4 select random()*5000, 'XXX', 'XXX' from generate_series(1,1000);
|
||||
alter index hash_t4_id2 rename to hash_t4_id2_new;
|
||||
alter index hash_t4_id2_new modify partition index_t4_p2 unusable;
|
||||
reindex index hash_t4_id2_new partition index_t4_p2;
|
||||
drop table hash_table_4 cascade;
|
||||
------------------
|
||||
-- hash_table_5 --
|
||||
------------------
|
||||
drop table if exists hash_table_5;
|
||||
NOTICE: table "hash_table_5" does not exist, skipping
|
||||
create temporary table hash_table_5(id int, name varchar, sex varchar default 'male');
|
||||
drop index if exists hash_t5_id1;
|
||||
NOTICE: index "hash_t5_id1" does not exist, skipping
|
||||
create index hash_t5_id1 on hash_table_5 using hash(id) with(fillfactor = 80);
|
||||
insert into hash_table_5 select random()*100, 'XXX', 'XXX' from generate_series(1,100);
|
||||
update hash_table_5 set name = 'aaa' where id = 80;
|
||||
alter index hash_t5_id1 set (fillfactor = 60);
|
||||
alter index hash_t5_id1 RESET (fillfactor);
|
||||
explain (costs off) select * from hash_table_5 where id = 80;
|
||||
QUERY PLAN
|
||||
----------------------------------------
|
||||
Bitmap Heap Scan on hash_table_5
|
||||
Recheck Cond: (id = 80)
|
||||
-> Bitmap Index Scan on hash_t5_id1
|
||||
Index Cond: (id = 80)
|
||||
(4 rows)
|
||||
|
||||
drop table hash_table_5 cascade;
|
||||
------------------
|
||||
-- hash_table_6 --
|
||||
------------------
|
||||
drop table if exists hash_table_6;
|
||||
NOTICE: table "hash_table_6" does not exist, skipping
|
||||
create global temporary table hash_table_6(id int, name varchar, sex varchar default 'male');
|
||||
drop index if exists hash_t6_id1;
|
||||
NOTICE: index "hash_t6_id1" does not exist, skipping
|
||||
create index hash_t6_id1 on hash_table_6 using hash((id*10)) with (fillfactor = 30);
|
||||
insert into hash_table_6 select random()*100, 'XXX', 'XXX' from generate_series(1,1000);
|
||||
delete from hash_table_6 where id in (50, 60, 70);
|
||||
explain (costs off) select * from hash_table_6 where id*10 = 80;
|
||||
QUERY PLAN
|
||||
----------------------------------------
|
||||
Bitmap Heap Scan on hash_table_6
|
||||
Recheck Cond: ((id * 10) = 80)
|
||||
-> Bitmap Index Scan on hash_t6_id1
|
||||
Index Cond: ((id * 10) = 80)
|
||||
(4 rows)
|
||||
|
||||
drop table hash_table_6 cascade;
|
|
@ -261,11 +261,8 @@ INTERVAL ('1 month')
|
|||
);
|
||||
NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "interval_partition_table_003_pkey" for table "interval_partition_table_003"
|
||||
create index interval_partition_table_003_1 ON interval_partition_table_003 USING HASH (logdate) LOCAL;
|
||||
ERROR: access method "hash" does not support row store
|
||||
create index interval_partition_table_003_2 ON interval_partition_table_003 USING HASH (c2) LOCAL;
|
||||
ERROR: access method "hash" does not support row store
|
||||
create index interval_partition_table_003_3 ON interval_partition_table_003 USING HASH (c1) LOCAL;
|
||||
ERROR: access method "hash" does not support row store
|
||||
select relname from pg_partition where INDEXTBLID=(select RELFILENODE from pg_partition where relname='interval_partition_table_003_1') order by 1;
|
||||
relname
|
||||
---------
|
||||
|
|
|
@ -41,7 +41,6 @@ SELECT * FROM macaddr_data ORDER BY a;
|
|||
|
||||
CREATE INDEX macaddr_data_btree ON macaddr_data USING btree (b);
|
||||
CREATE INDEX macaddr_data_hash ON macaddr_data USING hash (b);
|
||||
ERROR: access method "hash" does not support row store
|
||||
SELECT a, b, trunc(b) FROM macaddr_data ORDER BY 2, 1;
|
||||
a | b | trunc
|
||||
----+-------------------+-------------------
|
||||
|
|
|
@ -362,7 +362,6 @@ DROP INDEX enumtest_btree;
|
|||
-- Hash index / opclass with the = operator
|
||||
--
|
||||
CREATE INDEX enumtest_hash ON enumtest USING hash (col);
|
||||
ERROR: access method "hash" does not support row store
|
||||
SELECT * FROM enumtest WHERE col = 'orange';
|
||||
col
|
||||
--------
|
||||
|
@ -370,7 +369,6 @@ SELECT * FROM enumtest WHERE col = 'orange';
|
|||
(1 row)
|
||||
|
||||
DROP INDEX enumtest_hash;
|
||||
ERROR: index "enumtest_hash" does not exist
|
||||
--
|
||||
-- End index tests
|
||||
--
|
||||
|
|
|
@ -41,7 +41,6 @@ SELECT * FROM macaddr_data;
|
|||
|
||||
CREATE INDEX macaddr_data_btree ON macaddr_data USING btree (b);
|
||||
CREATE INDEX macaddr_data_hash ON macaddr_data USING hash (b);
|
||||
ERROR: access method "hash" does not support row store
|
||||
SELECT a, b, trunc(b) FROM macaddr_data ORDER BY 2, 1;
|
||||
a | b | trunc
|
||||
----+-------------------+-------------------
|
||||
|
|
|
@ -120,7 +120,6 @@ SELECT COUNT(*) FROM guid1 WHERE guid_field >= '22222222-2222-2222-2222-22222222
|
|||
-- btree and hash index creation test
|
||||
CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field);
|
||||
CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field);
|
||||
ERROR: access method "hash" does not support row store
|
||||
-- unique index test
|
||||
CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field);
|
||||
-- should fail
|
||||
|
@ -131,7 +130,7 @@ DETAIL: Key (guid_field)=(11111111-1111-1111-1111-111111111111) already exists.
|
|||
SELECT count(*) FROM pg_class WHERE relkind='i' AND relname LIKE 'guid%';
|
||||
count
|
||||
-------
|
||||
2
|
||||
3
|
||||
(1 row)
|
||||
|
||||
-- populating the test tables with additional records
|
||||
|
|
|
@ -120,7 +120,6 @@ SELECT COUNT(*) FROM guid1 WHERE guid_field >= '22222222-2222-2222-2222-22222222
|
|||
-- btree and hash index creation test
|
||||
CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field);
|
||||
CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field);
|
||||
ERROR: access method "hash" does not support row store
|
||||
-- unique index test
|
||||
CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field);
|
||||
-- should fail
|
||||
|
@ -131,7 +130,7 @@ DETAIL: Key (guid_field)=(11111111-1111-1111-1111-111111111111) already exists.
|
|||
SELECT count(*) FROM pg_class WHERE relkind='i' AND relname LIKE 'guid%';
|
||||
count
|
||||
-------
|
||||
2
|
||||
3
|
||||
(1 row)
|
||||
|
||||
-- populating the test tables with additional records
|
||||
|
|
|
@ -110,6 +110,7 @@ test: single_node_random
|
|||
#test: single_node_portals
|
||||
#test: single_node_arrays
|
||||
#test: single_node_btree_index single_node_hash_index single_node_update
|
||||
test: hash_index_001
|
||||
test: single_node_update
|
||||
#test single_node_namespace
|
||||
#test: single_node_prepared_xacts
|
||||
|
|
|
@ -0,0 +1,169 @@
|
|||
--------------------------------
|
||||
---------- hash index ----------
|
||||
--------------------------------
|
||||
|
||||
set enable_seqscan = off;
|
||||
set enable_indexscan = off;
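-- note (added for clarity): with seqscan and plain indexscan disabled, the
-- planner falls back to bitmap scans, so the plans below exercise the new
-- hash indexes through Bitmap Index Scan nodes.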
|
||||
------------------
|
||||
-- hash_table_1 --
|
||||
------------------
|
||||
drop table if exists hash_table_1 cascade;
|
||||
create table hash_table_1 (id int, name varchar, sex varchar default 'male');
|
||||
|
||||
insert into hash_table_1 values (1, 'Smith');
|
||||
insert into hash_table_1 values (2, 'Jones');
|
||||
insert into hash_table_1 values (3, 'Williams', 'female');
|
||||
insert into hash_table_1 values (4, 'Taylor');
|
||||
insert into hash_table_1 values (5, 'Brown');
|
||||
insert into hash_table_1 values (6, 'Davies');
|
||||
|
||||
drop index if exists hash_t1_id1;
|
||||
create index hash_t1_id1 on hash_table_1 using hash (id);
|
||||
-- error, does not support multicolumn indexes
|
||||
drop index if exists hash_t1_id2;
|
||||
create index hash_t1_id2 on hash_table_1 using hash (id, sex);
|
||||
|
||||
-- compare with hash_t1_id1 and hash_t1_id3: a hash index can be created on the same column
|
||||
drop index if exists hash_t1_id3;
|
||||
drop index if exists hash_t1_id4;
|
||||
create index hash_t1_id3 on hash_table_1 using btree (id);
|
||||
create index hash_t1_id4 on hash_table_1 using hash (id);
|
||||
|
||||
-- drop superfluous index now
|
||||
drop index hash_t1_id3, hash_t1_id4;
|
||||
|
||||
-- insert a large volume of data into hash_table_1
|
||||
insert into hash_table_1 select 4, 'XXX', 'XXX' from generate_series(1,50000);
|
||||
insert into hash_table_1 select 6, 'XXX', 'XXX' from generate_series(1,50000);
|
||||
analyse hash_table_1;
|
||||
|
||||
-- after the inserts, hash_t1_id1 still works
|
||||
explain(costs off) select * from hash_table_1 where id = 4;
|
||||
select count(*) from hash_table_1 where id = 6; --50001
|
||||
|
||||
-- do other DML actions, then check hash_t1_id1 again
|
||||
insert into hash_table_1 select random()*100, 'XXX', 'XXX' from generate_series(1,50000);
|
||||
update hash_table_1 set id = 101, sex = 'male' where id = 60;
|
||||
delete from hash_table_1 where id = 80;
|
||||
explain(costs off) select * from hash_table_1 where id = 101;
|
||||
|
||||
-- cleanup env
drop table hash_table_1 cascade;

------------------
-- hash_table_2 --
------------------
drop table if exists hash_table_2 cascade;
create table hash_table_2 (id int, name varchar, sex varchar default 'male');
insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100000);

-- create index concurrently
-- In this fastcheck we only verify that it runs properly. In a real
-- situation, you would run this SQL in connection a first, then perform some
-- DML (insert, delete, update) operations on this table in connection b as
-- soon as possible. We expect the concurrent index build not to block DML.
-- connection a
create index concurrently hash_t2_id1 on hash_table_2 using hash (id);
-- connection b
insert into hash_table_2 select random()*100, 'XXX', 'XXX' from generate_series(1,100);
explain(costs off) select * from hash_table_2 where id = 40;
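
-- optional sanity check (illustrative sketch; assumes pg_index.indisvalid is
-- available as in PostgreSQL): a concurrently built index should end up
-- marked valid once the build completes
select indisvalid from pg_index where indexrelid = 'hash_t2_id1'::regclass;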

-- error, does not support unique indexes
create unique index hash_t2_id2 on hash_table_2 using hash (sex);

-- hash_t2_id2 (fillfactor=25) occupies more disk space than hash_t2_id3 (fillfactor=75)
create index hash_t2_id2 on hash_table_2 using hash (id) with (fillfactor=25);
create index hash_t2_id3 on hash_table_2 using hash (id) with (fillfactor=75);
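
-- illustrative size comparison (a sketch, assuming pg_relation_size and
-- pg_size_pretty behave as in PostgreSQL): a lower fillfactor pre-allocates
-- more bucket pages, so hash_t2_id2 is expected to be the larger index
select pg_size_pretty(pg_relation_size('hash_t2_id2')) as size_ff25,
       pg_size_pretty(pg_relation_size('hash_t2_id3')) as size_ff75;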

select count(*) from hash_table_2; --100100

-- cleanup env
drop table hash_table_2 cascade;

------------------
-- hash_table_3 --
------------------
drop schema if exists hash_sc_3 cascade;
drop tablespace if exists hash_sp_3;
create schema hash_sc_3;
create tablespace hash_sp_3 relative location 'tablespace/tablespace_1';
create table hash_sc_3.hash_table_3
(
id int, name varchar,
sex varchar default 'male'
)
tablespace hash_sp_3;
-- create indexes, specifying schema and tablespace
create index concurrently hash_sc_3.hash_t3_id1 on hash_sc_3.hash_table_3 using hash (id);
create index hash_sc_3.hash_t3_id2 on hash_sc_3.hash_table_3 using hash (id) tablespace hash_sp_3;

drop table hash_sc_3.hash_table_3 cascade;
drop schema hash_sc_3 cascade;
drop tablespace hash_sp_3;

------------------
-- hash_table_4 --
------------------
drop table if exists hash_table_4 cascade;
create table hash_table_4
(
id int,
name varchar,
sex varchar default 'male'
)
partition by range(id)
(
partition p1 values less than (1000),
partition p2 values less than (2000),
partition p3 values less than (3000),
partition p4 values less than (maxvalue)
);

-- hash indexes only support local indexes on partitioned tables
drop index if exists hash_t4_id1;
drop index if exists hash_t4_id2;
drop index if exists hash_t4_id2_new;
create index hash_t4_id1 on hash_table_4 using hash(id) global;
create index hash_t4_id2 on hash_table_4 using hash(id) local
(
partition index_t4_p1,
partition index_t4_p2,
partition index_t4_p3,
partition index_t4_p4
);
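
-- illustrative follow-up (a sketch, not required by the original script): a
-- lookup on the partition key should be able to use the local hash index
explain (costs off) select * from hash_table_4 where id = 1500;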

-- alter index: rename, mark a partition unusable, then rebuild it
insert into hash_table_4 select random()*5000, 'XXX', 'XXX' from generate_series(1,1000);
alter index hash_t4_id2 rename to hash_t4_id2_new;
alter index hash_t4_id2_new modify partition index_t4_p2 unusable;
reindex index hash_t4_id2_new partition index_t4_p2;

drop table hash_table_4 cascade;

------------------
-- hash_table_5 --
------------------
drop table if exists hash_table_5;
create temporary table hash_table_5(id int, name varchar, sex varchar default 'male');

drop index if exists hash_t5_id1;
create index hash_t5_id1 on hash_table_5 using hash(id) with(fillfactor = 80);

insert into hash_table_5 select random()*100, 'XXX', 'XXX' from generate_series(1,100);
update hash_table_5 set name = 'aaa' where id = 80;
alter index hash_t5_id1 set (fillfactor = 60);
alter index hash_t5_id1 RESET (fillfactor);
explain (costs off) select * from hash_table_5 where id = 80;
drop table hash_table_5 cascade;

------------------
-- hash_table_6 --
------------------
drop table if exists hash_table_6;
create global temporary table hash_table_6(id int, name varchar, sex varchar default 'male');
drop index if exists hash_t6_id1;
create index hash_t6_id1 on hash_table_6 using hash((id*10)) with (fillfactor = 30);
insert into hash_table_6 select random()*100, 'XXX', 'XXX' from generate_series(1,1000);
delete from hash_table_6 where id in (50, 60, 70);
explain (costs off) select * from hash_table_6 where id*10 = 80;
drop table hash_table_6 cascade;