diff --git a/contrib/postgres_fdw/postgres_fdw.cpp b/contrib/postgres_fdw/postgres_fdw.cpp index 23d5af8e6..6d994659d 100644 --- a/contrib/postgres_fdw/postgres_fdw.cpp +++ b/contrib/postgres_fdw/postgres_fdw.cpp @@ -654,10 +654,18 @@ static ForeignScan *postgresGetForeignPlan(PlannerInfo *root, RelOptInfo *basere * complete information about, and (b) it wouldn't work anyway on * older remote servers. Likewise, we don't worry about NOWAIT. */ - if (rc->forUpdate) { - appendStringInfoString(&sql, " FOR UPDATE"); - } else { - appendStringInfoString(&sql, " FOR SHARE"); + switch (rc->strength) { + case LCS_FORKEYSHARE: + case LCS_FORSHARE: + appendStringInfoString(&sql, " FOR SHARE"); + break; + case LCS_FORNOKEYUPDATE: + case LCS_FORUPDATE: + appendStringInfoString(&sql, " FOR UPDATE"); + break; + default: + ereport(ERROR, (errmsg("unknown lock type: %d", rc->strength))); + break; } } } diff --git a/doc/src/sgml/ref/select.sgmlin b/doc/src/sgml/ref/select.sgmlin index b22af7fb1..9dede4222 100644 --- a/doc/src/sgml/ref/select.sgmlin +++ b/doc/src/sgml/ref/select.sgmlin @@ -23,7 +23,7 @@ SELECT [/*+ plan_hint */] [ ALL | DISTINCT [ ON ( expression [, ...] ) ] ] [ LIMIT { [offset,] count | ALL } ] [ OFFSET start [ ROW | ROWS ] ] [ FETCH { FIRST | NEXT } [ count ] { ROW | ROWS } ONLY ] - [ {FOR { UPDATE | SHARE } [ OF table_name [, ...] ] [ NOWAIT ]} [...] ]; + [ {FOR { UPDATE | NO KEY UPDATE | SHARE | KEY SHARE } [ OF table_name [, ...] ] [ NOWAIT ]} [...] ]; TABLE { ONLY {(table_name)| table_name} | table_name [ * ]}; where from_item can be: diff --git a/src/bin/pg_probackup/parsexlog.cpp b/src/bin/pg_probackup/parsexlog.cpp index 3dd81a3f0..8a12b0824 100644 --- a/src/bin/pg_probackup/parsexlog.cpp +++ b/src/bin/pg_probackup/parsexlog.cpp @@ -1819,7 +1819,7 @@ extractPageInfo(XLogReaderState *record, XLogReaderData *reader_data, * source system. 
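 * (Descriptive note on the guard below: the added rmid test exempts heap and
 * heap2 records from the "special way" error path, so after this change only
 * records from other resource managers that set XLR_SPECIAL_REL_UPDATE are
 * still treated as unexpected. This is consistent with the enhanced
 * tuple-lock records introduced elsewhere in this patch.)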
*/ } - else if (info & XLR_SPECIAL_REL_UPDATE) + else if (rmid != RM_HEAP_ID && rmid != RM_HEAP2_ID && (info & XLR_SPECIAL_REL_UPDATE)) { /* * This record type modifies a relation file in some special way, but diff --git a/src/common/backend/catalog/heap.cpp b/src/common/backend/catalog/heap.cpp index b710f3cf0..c9faf0d0c 100644 --- a/src/common/backend/catalog/heap.cpp +++ b/src/common/backend/catalog/heap.cpp @@ -39,6 +39,7 @@ #include "access/transam.h" #include "access/xact.h" #include "access/xlog.h" +#include "access/multixact.h" #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/gs_obsscaninfo.h" @@ -166,6 +167,8 @@ static bool CheckNestedGenerated(ParseState *pstate, Node *node); extern void getErrorTableFilePath(char* buf, int len, Oid databaseid, Oid reid); extern void make_tmptable_cache_key(Oid relNode); +#define RELKIND_IN_RTM (relkind == RELKIND_RELATION || relkind == RELKIND_TOASTVALUE || relkind == RELKIND_MATVIEW) + /* ---------------------------------------------------------------- * XXX UGLY HARD CODED BADNESS FOLLOWS XXX * @@ -1085,7 +1088,7 @@ void InsertPgClassTuple( else nulls[Anum_pg_class_reloptions - 1] = true; - if (relkind == RELKIND_RELATION || relkind == RELKIND_TOASTVALUE || relkind == RELKIND_MATVIEW) + if (RELKIND_IN_RTM) values[Anum_pg_class_relfrozenxid64 - 1] = u_sess->utils_cxt.RecentXmin; else values[Anum_pg_class_relfrozenxid64 - 1] = InvalidTransactionId; @@ -1103,6 +1106,14 @@ void InsertPgClassTuple( } else { nulls[Anum_pg_class_relbucket - 1] = true; } + +#ifndef ENABLE_MULTIPLE_NODES + if (RELKIND_IN_RTM && !is_cstore_option(relkind, reloptions)) { + values[Anum_pg_class_relminmxid - 1] = GetOldestMultiXactId(); + } else { + values[Anum_pg_class_relminmxid - 1] = InvalidMultiXactId; + } +#endif if (bucketcol != NULL) values[Anum_pg_class_relbucketkey - 1] = PointerGetDatum(bucketcol); @@ -1170,7 +1181,7 @@ static void AddNewRelationTuple(Relation pg_class_desc, Relation new_rel_desc, O } /* Initialize relfrozenxid */ - if (relkind == RELKIND_RELATION || relkind == RELKIND_TOASTVALUE || relkind == RELKIND_MATVIEW) { + if (RELKIND_IN_RTM) { /* * Initialize to the minimum XID that could put tuples in the table. 
* We know that no xacts older than RecentXmin are still running, so @@ -2904,7 +2915,7 @@ Oid heap_create_with_catalog(const char *relname, Oid relnamespace, Oid reltable register_on_commit_action(relid, oncommit); if (relpersistence == RELPERSISTENCE_UNLOGGED) { - Assert(relkind == RELKIND_RELATION || relkind == RELKIND_TOASTVALUE || relkind == RELKIND_MATVIEW); + Assert(RELKIND_IN_RTM); heap_create_init_fork(new_rel_desc); } @@ -6239,6 +6250,9 @@ static void addNewPartitionTupleForValuePartitionedTable(Relation pg_partition_r nulls[Anum_pg_partition_reloptions - 1] = true; } values[Anum_pg_partition_relfrozenxid64 - 1] = TransactionIdGetDatum(InvalidTransactionId); +#ifndef ENABLE_MULTIPLE_NODES + values[Anum_pg_partition_relminmxid - 1] = TransactionIdGetDatum(InvalidMultiXactId); +#endif /*form a tuple using values and null array, and insert it*/ tup = heap_form_tuple(RelationGetDescr(pg_partition_rel), values, nulls); HeapTupleSetOid(tup, InvalidOid); @@ -6459,6 +6473,7 @@ void heap_truncate_one_part(Relation rel, Oid partOid) List* partIndexlist = NULL; Relation parentIndex = NULL; mySubid = GetCurrentSubTransactionId(); + MultiXactId multiXid = GetOldestMultiXactId(); partIndexlist = searchPartitionIndexesByblid(partOid); @@ -6477,12 +6492,13 @@ void heap_truncate_one_part(Relation rel, Oid partOid) CheckTableForSerializableConflictIn(rel); - PartitionSetNewRelfilenode(rel, p, u_sess->utils_cxt.RecentXmin); + PartitionSetNewRelfilenode(rel, p, u_sess->utils_cxt.RecentXmin, + RelationIsColStore(rel) ? InvalidMultiXactId : multiXid); /* truncate the toast table */ if (toastOid != InvalidOid) { Relation toastRel = heap_open(toastOid, AccessExclusiveLock); - RelationSetNewRelfilenode(toastRel, u_sess->utils_cxt.RecentXmin); + RelationSetNewRelfilenode(toastRel, u_sess->utils_cxt.RecentXmin, multiXid); heap_close(toastRel, NoLock); } diff --git a/src/common/backend/catalog/index.cpp b/src/common/backend/catalog/index.cpp index fd18ceed0..5d223e13b 100644 --- a/src/common/backend/catalog/index.cpp +++ b/src/common/backend/catalog/index.cpp @@ -2755,7 +2755,7 @@ void index_build(Relation heapRelation, Partition heapPartition, Relation indexR Oid psortRelId = targetIndexRelation->rd_rel->relcudescrelid; Relation psortRel = relation_open(psortRelId, AccessExclusiveLock); - RelationSetNewRelfilenode(psortRel, u_sess->utils_cxt.RecentXmin); + RelationSetNewRelfilenode(psortRel, u_sess->utils_cxt.RecentXmin, InvalidMultiXactId); relation_close(psortRel, NoLock); } @@ -3192,7 +3192,7 @@ double IndexBuildHeapScan(Relation heapRelation, Relation indexRelation, IndexIn * unless it's our own deletion or a system catalog. 
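 * (HeapTupleGetUpdateXid() is used below instead of the raw xmax: under the
 * enhanced tuple-lock scheme xmax may be a MultiXactId holding lockers plus
 * at most one updater, and for a plain xmax, as the adjacent Assert expects,
 * the two values are identical.)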
*/ Assert(!(heapTuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); - xwait = HeapTupleGetRawXmax(heapTuple); + xwait = HeapTupleGetUpdateXid(heapTuple); if (!TransactionIdIsCurrentTransactionId(xwait)) { if (!is_system_catalog) ereport(WARNING, @@ -4220,7 +4220,7 @@ void reindex_indexpart_internal(Relation heapRelation, Relation iRel, IndexInfo* heapPart = partitionOpen(heapRelation, heapPartId, ShareLock); indexpart = partitionOpen(iRel, indexPartId, AccessExclusiveLock); - PartitionSetNewRelfilenode(iRel, indexpart, InvalidTransactionId); + PartitionSetNewRelfilenode(iRel, indexpart, InvalidTransactionId, InvalidMultiXactId); index_build(heapRelation, heapPart, iRel, indexpart, indexInfo, false, true, INDEX_CREATE_LOCAL_PARTITION, true); @@ -4243,7 +4243,7 @@ void ReindexGlobalIndexInternal(Relation heapRelation, Relation iRel, IndexInfo* partitionList = relationGetPartitionList(heapRelation, ShareLock); /* We'll build a new physical relation for the index */ - RelationSetNewRelfilenode(iRel, InvalidTransactionId); + RelationSetNewRelfilenode(iRel, InvalidTransactionId, InvalidMultiXactId); /* Initialize the index and rebuild */ /* Note: we do not need to re-establish pkey setting */ @@ -4404,7 +4404,7 @@ void reindex_index(Oid indexId, Oid indexPartId, bool skip_constraint_checks, TrRelationSetNewRelfilenode(iRel, InvalidTransactionId, baseDesc); } else { /* We'll build a new physical relation for the index */ - RelationSetNewRelfilenode(iRel, InvalidTransactionId); + RelationSetNewRelfilenode(iRel, InvalidTransactionId, InvalidMultiXactId); } /* Initialize the index and rebuild */ @@ -4859,7 +4859,7 @@ void reindex_partIndex(Relation heapRel, Partition heapPart, Relation indexRel, */ // change the storage of part index - PartitionSetNewRelfilenode(indexRel, indexPart, InvalidTransactionId); + PartitionSetNewRelfilenode(indexRel, indexPart, InvalidTransactionId, InvalidMultiXactId); // build the part index indexInfo = BuildIndexInfo(indexRel); @@ -5139,7 +5139,7 @@ static void reindexPartIndex(Oid indexId, Oid partOid, bool skip_constraint_chec // REINDEX INDEX CheckPartitionNotInUse(indexpart, "REINDEX INDEX index_partition"); - PartitionSetNewRelfilenode(iRel, indexpart, InvalidTransactionId); + PartitionSetNewRelfilenode(iRel, indexpart, InvalidTransactionId, InvalidMultiXactId); index_build(heapRelation, heapPart, iRel, indexpart, indexInfo, false, true, INDEX_CREATE_LOCAL_PARTITION); /* @@ -5219,7 +5219,7 @@ static void reindexPartIndex(Oid indexId, Oid partOid, bool skip_constraint_chec } /* Update reltuples and relpages in pg_class for partitioned index. 
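 * (The new trailing argument in the calls below is the relminmxid to store;
 * index relations never contain multixacts, so every index path in this
 * patch passes InvalidMultiXactId.)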
*/ - vac_update_pgclass_partitioned_table(iRel, false, InvalidTransactionId); + vac_update_pgclass_partitioned_table(iRel, false, InvalidTransactionId, InvalidMultiXactId); /* Close rels, but keep locks */ index_close(iRel, NoLock); diff --git a/src/common/backend/catalog/pg_partition.cpp b/src/common/backend/catalog/pg_partition.cpp index 04314ad54..6340fe686 100644 --- a/src/common/backend/catalog/pg_partition.cpp +++ b/src/common/backend/catalog/pg_partition.cpp @@ -36,6 +36,8 @@ #include "utils/inval.h" #include "utils/syscache.h" #include "access/genam.h" +#include "access/multixact.h" +#include "access/reloptions.h" #include "utils/fmgroids.h" #include "access/heapam.h" #include "utils/snapmgr.h" @@ -129,8 +131,19 @@ void insertPartitionEntry(Relation pg_partition_desc, Partition new_part_desc, O if (parttype == PART_OBJ_TYPE_TABLE_PARTITION) { values[Anum_pg_partition_relfrozenxid64 - 1] = u_sess->utils_cxt.RecentXmin; + +#ifndef ENABLE_MULTIPLE_NODES + if (!is_cstore_option(RELKIND_RELATION, reloptions)) { + values[Anum_pg_partition_relminmxid - 1] = GetOldestMultiXactId(); + } else { + values[Anum_pg_partition_relminmxid - 1] = InvalidMultiXactId; + } +#endif } else { values[Anum_pg_partition_relfrozenxid64 - 1] = InvalidTransactionId; +#ifndef ENABLE_MULTIPLE_NODES + values[Anum_pg_partition_relminmxid - 1] = InvalidMultiXactId; +#endif } /* form a tuple using values and null array, and insert it */ diff --git a/src/common/backend/nodes/copyfuncs.cpp b/src/common/backend/nodes/copyfuncs.cpp index 71e99dc0b..44bce0b47 100644 --- a/src/common/backend/nodes/copyfuncs.cpp +++ b/src/common/backend/nodes/copyfuncs.cpp @@ -3493,6 +3493,9 @@ static RowMarkClause* _copyRowMarkClause(const RowMarkClause* from) COPY_SCALAR_FIELD(forUpdate); COPY_SCALAR_FIELD(noWait); COPY_SCALAR_FIELD(pushedDown); + if (t_thrd.proc->workingVersionNum >= ENHANCED_TUPLE_LOCK_VERSION_NUM) { + COPY_SCALAR_FIELD(strength); + } return newnode; } @@ -3967,6 +3970,9 @@ static LockingClause* _copyLockingClause(const LockingClause* from) COPY_NODE_FIELD(lockedRels); COPY_SCALAR_FIELD(forUpdate); COPY_SCALAR_FIELD(noWait); + if (t_thrd.proc->workingVersionNum >= ENHANCED_TUPLE_LOCK_VERSION_NUM) { + COPY_SCALAR_FIELD(strength); + } return newnode; } diff --git a/src/common/backend/nodes/equalfuncs.cpp b/src/common/backend/nodes/equalfuncs.cpp index 4afa8ae7d..8496b8881 100644 --- a/src/common/backend/nodes/equalfuncs.cpp +++ b/src/common/backend/nodes/equalfuncs.cpp @@ -2542,6 +2542,9 @@ static bool _equalLockingClause(const LockingClause* a, const LockingClause* b) COMPARE_NODE_FIELD(lockedRels); COMPARE_SCALAR_FIELD(forUpdate); COMPARE_SCALAR_FIELD(noWait); + if (t_thrd.proc->workingVersionNum >= ENHANCED_TUPLE_LOCK_VERSION_NUM) { + COMPARE_SCALAR_FIELD(strength); + } return true; } @@ -2680,6 +2683,9 @@ static bool _equalRowMarkClause(const RowMarkClause* a, const RowMarkClause* b) COMPARE_SCALAR_FIELD(forUpdate); COMPARE_SCALAR_FIELD(noWait); COMPARE_SCALAR_FIELD(pushedDown); + if (t_thrd.proc->workingVersionNum >= ENHANCED_TUPLE_LOCK_VERSION_NUM) { + COMPARE_SCALAR_FIELD(strength); + } return true; } diff --git a/src/common/backend/nodes/outfuncs.cpp b/src/common/backend/nodes/outfuncs.cpp index 7e4978d42..091fb917d 100755 --- a/src/common/backend/nodes/outfuncs.cpp +++ b/src/common/backend/nodes/outfuncs.cpp @@ -3634,6 +3634,9 @@ static void _outLockingClause(StringInfo str, LockingClause* node) WRITE_NODE_FIELD(lockedRels); WRITE_BOOL_FIELD(forUpdate); WRITE_BOOL_FIELD(noWait); + if 
(t_thrd.proc->workingVersionNum >= ENHANCED_TUPLE_LOCK_VERSION_NUM) { + WRITE_ENUM_FIELD(strength, LockClauseStrength); + } } static void _outXmlSerialize(StringInfo str, XmlSerialize* node) @@ -4269,6 +4272,9 @@ static void _outRowMarkClause(StringInfo str, RowMarkClause* node) WRITE_BOOL_FIELD(forUpdate); WRITE_BOOL_FIELD(noWait); WRITE_BOOL_FIELD(pushedDown); + if (t_thrd.proc->workingVersionNum >= ENHANCED_TUPLE_LOCK_VERSION_NUM) { + WRITE_ENUM_FIELD(strength, LockClauseStrength); + } } static void _outWithClause(StringInfo str, WithClause* node) diff --git a/src/common/backend/nodes/readfuncs.cpp b/src/common/backend/nodes/readfuncs.cpp index e7a4c679a..ac6d70fb7 100755 --- a/src/common/backend/nodes/readfuncs.cpp +++ b/src/common/backend/nodes/readfuncs.cpp @@ -1608,6 +1608,9 @@ static RowMarkClause* _readRowMarkClause(void) READ_BOOL_FIELD(forUpdate); READ_BOOL_FIELD(noWait); READ_BOOL_FIELD(pushedDown); + IF_EXIST(strength) { + READ_ENUM_FIELD(strength, LockClauseStrength); + } READ_DONE(); } diff --git a/src/common/backend/parser/analyze.cpp b/src/common/backend/parser/analyze.cpp index bc4c63490..9349ce9d4 100644 --- a/src/common/backend/parser/analyze.cpp +++ b/src/common/backend/parser/analyze.cpp @@ -4124,7 +4124,7 @@ static bool is_rel_child_of_rel(RangeTblEntry* child_rte, RangeTblEntry* parent_ #endif /* - * Check for features that are not supported together with FOR UPDATE/SHARE. + * Check for features that are not supported together with FOR [KEY] UPDATE/SHARE. * * exported so planner can check again after rewriting, query pullup, etc */ @@ -4133,42 +4133,44 @@ void CheckSelectLocking(Query* qry) if (qry->setOperations) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with UNION/INTERSECT/EXCEPT"))); + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE is not allowed " + "with UNION/INTERSECT/EXCEPT"))); } if (qry->distinctClause != NIL) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with DISTINCT clause"))); + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE is not allowed with DISTINCT clause"))); } if (qry->groupClause != NIL) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with GROUP BY clause"))); + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE is not allowed with GROUP BY clause"))); } if (qry->havingQual != NULL) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with HAVING clause"))); + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE is not allowed with HAVING clause"))); } if (qry->hasAggs) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with aggregate functions"))); + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE is not allowed with aggregate functions"))); } if (qry->hasWindowFuncs) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with window functions"))); + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE is not allowed with window functions"))); } if (expression_returns_set((Node*)qry->targetList)) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE is not allowed with set-returning functions in the target list"))); + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE is not allowed with set-returning functions " + "in the 
target list"))); } } /* - * Transform a FOR UPDATE/SHARE clause + * Transform a FOR [KEY] UPDATE/SHARE clause * * This basically involves replacing names by integer relids. * @@ -4189,7 +4191,7 @@ static void transformLockingClause(ParseState* pstate, Query* qry, LockingClause /* make a clause we can pass down to subqueries to select all rels */ allrels = makeNode(LockingClause); allrels->lockedRels = NIL; /* indicates all rels */ - allrels->forUpdate = lc->forUpdate; + allrels->strength = lc->strength; allrels->noWait = lc->noWait; if (lockedRels == NIL) { @@ -4206,20 +4208,20 @@ static void transformLockingClause(ParseState* pstate, Query* qry, LockingClause heap_close(rel, AccessShareLock); ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be used with column table \"%s\"", - rte->eref->aliasname))); + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE cannot be used with " + "column table \"%s\"", rte->eref->aliasname))); } heap_close(rel, AccessShareLock); - applyLockingClause(qry, i, lc->forUpdate, lc->noWait, pushedDown); + applyLockingClause(qry, i, lc->strength, lc->noWait, pushedDown); rte->requiredPerms |= ACL_SELECT_FOR_UPDATE; break; case RTE_SUBQUERY: - applyLockingClause(qry, i, lc->forUpdate, lc->noWait, pushedDown); + applyLockingClause(qry, i, lc->strength, lc->noWait, pushedDown); /* - * FOR UPDATE/SHARE of subquery is propagated to all of + * FOR [KEY] UPDATE/SHARE of subquery is propagated to all of * subquery's rels, too. We could do this later (based on * the marking of the subquery RTE) but it is convenient * to have local knowledge in each query level about which @@ -4246,7 +4248,8 @@ static void transformLockingClause(ParseState* pstate, Query* qry, LockingClause if (thisrel->catalogname || thisrel->schemaname) { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("SELECT FOR UPDATE/SHARE must specify unqualified relation names"), + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE must specify unqualified " + "relation names"), parser_errposition(pstate, thisrel->location))); } @@ -4263,42 +4266,46 @@ static void transformLockingClause(ParseState* pstate, Query* qry, LockingClause heap_close(rel, AccessShareLock); ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be used with column table \"%s\"", - rte->eref->aliasname), + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE cannot be used with " + "column table \"%s\"", rte->eref->aliasname), parser_errposition(pstate, thisrel->location))); } heap_close(rel, AccessShareLock); - applyLockingClause(qry, i, lc->forUpdate, lc->noWait, pushedDown); + applyLockingClause(qry, i, lc->strength, lc->noWait, pushedDown); rte->requiredPerms |= ACL_SELECT_FOR_UPDATE; break; case RTE_SUBQUERY: - applyLockingClause(qry, i, lc->forUpdate, lc->noWait, pushedDown); + applyLockingClause(qry, i, lc->strength, lc->noWait, pushedDown); /* see comment above */ transformLockingClause(pstate, rte->subquery, allrels, true); break; case RTE_JOIN: ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to a join"), + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE cannot be applied " + "to a join"), parser_errposition(pstate, thisrel->location))); break; case RTE_FUNCTION: ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to a function"), + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE cannot be applied 
" + "to a function"), parser_errposition(pstate, thisrel->location))); break; case RTE_VALUES: ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to VALUES"), + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE cannot be applied " + "to VALUES"), parser_errposition(pstate, thisrel->location))); break; case RTE_CTE: ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to a WITH query"), + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE cannot be applied " + "to a WITH query"), parser_errposition(pstate, thisrel->location))); break; default: @@ -4313,7 +4320,8 @@ static void transformLockingClause(ParseState* pstate, Query* qry, LockingClause if (rt == NULL) { ereport(ERROR, (errcode(ERRCODE_UNDEFINED_TABLE), - errmsg("relation \"%s\" in FOR UPDATE/SHARE clause not found in FROM clause", thisrel->relname), + errmsg("relation \"%s\" in FOR UPDATE/SHARE//NO KEY UPDATE/KEY SHARE clause not found " + "in FROM clause", thisrel->relname), parser_errposition(pstate, thisrel->location))); } } @@ -4323,7 +4331,7 @@ static void transformLockingClause(ParseState* pstate, Query* qry, LockingClause /* * Record locking info for a single rangetable item */ -void applyLockingClause(Query* qry, Index rtindex, bool forUpdate, bool noWait, bool pushedDown) +void applyLockingClause(Query* qry, Index rtindex, LockClauseStrength strength, bool noWait, bool pushedDown) { RowMarkClause* rc = NULL; @@ -4335,10 +4343,10 @@ void applyLockingClause(Query* qry, Index rtindex, bool forUpdate, bool noWait, /* Check for pre-existing entry for same rtindex */ if ((rc = get_parse_rowmark(qry, rtindex)) != NULL) { /* - * If the same RTE is specified both FOR UPDATE and FOR SHARE, treat - * it as FOR UPDATE. (Reasonable, since you can't take both a shared - * and exclusive lock at the same time; it'll end up being exclusive - * anyway.) + * If the same RTE is specified for more than one locking strength, + * treat is as the strongest. (Reasonable, since you can't take both a + * shared and exclusive lock at the same time; it'll end up being + * exclusive anyway.) * * We also consider that NOWAIT wins if it's specified both ways. This * is a bit more debatable but raising an error doesn't seem helpful. @@ -4347,7 +4355,7 @@ void applyLockingClause(Query* qry, Index rtindex, bool forUpdate, bool noWait, * * And of course pushedDown becomes false if any clause is explicit. 
*/ - rc->forUpdate = rc->forUpdate || forUpdate; + rc->strength = Max(rc->strength, strength); rc->noWait = rc->noWait || noWait; rc->pushedDown = rc->pushedDown && pushedDown; return; @@ -4356,7 +4364,7 @@ void applyLockingClause(Query* qry, Index rtindex, bool forUpdate, bool noWait, /* Make a new RowMarkClause */ rc = makeNode(RowMarkClause); rc->rti = rtindex; - rc->forUpdate = forUpdate; + rc->strength = strength; rc->noWait = noWait; rc->pushedDown = pushedDown; qry->rowMarks = lappend(qry->rowMarks, rc); diff --git a/src/common/backend/parser/gram.y b/src/common/backend/parser/gram.y index 62acf7bdf..92e32b378 100644 --- a/src/common/backend/parser/gram.y +++ b/src/common/backend/parser/gram.y @@ -287,6 +287,7 @@ static Node *make_node_from_scanbuf(int start_pos, int end_pos, core_yyscan_t yy MergeWhenClause *mergewhen; UpsertClause *upsert; EncryptionType algtype; + LockClauseStrength lockstrength; } %type <node> stmt schema_stmt @@ -479,6 +480,7 @@ static Node *make_node_from_scanbuf(int start_pos, int end_pos, core_yyscan_t yy %type <ival> OptTemp OptKind %type <oncommit> OnCommitOption +%type <lockstrength> for_locking_strength %type <node> for_locking_item %type <list> for_locking_clause opt_for_locking_clause for_locking_items %type <list> locked_rels_list @@ -16901,9 +16903,9 @@ select_with_parens: * The duplicative productions are annoying, but hard to get rid of without * creating shift/reduce conflicts. * - * FOR UPDATE/SHARE may be before or after LIMIT/OFFSET. + * The locking clause (FOR UPDATE etc) may be before or after LIMIT/OFFSET. * In <=7.2.X, LIMIT/OFFSET had to be after FOR UPDATE - * We now support both orderings, but prefer LIMIT/OFFSET before FOR UPDATE/SHARE + * We now support both orderings, but prefer LIMIT/OFFSET before the locking clause. * 2002-08-28 bjm */ select_no_parens: @@ -17452,21 +17454,38 @@ for_locking_items: ; for_locking_item: - FOR UPDATE locked_rels_list opt_nowait + for_locking_strength locked_rels_list opt_nowait { LockingClause *n = makeNode(LockingClause); - n->lockedRels = $3; - n->forUpdate = TRUE; - n->noWait = $4; + n->lockedRels = $2; + n->strength = $1; + n->noWait = $3; $$ = (Node *) n; } - | FOR SHARE locked_rels_list opt_nowait + ; + +for_locking_strength: + FOR UPDATE { $$ = LCS_FORUPDATE; } + | FOR NO KEY UPDATE { - LockingClause *n = makeNode(LockingClause); - n->lockedRels = $3; - n->forUpdate = FALSE; - n->noWait = $4; - $$ = (Node *) n; +#ifdef ENABLE_MULTIPLE_NODES + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("SELECT FOR NO KEY UPDATE is not yet supported."))); +#else + $$ = LCS_FORNOKEYUPDATE; +#endif + } + | FOR SHARE { $$ = LCS_FORSHARE; } + | FOR KEY SHARE + { +#ifdef ENABLE_MULTIPLE_NODES + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("SELECT FOR KEY SHARE is not yet supported."))); +#else + $$ = LCS_FORKEYSHARE; +#endif } ; diff --git a/src/common/backend/pgxc_single/pool/execRemote.cpp b/src/common/backend/pgxc_single/pool/execRemote.cpp index a3b6b6615..7defcb4eb 100755 --- a/src/common/backend/pgxc_single/pool/execRemote.cpp +++ b/src/common/backend/pgxc_single/pool/execRemote.cpp @@ -26,6 +26,7 @@ #include "access/transam.h" #include "access/xact.h" #include "access/relscan.h" +#include "access/multixact.h" #include "catalog/pg_namespace.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" @@ -9457,7 +9458,8 @@ static void ReceivePageAndTuple(Oid relid, TupleTableSlot* slot, VacuumStmt* stm rel = relation_open(relid, ShareUpdateExclusiveLock); classRel = heap_open(RelationRelationId, RowExclusiveLock); -
vac_update_relstats(rel, classRel, relpages, reltuples, relallvisible, hasindex, BootstrapTransactionId); + vac_update_relstats(rel, classRel, relpages, reltuples, relallvisible, + hasindex, BootstrapTransactionId, InvalidMultiXactId); /* Save the flag identifying whether there is dirty data in the relation. */ if (stmt != NULL) { @@ -9871,7 +9873,9 @@ static void ReceivePartitionPageAndTuple(Oid relid, TupleTableSlot* slot) } partrel = partitionOpen(rel, partitionid, NoLock); - vac_update_partstats(partrel, (BlockNumber)relpages, reltuples, relallvisible, BootstrapTransactionId); + vac_update_partstats(partrel, (BlockNumber)relpages, reltuples, relallvisible, + BootstrapTransactionId, RelationIsColStore(rel) ? InvalidMultiXactId : FirstMultiXactId); + /* * we do not fetch dead tuples info from remote DN/CN, just set deadtuples to 0. It does * not matter because we should fetch all deadtuples info from all datanodes to calculate a diff --git a/src/common/backend/utils/adt/ri_triggers.cpp b/src/common/backend/utils/adt/ri_triggers.cpp index dd48a9e78..c594cf1a8 100644 --- a/src/common/backend/utils/adt/ri_triggers.cpp +++ b/src/common/backend/utils/adt/ri_triggers.cpp @@ -285,7 +285,7 @@ static Datum RI_FKey_check(PG_FUNCTION_ARGS) * Get the relation descriptors of the FK and PK tables. * * pk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. + * SELECT FOR KEY SHARE will get on it. */ fk_rel = trigdata->tg_relation; pk_rel = heap_open(riinfo.pk_relid, RowShareLock); @@ -320,8 +320,13 @@ static Datum RI_FKey_check(PG_FUNCTION_ARGS) * ---------- */ quoteRelationName(pkrelname, pk_rel); +#ifdef ENABLE_MULTIPLE_NODES rc = snprintf_s( querystr, sizeof(querystr), sizeof(querystr) - 1, "SELECT 1 FROM ONLY %s x FOR SHARE OF x", pkrelname); +#else + rc = snprintf_s(querystr, sizeof(querystr), sizeof(querystr) - 1, + "SELECT 1 FROM ONLY %s x FOR KEY SHARE OF x", pkrelname); +#endif securec_check_ss(rc, "\0", "\0"); /* Prepare and save the plan */ @@ -436,7 +441,8 @@ static Datum RI_FKey_check(PG_FUNCTION_ARGS) /* ---------- * The query string built is - * SELECT 1 FROM ONLY <pktable> WHERE pkatt1 = $1 [AND ...] FOR SHARE + * SELECT 1 FROM ONLY <pktable> x WHERE pkatt1 = $1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * corresponding FK attributes. * ---------- */ @@ -456,7 +462,11 @@ static Datum RI_FKey_check(PG_FUNCTION_ARGS) querysep = "AND"; queryoids[i] = fk_type; } +#ifdef ENABLE_MULTIPLE_NODES appendStringInfo(&querybuf, " FOR SHARE OF x"); +#else + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); +#endif /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids, &qkey, fk_rel, pk_rel, true); @@ -581,7 +591,8 @@ static bool ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel, HeapTuple old_ro /* ---------- * The query string built is - * SELECT 1 FROM ONLY <pktable> WHERE pkatt1 = $1 [AND ...] FOR SHARE + * SELECT 1 FROM ONLY <pktable> x WHERE pkatt1 = $1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * PK attributes themselves.
* ---------- @@ -601,7 +612,12 @@ static bool ri_Check_Pk_Match(Relation pk_rel, Relation fk_rel, HeapTuple old_ro querysep = "AND"; queryoids[i] = pk_type; } +#ifdef ENABLE_MULTIPLE_NODES appendStringInfo(&querybuf, " FOR SHARE OF x"); +#else + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); +#endif + /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo->nkeys, queryoids, &qkey, fk_rel, pk_rel, true); @@ -664,7 +680,7 @@ Datum RI_FKey_noaction(PG_FUNCTION_ARGS) * (the new and old tuple for update) * * fk_rel is opened in RowShareLock mode since that's what our eventual - * SELECT FOR SHARE will get on it. + * SELECT FOR KEY SHARE will get on it. */ fk_rel = heap_open(riinfo.fk_relid, RowShareLock); pk_rel = trigdata->tg_relation; @@ -749,7 +765,8 @@ Datum RI_FKey_noaction(PG_FUNCTION_ARGS) /* ---------- * The query string built is - * SELECT 1 FROM ONLY <fktable> WHERE $1 = fkatt1 [AND ...] + * SELECT 1 FROM ONLY <fktable> x WHERE $1 = fkatt1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * corresponding PK attributes. * ---------- @@ -769,7 +786,11 @@ Datum RI_FKey_noaction(PG_FUNCTION_ARGS) querysep = "AND"; queryoids[i] = pk_type; } +#ifdef ENABLE_MULTIPLE_NODES appendStringInfo(&querybuf, " FOR SHARE OF x"); +#else + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); +#endif /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids, &qkey, fk_rel, pk_rel, true); @@ -929,7 +950,7 @@ Datum RI_FKey_cascade_del(PG_FUNCTION_ARGS) /* ---------- * The query string built is - * DELETE FROM ONLY <fktable> WHERE $1 = fkatt1 [AND ...] + * DELETE FROM ONLY <fktable> x WHERE $1 = fkatt1 [AND ...] * The type id's for the $ parameters are those of the * corresponding PK attributes. * ---------- @@ -1277,7 +1298,8 @@ Datum RI_FKey_restrict(PG_FUNCTION_ARGS) /* ---------- * The query string built is - * SELECT 1 FROM ONLY <fktable> WHERE $1 = fkatt1 [AND ...] + * SELECT 1 FROM ONLY <fktable> x WHERE $1 = fkatt1 [AND ...] + * FOR KEY SHARE OF x * The type id's for the $ parameters are those of the * corresponding PK attributes.
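 * (FOR KEY SHARE is the weakest of the four row-lock strengths: it conflicts
 * only with DELETE and with UPDATEs that change key columns, so an RI check
 * no longer blocks behind a concurrent non-key UPDATE of the referenced row.)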
* ---------- @@ -1297,7 +1319,11 @@ Datum RI_FKey_restrict(PG_FUNCTION_ARGS) querysep = "AND"; queryoids[i] = pk_type; } +#ifdef ENABLE_MULTIPLE_NODES appendStringInfo(&querybuf, " FOR SHARE OF x"); +#else + appendStringInfo(&querybuf, " FOR KEY SHARE OF x"); +#endif /* Prepare and save the plan */ qplan = ri_PlanCheck(querybuf.data, riinfo.nkeys, queryoids, &qkey, fk_rel, pk_rel, true); diff --git a/src/common/backend/utils/adt/ruleutils.cpp b/src/common/backend/utils/adt/ruleutils.cpp index f722db1b2..52892d04f 100644 --- a/src/common/backend/utils/adt/ruleutils.cpp +++ b/src/common/backend/utils/adt/ruleutils.cpp @@ -5425,7 +5425,7 @@ static void get_select_query_def(Query* query, deparse_context* context, TupleDe get_rule_expr(query->limitCount, context, false); } - /* Add FOR UPDATE/SHARE clauses if present */ + /* Add FOR [KEY] UPDATE/SHARE clauses if present */ if (query->hasForUpdate) { foreach (l, query->rowMarks) { RowMarkClause* rc = (RowMarkClause*)lfirst(l); @@ -5435,10 +5435,30 @@ static void get_select_query_def(Query* query, deparse_context* context, TupleDe if (rc->pushedDown) continue; +#ifndef ENABLE_MULTIPLE_NODES + switch (rc->strength) { + case LCS_FORKEYSHARE: + appendContextKeyword(context, " FOR KEY SHARE", -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORSHARE: + appendContextKeyword(context, " FOR SHARE", -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORNOKEYUPDATE: + appendContextKeyword(context, " FOR NO KEY UPDATE", -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORUPDATE: + appendContextKeyword(context, " FOR UPDATE", -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + default: + ereport(ERROR, (errmsg("unknown lock type: %d", rc->strength))); + break; + } +#else if (rc->forUpdate) appendContextKeyword(context, " FOR UPDATE", -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); else appendContextKeyword(context, " FOR SHARE", -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); +#endif appendStringInfo(buf, " OF %s", quote_identifier(rte->eref->aliasname)); if (rc->noWait) appendStringInfo(buf, " NOWAIT"); diff --git a/src/common/backend/utils/cache/partcache.cpp b/src/common/backend/utils/cache/partcache.cpp index 81945667c..28a14e2af 100644 --- a/src/common/backend/utils/cache/partcache.cpp +++ b/src/common/backend/utils/cache/partcache.cpp @@ -1448,7 +1448,7 @@ static void PartitionReloadIndexInfo(Partition part) * Output : * Notes : */ -void PartitionSetNewRelfilenode(Relation parent, Partition part, TransactionId freezeXid) +void PartitionSetNewRelfilenode(Relation parent, Partition part, TransactionId freezeXid, MultiXactId freezeMultiXid) { Oid newrelfilenode; RelFileNodeBackend newrnode; @@ -1557,6 +1557,11 @@ void PartitionSetNewRelfilenode(Relation parent, Partition part, TransactionId f replaces[Anum_pg_partition_relfrozenxid64 - 1] = true; values[Anum_pg_partition_relfrozenxid64 - 1] = TransactionIdGetDatum(freezeXid); +#ifndef ENABLE_MULTIPLE_NODES + replaces[Anum_pg_partition_relminmxid - 1] = true; + values[Anum_pg_partition_relminmxid - 1] = TransactionIdGetDatum(freezeMultiXid); +#endif + ntup = heap_modify_tuple(tuple, RelationGetDescr(pg_partition), values, nulls, replaces); simple_heap_update(pg_partition, &ntup->t_self, ntup); diff --git a/src/common/backend/utils/cache/relcache.cpp b/src/common/backend/utils/cache/relcache.cpp index 2e894857d..9a4e52231 100644 --- a/src/common/backend/utils/cache/relcache.cpp +++ b/src/common/backend/utils/cache/relcache.cpp @@ -37,6 +37,7 @@ #include "access/transam.h" #include 
"access/xact.h" #include "access/xlog.h" +#include "access/multixact.h" #include "catalog/catalog.h" #include "catalog/heap.h" #include "catalog/catversion.h" @@ -3444,6 +3445,7 @@ static void RelationDestroyRelation(Relation relation, bool remember_tupdesc) } list_free_ext(relation->rd_indexlist); bms_free_ext(relation->rd_indexattr); + bms_free_ext(relation->rd_keyattr); bms_free_ext(relation->rd_idattr); FreeTriggerDesc(relation->trigdesc); if (relation->rd_rlsdesc) { @@ -4452,7 +4454,7 @@ extern void heap_create_init_fork(Relation rel); void DeltaTableSetNewRelfilenode(Oid relid, TransactionId freezeXid, bool partition) { Relation deltaRel = heap_open(relid, AccessExclusiveLock); - RelationSetNewRelfilenode(deltaRel, freezeXid); + RelationSetNewRelfilenode(deltaRel, freezeXid, InvalidMultiXactId); // skip partition because one partition CANNOT be unlogged. if (!partition && RELPERSISTENCE_UNLOGGED == deltaRel->rd_rel->relpersistence) { heap_create_init_fork(deltaRel); @@ -4467,7 +4469,7 @@ void DescTableSetNewRelfilenode(Oid relid, TransactionId freezeXid, bool partiti // Because indexRelation has locked as AccessExclusiveLock, so it is safe // Relation cudescRel = heap_open(relid, AccessExclusiveLock); - RelationSetNewRelfilenode(cudescRel, freezeXid); + RelationSetNewRelfilenode(cudescRel, freezeXid, InvalidMultiXactId); // skip partition because one partition CANNOT be unlogged. if (!partition && RELPERSISTENCE_UNLOGGED == cudescRel->rd_rel->relpersistence) { @@ -4480,7 +4482,7 @@ void DescTableSetNewRelfilenode(Oid relid, TransactionId freezeXid, bool partiti foreach (indlist, RelationGetIndexList(cudescRel)) { Oid indexId = lfirst_oid(indlist); Relation currentIndex = index_open(indexId, AccessExclusiveLock); - RelationSetNewRelfilenode(currentIndex, InvalidTransactionId); + RelationSetNewRelfilenode(currentIndex, InvalidTransactionId, InvalidMultiXactId); // keep the same logic with row index relation, and // skip checking RELPERSISTENCE_UNLOGGED persistence @@ -4512,7 +4514,7 @@ void DescTableSetNewRelfilenode(Oid relid, TransactionId freezeXid, bool partiti * must be passed for indexes and sequences). This should be a lower bound on * the XIDs that will be put into the new relation contents. 
*/ -void RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid, bool isDfsTruncate) +void RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid, MultiXactId minmulti, bool isDfsTruncate) { Oid newrelfilenode; RelFileNodeBackend newrnode; @@ -4649,6 +4651,11 @@ void RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid, bool replaces[Anum_pg_class_relfrozenxid64 - 1] = true; values[Anum_pg_class_relfrozenxid64 - 1] = TransactionIdGetDatum(freezeXid); +#ifndef ENABLE_MULTIPLE_NODES + replaces[Anum_pg_class_relminmxid - 1] = true; + values[Anum_pg_class_relminmxid - 1] = TransactionIdGetDatum(minmulti); +#endif + nctup = heap_modify_tuple(tuple, RelationGetDescr(pg_class), values, nulls, replaces); simple_heap_update(pg_class, &nctup->t_self, nctup); @@ -6147,6 +6154,7 @@ static void ClusterConstraintFetch(__inout Relation relation) Bitmapset* RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) { Bitmapset* indexattrs = NULL; + Bitmapset* uindexattrs = NULL; List* indexoidlist = NULL; ListCell* l = NULL; Bitmapset* idindexattrs = NULL; /* columns in the the replica identity */ @@ -6160,7 +6168,7 @@ Bitmapset* RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind att case INDEX_ATTR_BITMAP_ALL: return bms_copy(relation->rd_indexattr); case INDEX_ATTR_BITMAP_KEY: - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unknown attrKind %u", attrKind))); + return bms_copy(relation->rd_keyattr); case INDEX_ATTR_BITMAP_IDENTITY_KEY: return bms_copy(relation->rd_idattr); default: @@ -6201,12 +6209,14 @@ Bitmapset* RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind att * won't be returned at all by RelationGetIndexList. */ indexattrs = NULL; + uindexattrs = NULL; idindexattrs = NULL; foreach (l, indexoidlist) { Oid indexOid = lfirst_oid(l); Relation indexDesc; IndexInfo* indexInfo = NULL; int i; + bool isKey = false; /* candidate key */ bool isIDKey = false; /* replica identity index */ indexDesc = index_open(indexOid, AccessShareLock); @@ -6214,6 +6224,9 @@ Bitmapset* RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind att /* Extract index key information from the index's pg_index row */ indexInfo = BuildIndexInfo(indexDesc); + /* Can this index be referenced by a foreign key? */ + isKey = indexInfo->ii_Unique && indexInfo->ii_Expressions == NIL && indexInfo->ii_Predicate == NIL; + /* Is this index the configured (or default) replica identity? */ isIDKey = (indexOid == relreplindex); @@ -6231,6 +6244,9 @@ Bitmapset* RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind att */ if (attrnum != 0) { indexattrs = bms_add_member(indexattrs, attrnum - FirstLowInvalidHeapAttributeNumber); + if (isKey && i < indexInfo->ii_NumIndexKeyAttrs) { + uindexattrs = bms_add_member(uindexattrs, attrnum - FirstLowInvalidHeapAttributeNumber); + } if (isIDKey && i < indexInfo->ii_NumIndexKeyAttrs) idindexattrs = bms_add_member(idindexattrs, attrnum - FirstLowInvalidHeapAttributeNumber); } @@ -6255,6 +6271,7 @@ Bitmapset* RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind att * empty. 
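 * (rd_keyattr, introduced here, caches the columns of unique, non-partial,
 * non-expression indexes -- the candidate-key columns. An updater can compare
 * the set of modified columns against this bitmap to decide whether a
 * key-preserving, weaker tuple lock is sufficient.)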
*/ oldcxt = MemoryContextSwitchTo(u_sess->cache_mem_cxt); + relation->rd_keyattr = bms_copy(uindexattrs); relation->rd_idattr = bms_copy(idindexattrs); relation->rd_indexattr = bms_copy(indexattrs); (void)MemoryContextSwitchTo(oldcxt); @@ -6264,7 +6281,7 @@ case INDEX_ATTR_BITMAP_ALL: return indexattrs; case INDEX_ATTR_BITMAP_KEY: - ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unknown attrKind %u", attrKind))); + return uindexattrs; case INDEX_ATTR_BITMAP_IDENTITY_KEY: return idindexattrs; default: @@ -6822,6 +6839,7 @@ static bool load_relcache_init_file(bool shared) rel->rd_indexlist = NIL; rel->rd_oidindex = InvalidOid; rel->rd_indexattr = NULL; + rel->rd_keyattr = NULL; rel->rd_idattr = NULL; rel->rd_createSubid = InvalidSubTransactionId; rel->rd_newRelfilenodeSubid = InvalidSubTransactionId; diff --git a/src/common/backend/utils/init/globals.cpp b/src/common/backend/utils/init/globals.cpp index c8a2ed4f1..11a4ca4b8 100644 --- a/src/common/backend/utils/init/globals.cpp +++ b/src/common/backend/utils/init/globals.cpp @@ -59,7 +59,7 @@ bool open_join_children = true; bool will_shutdown = false; /* hard-wired binary version number */ -const uint32 GRAND_VERSION_NUM = 92422; +const uint32 GRAND_VERSION_NUM = 92423; const uint32 HINT_ENHANCEMENT_VERSION_NUM = 92359; const uint32 MATVIEW_VERSION_NUM = 92213; @@ -97,6 +97,8 @@ const uint32 V5R2C00_BACKEND_VERSION_NUM = 92412; const uint32 ANALYZER_HOOK_VERSION_NUM = 92420; const uint32 SUPPORT_HASH_XLOG_VERSION_NUM = 92420; +const uint32 ENHANCED_TUPLE_LOCK_VERSION_NUM = 92423; + /* This variable indicates whether the instance is in progress of upgrade as a whole */ uint32 volatile WorkingGrandVersionNum = GRAND_VERSION_NUM; diff --git a/src/common/backend/utils/time/combocid.cpp b/src/common/backend/utils/time/combocid.cpp index 3833f4a3f..998d28f51 100644 --- a/src/common/backend/utils/time/combocid.cpp +++ b/src/common/backend/utils/time/combocid.cpp @@ -115,9 +115,8 @@ CommandId HeapTupleGetCmax(HeapTuple tup) HeapTupleHeader htup = tup->t_data; CommandId cid = HeapTupleHeaderGetRawCommandId(htup); - /* We do not store cmax when locking a tuple */ - Assert(!(htup->t_infomask & (HEAP_IS_LOCKED))); - Assert(TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(tup))); + Assert(!(htup->t_infomask & HEAP_MOVED)); + Assert(TransactionIdIsCurrentTransactionId(HeapTupleGetUpdateXid(tup))); if (htup->t_infomask & HEAP_COMBOCID) return GetRealCmax(cid); @@ -129,9 +128,8 @@ CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup, Page page) { CommandId cid = HeapTupleHeaderGetRawCommandId(tup); - /* We do not store cmax when locking a tuple */ - Assert(!(tup->t_infomask & (HEAP_MOVED | HEAP_IS_LOCKED))); - Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(page, tup))); + Assert(!(tup->t_infomask & HEAP_MOVED)); + Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(page, tup))); if (tup->t_infomask & HEAP_COMBOCID) return GetRealCmax(cid); diff --git a/src/gausskernel/optimizer/commands/analyze.cpp b/src/gausskernel/optimizer/commands/analyze.cpp index 0b7c3296d..5d0194532 100755 --- a/src/gausskernel/optimizer/commands/analyze.cpp +++ b/src/gausskernel/optimizer/commands/analyze.cpp @@ -80,6 +80,7 @@ #include "utils/timestamp.h" #include "tcop/utility.h" #include "tcop/dest.h" +#include "access/multixact.h" #ifdef PGXC #include "pgxc/pgxc.h" #endif @@ -2884,7 +2885,7 @@ retry: * pre-image but not the post-image.
We also get sane * results if the concurrent transaction never commits. */ - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(targpage, targtuple.t_data))) + if (TransactionIdIsCurrentTransactionId(HeapTupleGetUpdateXid(&targtuple))) deadrows += 1; else { sample_it = true; @@ -6775,7 +6776,8 @@ static void update_pages_and_tuples_pgclass(Relation onerel, VacuumStmt* vacstmt } Relation classRel = heap_open(RelationRelationId, RowExclusiveLock); - vac_update_relstats(onerel, classRel, updrelpages, totalrows, mapCont, hasindex, InvalidTransactionId); + vac_update_relstats(onerel, classRel, updrelpages, totalrows, mapCont, + hasindex, InvalidTransactionId, InvalidMultiXactId); heap_close(classRel, RowExclusiveLock); } @@ -6815,7 +6817,8 @@ static void update_pages_and_tuples_pgclass(Relation onerel, VacuumStmt* vacstmt (0 != vacstmt->pstGlobalStatEx[vacstmt->tableidx].totalRowCnts)) { nblocks = estimate_index_blocks(Irel[ind], totalindexrows, table_factor); Relation classRel = heap_open(RelationRelationId, RowExclusiveLock); - vac_update_relstats(Irel[ind], classRel, nblocks, totalindexrows, 0, false, BootstrapTransactionId); + vac_update_relstats(Irel[ind], classRel, nblocks, totalindexrows, 0, + false, BootstrapTransactionId, InvalidMultiXactId); heap_close(classRel, RowExclusiveLock); continue; } @@ -6827,7 +6830,8 @@ static void update_pages_and_tuples_pgclass(Relation onerel, VacuumStmt* vacstmt nblocks = GetOneRelNBlocks(onerel, Irel[ind], vacstmt, totalindexrows); Relation classRel = heap_open(RelationRelationId, RowExclusiveLock); - vac_update_relstats(Irel[ind], classRel, nblocks, totalindexrows, 0, false, InvalidTransactionId); + vac_update_relstats(Irel[ind], classRel, nblocks, totalindexrows, 0, + false, InvalidTransactionId, InvalidMultiXactId); heap_close(classRel, RowExclusiveLock); } } diff --git a/src/gausskernel/optimizer/commands/cluster.cpp b/src/gausskernel/optimizer/commands/cluster.cpp index 80218317f..125914e0c 100755 --- a/src/gausskernel/optimizer/commands/cluster.cpp +++ b/src/gausskernel/optimizer/commands/cluster.cpp @@ -30,6 +30,7 @@ #include "access/xact.h" #include "access/xlog.h" #include "access/sysattr.h" +#include "access/reloptions.h" #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/heap.h" @@ -75,6 +76,7 @@ #include "gstrace/gstrace_infra.h" #include "gstrace/commands_gstrace.h" #include "parser/parse_utilcmd.h" +#include "access/multixact.h" #ifdef ENABLE_MULTIPLE_NODES #include "tsdb/storage/part_merge.h" #include "tsdb/utils/ts_relcache.h" @@ -125,7 +127,8 @@ extern DfsSrvOptions* GetDfsSrvOptions(Oid spcNode); static void swap_relation_names(Oid r1, Oid r2); static void swapCascadeHeapTables( - Oid relId1, Oid relId2, Oid tempTableOid, bool swapByContent, TransactionId frozenXid, Oid* mappedTables); + Oid relId1, Oid relId2, Oid tempTableOid, bool swapByContent, TransactionId frozenXid, + MultiXactId multiXid, Oid* mappedTables); static void SwapCStoreTables(Oid relId1, Oid relId2, Oid parentOid, Oid tempTableOid); @@ -139,7 +142,7 @@ static void rebuildPartition(Relation partTableRel, Oid partitionOid, Oid indexO static void copyPartitionHeapData(Relation newHeap, Relation oldHeap, Oid indexOid, PlannerInfo* root, RelOptInfo* relOptInfo, int freezeMinAge, int freezeTableAge, bool verbose, bool* pSwapToastByContent, - TransactionId* pFreezeXid, AdaptMem* mem_info, double* ptrDeleteTupleNum = NULL); + TransactionId* pFreezeXid, MultiXactId* pFreezeMulti, AdaptMem* mem_info, double* ptrDeleteTupleNum = NULL);
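Taken together, these declaration changes thread a multixact freeze horizon through every relfilenode swap, next to the existing xid horizon. A condensed sketch of the resulting call shape in rebuild_relation (paraphrased from the hunks below, not verbatim; all names are from this patch):

    TransactionId frozenXid = InvalidTransactionId;
    MultiXactId multiXid = InvalidMultiXactId;

    /* The copy phase fills in both horizons... */
    copy_heap_data(OIDNewHeap, tableOid, indexOid, freeze_min_age, freeze_table_age,
                   verbose, &swap_toast_by_content, &frozenXid, &multiXid,
                   &deleteTupleNum, memUsage);

    /* ...and the swap phase records them in pg_class; column stores carry
     * no multixact state and substitute InvalidMultiXactId instead. */
    finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, swap_toast_by_content,
                     false, frozenXid, multiXid, memUsage);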
static void CopyCStoreData(Relation oldRel, Relation newRel, int freeze_min_age, int freeze_table_age, bool verbose, bool* pSwapToastByContent, TransactionId* pFreezeXid, AdaptMem* mem_info); static void DoCopyPaxFormatData(Relation oldRel, Relation newRel); @@ -148,7 +151,8 @@ static void DoCopyCUFormatData(Relation oldRel, Relation newRel, TupleDesc oldTu static List* FindMergedDescs(Relation oldRel, Relation newRel); extern ValuePartitionMap* buildValuePartitionMap(Relation relation, Relation pg_partition, HeapTuple partitioned_tuple); static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int freeze_min_age, int freeze_table_age, - bool verbose, bool* pSwapToastByContent, TransactionId* pFreezeXid, double* ptrDeleteTupleNum, AdaptMem* mem_info); + bool verbose, bool* pSwapToastByContent, TransactionId* pFreezeXid, MultiXactId *pFreezeMulti, + double* ptrDeleteTupleNum, AdaptMem* mem_info); static List* get_tables_to_cluster(MemoryContext cluster_context); static void reform_and_rewrite_tuple(HeapTuple tuple, TupleDesc oldTupDesc, TupleDesc newTupDesc, Datum* values, bool* isnull, bool newRelHasOids, RewriteState rwstate); @@ -163,8 +167,8 @@ static void RebuildCStoreRelation( extern void Start_Prefetch(TableScanDesc scan, SeqScanAccessor* pAccessor, ScanDirection dir); extern void SeqScan_Init(TableScanDesc Scan, SeqScanAccessor* pAccessor, Relation relation); -static void swap_partition_relfilenode( - Oid partitionOid1, Oid partitionOid2, bool swapToastByContent, TransactionId frozenXid, Oid* mappedTables); +static void swap_partition_relfilenode( Oid partitionOid1, Oid partitionOid2, bool swapToastByContent, + TransactionId frozenXid, MultiXactId multiXid, Oid* mappedTables); static void partition_relfilenode_swap(Oid OIDOldHeap, Oid OIDNewHeap, uint8 needSwitch); static void relfilenode_swap(Oid OIDOldHeap, Oid OIDNewHeap, uint8 needSwitch); #ifdef ENABLE_MULTIPLE_NODES @@ -775,6 +779,7 @@ static void rebuild_relation( TransactionId frozenXid = InvalidTransactionId; double deleteTupleNum = 0; bool is_shared = OldHeap->rd_rel->relisshared; + MultiXactId multiXid; /* Mark the correct index as clustered */ if (OidIsValid(indexOid)) @@ -798,6 +803,7 @@ static void rebuild_relation( verbose, &swap_toast_by_content, &frozenXid, + &multiXid, &deleteTupleNum, memUsage); @@ -814,7 +820,8 @@ static void rebuild_relation( * Swap the physical files of the target and transient tables, then * rebuild the target's indexes and throw away the transient table. 
*/ - finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, swap_toast_by_content, false, frozenXid, memUsage); + finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog, swap_toast_by_content, + false, frozenXid, multiXid, memUsage); /* report vacuum full stat to PgStatCollector */ pgstat_report_vacuum(tableOid, InvalidOid, is_shared, deleteTupleNum); @@ -822,10 +829,8 @@ static void rebuild_relation( clearAttrInitDefVal(tableOid); } -TransactionId getPartitionRelfrozenxid(Relation ordTableRel) +void getPartitionRelxids(Relation ordTableRel, TransactionId* frozenXid, MultiXactId* multiXid) { - bool relfrozenxid_isNull = true; - TransactionId relfrozenxid = InvalidTransactionId; Relation rel = heap_open(PartitionRelationId, AccessShareLock); HeapTuple tuple = SearchSysCacheCopy1(PARTRELID, ObjectIdGetDatum(RelationGetRelid(ordTableRel))); if (!HeapTupleIsValid(tuple)) { @@ -833,28 +838,35 @@ TransactionId getPartitionRelfrozenxid(Relation ordTableRel) (errcode(ERRCODE_UNDEFINED_TABLE), errmsg("cache lookup failed for relation %u", RelationGetRelid(ordTableRel)))); } + bool isNull = true; Datum xid64datum = - tableam_tops_tuple_getattr(tuple, Anum_pg_partition_relfrozenxid64, RelationGetDescr(rel), &relfrozenxid_isNull); - heap_close(rel, AccessShareLock); - heap_freetuple(tuple); + tableam_tops_tuple_getattr(tuple, Anum_pg_partition_relfrozenxid64, RelationGetDescr(rel), &isNull); - if (relfrozenxid_isNull) { - relfrozenxid = ordTableRel->rd_rel->relfrozenxid; + if (isNull) { + *frozenXid = ordTableRel->rd_rel->relfrozenxid; - if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->nextXid, relfrozenxid) || - !TransactionIdIsNormal(relfrozenxid)) - relfrozenxid = FirstNormalTransactionId; + if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->nextXid, *frozenXid) || + !TransactionIdIsNormal(*frozenXid)) + *frozenXid = FirstNormalTransactionId; } else { - relfrozenxid = DatumGetTransactionId(xid64datum); + *frozenXid = DatumGetTransactionId(xid64datum); } - return relfrozenxid; +#ifndef ENABLE_MULTIPLE_NODES + if (multiXid != NULL) { + xid64datum = + tableam_tops_tuple_getattr(tuple, Anum_pg_partition_relminmxid, RelationGetDescr(rel), &isNull); + *multiXid = isNull ? 
InvalidMultiXactId : DatumGetTransactionId(xid64datum); + } +#endif + + heap_close(rel, AccessShareLock); + heap_freetuple(tuple); } -TransactionId getRelationRelfrozenxid(Relation ordTableRel) +void getRelationRelxids(Relation ordTableRel, TransactionId* frozenXid, MultiXactId* multiXid) { - bool relfrozenxid_isNull = true; - TransactionId relfrozenxid = InvalidTransactionId; + bool isNull = true; Relation rel = heap_open(RelationRelationId, AccessShareLock); HeapTuple tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(RelationGetRelid(ordTableRel))); if (!HeapTupleIsValid(tuple)) { @@ -862,21 +874,27 @@ TransactionId getRelationRelfrozenxid(Relation ordTableRel) (errcode(ERRCODE_UNDEFINED_TABLE), errmsg("cache lookup failed for relation %u", RelationGetRelid(ordTableRel)))); } - Datum xid64datum = tableam_tops_tuple_getattr(tuple, Anum_pg_class_relfrozenxid64, RelationGetDescr(rel), &relfrozenxid_isNull); - heap_close(rel, AccessShareLock); - heap_freetuple(tuple); + Datum xid64datum = tableam_tops_tuple_getattr(tuple, Anum_pg_class_relfrozenxid64, RelationGetDescr(rel), &isNull); - if (relfrozenxid_isNull) { - relfrozenxid = ordTableRel->rd_rel->relfrozenxid; + if (isNull) { + *frozenXid = ordTableRel->rd_rel->relfrozenxid; - if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->nextXid, relfrozenxid) || - !TransactionIdIsNormal(relfrozenxid)) - relfrozenxid = FirstNormalTransactionId; + if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->nextXid, *frozenXid) || + !TransactionIdIsNormal(*frozenXid)) + *frozenXid = FirstNormalTransactionId; } else { - relfrozenxid = DatumGetTransactionId(xid64datum); + *frozenXid = DatumGetTransactionId(xid64datum); } - return relfrozenxid; +#ifndef ENABLE_MULTIPLE_NODES + if (multiXid != NULL) { + xid64datum = tableam_tops_tuple_getattr(tuple, Anum_pg_class_relminmxid, RelationGetDescr(rel), &isNull); + *multiXid = isNull ? 
InvalidMultiXactId : DatumGetTransactionId(xid64datum); + } +#endif + + heap_close(rel, AccessShareLock); + heap_freetuple(tuple); } /* @@ -895,6 +913,7 @@ static void rebuildPartitionedTable( Oid OIDNewHeap = InvalidOid; bool swapToastByContent = false; TransactionId* frozenXid = NULL; + MultiXactId* multiXid = NULL; TupleDesc partTabHeapDesc; HeapTuple tuple = NULL; @@ -922,6 +941,7 @@ static void rebuildPartitionedTable( int OIDNewHeapArrayLen = 0; int pos = 0; int loc = 0; + int temp = 0; /* Mark the correct index as clustered */ if (OidIsValid(indexOid)) { @@ -976,6 +996,7 @@ static void rebuildPartitionedTable( OIDNewHeapArrayLen = list_length(partitions); OIDNewHeapArray = (Oid*)palloc(sizeof(Oid) * OIDNewHeapArrayLen); frozenXid = (TransactionId*)palloc(sizeof(TransactionId) * OIDNewHeapArrayLen); + multiXid = (MultiXactId*)palloc(sizeof(MultiXactId) * OIDNewHeapArrayLen); foreach (cell, partitions) { partition = (Partition)lfirst(cell); partRel = partitionGetRelation(partTableRel, partition); @@ -1022,7 +1043,8 @@ static void rebuildPartitionedTable( &swapToastByContent, &frozenXid[loc++], memUsage); - else + else { + temp = loc++; copyPartitionHeapData(newHeap, partRel, indexOid, @@ -1032,9 +1054,11 @@ static void rebuildPartitionedTable( freezeTableAge, verbose, &swapToastByContent, - &frozenXid[loc++], + &frozenXid[temp], + &multiXid[temp], memUsage, &deleteTuplesNum); + } heap_close(newHeap, NoLock); releaseDummyRelation(&partRel); @@ -1060,8 +1084,10 @@ static void rebuildPartitionedTable( partRel = partitionGetRelation(partTableRel, partition); OIDNewHeap = OIDNewHeapArray[pos++]; + temp = loc++; /* swap the temp table and partition */ - finishPartitionHeapSwap(partRel->rd_id, OIDNewHeap, swapToastByContent, frozenXid[loc++]); + finishPartitionHeapSwap(partRel->rd_id, OIDNewHeap, swapToastByContent, frozenXid[temp], + isCStore ? InvalidMultiXactId : multiXid[temp]); /* release this partition relation. */ releaseDummyRelation(&partRel); @@ -1106,6 +1132,7 @@ static void rebuildPartition(Relation partTableRel, Oid partitionOid, Oid indexO Oid OIDNewHeap = InvalidOid; bool swapToastByContent = false; TransactionId frozenXid = InvalidTransactionId; + MultiXactId multiXid = InvalidMultiXactId; bool isCStore = RelationIsColStore(partTableRel); TupleDesc partTabHeapDesc; @@ -1234,6 +1261,7 @@ static void rebuildPartition(Relation partTableRel, Oid partitionOid, Oid indexO verbose, &swapToastByContent, &frozenXid, + &multiXid, memUsage, &deleteTuplesNum); } @@ -1266,7 +1294,7 @@ static void rebuildPartition(Relation partTableRel, Oid partitionOid, Oid indexO */ TransferPredicateLocksToHeapRelation(partRel); /* swap the temp table and partition */ - finishPartitionHeapSwap(partRel->rd_id, OIDNewHeap, swapToastByContent, frozenXid); + finishPartitionHeapSwap(partRel->rd_id, OIDNewHeap, swapToastByContent, frozenXid, multiXid); /* rebuild index of partition table */ reindexFlags = REINDEX_REL_SUPPRESS_INDEX_USE; (void)reindexPartition(RelationGetRelid(partTableRel), partitionOid, reindexFlags, REINDEX_ALL_INDEX); @@ -2049,7 +2077,7 @@ double copy_heap_data_internal(Relation OldHeap, Relation OldIndex, Relation New * case we had better copy it. 
                 */
                if (!is_system_catalog &&
-                    !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(page, tuple->t_data)))
+                    !TransactionIdIsCurrentTransactionId(HeapTupleGetUpdateXid(tuple)))
                    ereport(messageLevel,
                        (errcode(ERRCODE_OBJECT_IN_USE),
                            errmsg("concurrent insert in progress within table \"%s\"", RelationGetRelationName(OldHeap))));
@@ -2180,11 +2208,13 @@ double copy_heap_data_internal(Relation OldHeap, Relation OldIndex, Relation New
 * *pFreezeXid receives the TransactionId used as freeze cutoff point.
 */
 static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int freeze_min_age, int freeze_table_age,
-    bool verbose, bool* pSwapToastByContent, TransactionId* pFreezeXid, double* ptrDeleteTupleNum, AdaptMem* memUsage)
+    bool verbose, bool* pSwapToastByContent, TransactionId* pFreezeXid, MultiXactId *pFreezeMulti,
+    double* ptrDeleteTupleNum, AdaptMem* memUsage)
 {
     Relation NewHeap, OldHeap, OldIndex;
     TransactionId OldestXmin;
     TransactionId FreezeXid;
+    MultiXactId MultiXactFrzLimit;
     bool use_sort = false;
     double tups_vacuumed = 0;
     bool isGtt = false;
@@ -2259,7 +2289,8 @@ static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int
     * freeze_min_age to avoid having CLUSTER freeze tuples earlier than a
     * plain VACUUM would.
     */
-    vacuum_set_xid_limits(OldHeap, 0, freeze_table_age, &OldestXmin, &FreezeXid, NULL);
+
+    vacuum_set_xid_limits(OldHeap, 0, freeze_table_age, &OldestXmin, &FreezeXid, NULL, &MultiXactFrzLimit);
 
     /*
     * FreezeXid will become the table's new relfrozenxid, and that mustn't go
@@ -2272,6 +2303,7 @@ static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int
     } else {
         bool isNull = false;
         TransactionId relfrozenxid;
+        MultiXactId relminmxid;
         Relation rel = heap_open(RelationRelationId, AccessShareLock);
         HeapTuple tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
         if (!HeapTupleIsValid(tuple)) {
@@ -2280,8 +2312,6 @@ static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int
                 errmsg("cache lookup failed for relation %u", RelationGetRelid(OldHeap))));
         }
         Datum xid64datum = tableam_tops_tuple_getattr(tuple, Anum_pg_class_relfrozenxid64, RelationGetDescr(rel), &isNull);
-        heap_close(rel, AccessShareLock);
-        heap_freetuple(tuple);
 
         if (isNull) {
             relfrozenxid = OldHeap->rd_rel->relfrozenxid;
@@ -2300,6 +2330,16 @@ static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int
         if (TransactionIdPrecedes(FreezeXid, relfrozenxid)) {
             FreezeXid = relfrozenxid;
         }
+#ifndef ENABLE_MULTIPLE_NODES
+        Datum minmxidDatum = tableam_tops_tuple_getattr(tuple, Anum_pg_class_relminmxid, RelationGetDescr(rel), &isNull);
+        relminmxid = isNull ? InvalidMultiXactId : DatumGetTransactionId(minmxidDatum);
+
+        if (MultiXactIdIsValid(relminmxid) && MultiXactIdPrecedes(MultiXactFrzLimit, relminmxid)) {
+            MultiXactFrzLimit = relminmxid;
+        }
+#endif
+        heap_close(rel, AccessShareLock);
+        heap_freetuple(tuple);
        }
    } else { /* We will eventually freeze all tuples of ustore tables here.
@@ -2310,6 +2350,7 @@ static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int /* return selected value to caller */ *pFreezeXid = FreezeXid; + *pFreezeMulti = MultiXactFrzLimit; /* * Decide whether to use an indexscan or seqscan-and-optional-sort to scan @@ -2414,15 +2455,17 @@ static Relation GetPartitionIndexRel( */ static void copyPartitionHeapData(Relation newHeap, Relation oldHeap, Oid indexOid, PlannerInfo* root, RelOptInfo* relOptInfo, int freezeMinAge, int freezeTableAge, bool verbose, bool* pSwapToastByContent, - TransactionId* pFreezeXid, AdaptMem* memUsage, double* ptrDeleteTupleNum) + TransactionId* pFreezeXid, MultiXactId* pFreezeMulti, AdaptMem* memUsage, double* ptrDeleteTupleNum) { Relation oldIndex = NULL; TransactionId oldestXmin = 0; TransactionId freezeXid = 0; + MultiXactId freezeMulti = 0; bool useSort = false; Relation partTabIndexRel = NULL; Partition partIndexRel = NULL; TransactionId relfrozenxid = InvalidTransactionId; + MultiXactId relfrozenmxid = InvalidMultiXactId; double tups_vacuumed = 0; *pSwapToastByContent = false; @@ -2448,25 +2491,30 @@ static void copyPartitionHeapData(Relation newHeap, Relation oldHeap, Oid indexO /* * compute xids used to freeze and weed out dead tuples. */ - vacuum_set_xid_limits(oldHeap, 0, freezeTableAge, &oldestXmin, &freezeXid, NULL); + vacuum_set_xid_limits(oldHeap, 0, freezeTableAge, &oldestXmin, &freezeXid, NULL, &freezeMulti); /* * FreezeXid will become the table's new relfrozenxid, and that mustn't go * backwards, so take the max. */ - relfrozenxid = getPartitionRelfrozenxid(oldHeap); + getPartitionRelxids(oldHeap, &relfrozenxid, &relfrozenmxid); if (TransactionIdPrecedes(freezeXid, relfrozenxid)) freezeXid = relfrozenxid; + + if (MultiXactIdPrecedes(freezeMulti, relfrozenmxid)) + freezeMulti = relfrozenmxid; } else { /* We will eventually freeze all tuples of ustore tables here. * Hence freeze xid should be CurrentTransactionId */ freezeXid = GetCurrentTransactionId(); + freezeMulti = GetOldestMultiXactId(); } /* return selected value to caller */ *pFreezeXid = freezeXid; + *pFreezeMulti = freezeMulti; /* * Decide whether to use an indexscan or seqscan-and-optional-sort to scan @@ -2546,7 +2594,8 @@ static void copyPartitionHeapData(Relation newHeap, Relation oldHeap, Oid indexO * having to look the information up again later in finish_heap_swap. */ static void swap_relation_files( - Oid r1, Oid r2, bool target_is_pg_class, bool swap_toast_by_content, TransactionId frozenXid, Oid* mapped_tables) + Oid r1, Oid r2, bool target_is_pg_class, bool swap_toast_by_content, TransactionId frozenXid, + MultiXactId frozenMulti, Oid* mapped_tables) { Relation relRelation; HeapTuple reltup1, reltup2; @@ -2675,7 +2724,7 @@ static void swap_relation_files( * mapped catalog, because it's possible that we'll commit the map change * and then fail to commit the pg_class update. 
* - * set rel1's frozen Xid + * set rel1's frozen Xid and minimum MultiXid */ nctup = NULL; if (relform1->relkind != RELKIND_INDEX && relform1->relkind != RELKIND_GLOBAL_INDEX) { @@ -2697,6 +2746,11 @@ static void swap_relation_files( replaces[Anum_pg_class_relfrozenxid64 - 1] = true; values[Anum_pg_class_relfrozenxid64 - 1] = TransactionIdGetDatum(frozenXid); +#ifndef ENABLE_MULTIPLE_NODES + replaces[Anum_pg_class_relminmxid - 1] = true; + values[Anum_pg_class_relminmxid - 1] = TransactionIdGetDatum(frozenMulti); +#endif + nctup = (HeapTuple) tableam_tops_modify_tuple(reltup1, RelationGetDescr(relRelation), values, nulls, replaces); relform1 = (Form_pg_class)GETSTRUCT(nctup); @@ -2761,6 +2815,7 @@ static void swap_relation_files( target_is_pg_class, swap_toast_by_content, frozenXid, + frozenMulti, mapped_tables); } else { /* caller messed up */ @@ -2851,6 +2906,7 @@ static void swap_relation_files( target_is_pg_class, swap_toast_by_content, InvalidTransactionId, + InvalidMultiXactId, mapped_tables); /* Clean up. */ if (nctup) @@ -2929,7 +2985,8 @@ static void swap_relation_names(Oid r1, Oid r2) * Output : NA */ static void swapPartitionfiles( - Oid partitionOid, Oid tempTableOid, bool swapToastByContent, TransactionId frozenXid, Oid* mappedTables) + Oid partitionOid, Oid tempTableOid, bool swapToastByContent, TransactionId frozenXid, + MultiXactId multiXid, Oid* mappedTables) { Relation relRelation1 = NULL; Relation relRelation2 = NULL; @@ -3010,6 +3067,11 @@ static void swapPartitionfiles( replaces[Anum_pg_partition_relfrozenxid64 - 1] = true; values[Anum_pg_partition_relfrozenxid64 - 1] = TransactionIdGetDatum(frozenXid); +#ifndef ENABLE_MULTIPLE_NODES + replaces[Anum_pg_partition_relminmxid - 1] = true; + values[Anum_pg_partition_relminmxid - 1] = TransactionIdGetDatum(multiXid); +#endif + ntup = (HeapTuple) tableam_tops_modify_tuple(reltup1, RelationGetDescr(relRelation1), values, nulls, replaces); relform1 = (Form_pg_partition)GETSTRUCT(ntup); @@ -3057,7 +3119,8 @@ static void swapPartitionfiles( * deal with them too. */ swapCascadeHeapTables( - relform1->reltoastrelid, relform2->reltoastrelid, tempTableOid, swapToastByContent, frozenXid, mappedTables); + relform1->reltoastrelid, relform2->reltoastrelid, tempTableOid, swapToastByContent, frozenXid, + multiXid, mappedTables); SwapCStoreTables(relform1->relcudescrelid, relform2->relcudescrelid, InvalidOid, tempTableOid); SwapCStoreTables(relform1->reldeltarelid, relform2->reldeltarelid, InvalidOid, tempTableOid); @@ -3071,6 +3134,7 @@ static void swapPartitionfiles( false, swapToastByContent, InvalidTransactionId, + InvalidMultiXactId, mappedTables); /* Clean up. */ @@ -3090,13 +3154,14 @@ static void swapPartitionfiles( } static void swapCascadeHeapTables( - Oid relId1, Oid relId2, Oid tempTableOid, bool swapByContent, TransactionId frozenXid, Oid* mappedTables) + Oid relId1, Oid relId2, Oid tempTableOid, bool swapByContent, TransactionId frozenXid, + MultiXactId multiXid, Oid* mappedTables) { if (relId1 || relId2) { if (swapByContent) { if (relId1 && relId2) { /* Recursively swap the contents of the toast tables */ - swap_relation_files(relId1, relId2, false, swapByContent, frozenXid, mappedTables); + swap_relation_files(relId1, relId2, false, swapByContent, frozenXid, multiXid, mappedTables); } else { /* caller messed up */ ereport(ERROR, @@ -3219,7 +3284,7 @@ static void SwapCStoreTables(Oid relId1, Oid relId2, Oid parentOid, Oid tempTabl * cleaning up (including rebuilding all indexes on the old heap). 
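+ * frozenMulti is installed as the surviving relation's new
+ * pg_class.relminmxid, in the same way that frozenXid becomes its new
+ * relfrozenxid64.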
*/ void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool is_system_catalog, bool swap_toast_by_content, - bool checkConstraints, TransactionId frozenXid, AdaptMem* memInfo) + bool checkConstraints, TransactionId frozenXid, MultiXactId frozenMulti, AdaptMem* memInfo) { ObjectAddress object; Oid mapped_tables[4]; @@ -3240,7 +3305,7 @@ void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool is_system_catalog, bo GttSwapRelationFiles(OIDOldHeap, OIDNewHeap); } else { swap_relation_files(OIDOldHeap, OIDNewHeap, (OIDOldHeap == RelationRelationId), - swap_toast_by_content, frozenXid, mapped_tables); + swap_toast_by_content, frozenXid, frozenMulti, mapped_tables); } /* @@ -3351,7 +3416,8 @@ void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool is_system_catalog, bo * Output : NA */ void finishPartitionHeapSwap( - Oid partitionOid, Oid tempTableOid, bool swapToastByContent, TransactionId frozenXid, bool tempTableIsPartition) + Oid partitionOid, Oid tempTableOid, bool swapToastByContent, TransactionId frozenXid, + MultiXactId multiXid, bool tempTableIsPartition) { Oid mapped_tables[4]; int i = 0; @@ -3367,10 +3433,10 @@ void finishPartitionHeapSwap( */ if (tempTableIsPartition) { /* For redistribution, exchange meta info between two partitions */ - swap_partition_relfilenode(partitionOid, tempTableOid, swapToastByContent, frozenXid, mapped_tables); + swap_partition_relfilenode(partitionOid, tempTableOid, swapToastByContent, frozenXid, multiXid, mapped_tables); } else { /* For alter table exchange, between partition and a normal table */ - swapPartitionfiles(partitionOid, tempTableOid, swapToastByContent, frozenXid, mapped_tables); + swapPartitionfiles(partitionOid, tempTableOid, swapToastByContent, frozenXid, multiXid, mapped_tables); } /* @@ -3782,6 +3848,7 @@ static void rebuildPartVacFull(Relation oldHeap, Oid partOid, int freezeMinAge, Oid OIDNewHeap = InvalidOid; bool swapToastByContent = false; TransactionId frozenXid = InvalidTransactionId; + MultiXactId multiXid = InvalidMultiXactId; TupleDesc partTabHeapDesc; HeapTuple tuple = NULL; Datum partTabRelOptions = 0; @@ -3860,6 +3927,7 @@ static void rebuildPartVacFull(Relation oldHeap, Oid partOid, int freezeMinAge, verbose, &swapToastByContent, &frozenXid, + &multiXid, &vacstmt->memUsage, &deleteTupleNum); } @@ -3872,7 +3940,7 @@ static void rebuildPartVacFull(Relation oldHeap, Oid partOid, int freezeMinAge, * Swap the physical files of the target and transient tables, then * rebuild the target's indexes and throw away the transient table. */ - finishPartitionHeapSwap(partRel->rd_id, OIDNewHeap, swapToastByContent, frozenXid); + finishPartitionHeapSwap(partRel->rd_id, OIDNewHeap, swapToastByContent, frozenXid, multiXid); /* Close relcache entry, but keep lock until transaction commit */ releaseDummyRelation(&partRel); @@ -3906,7 +3974,7 @@ static void rebuildPartVacFull(Relation oldHeap, Oid partOid, int freezeMinAge, */ partTable = try_relation_open(tableOid, AccessShareLock); /* Update reltuples and relpages in pg_class for partitioned table. */ - vac_update_pgclass_partitioned_table(partTable, partTable->rd_rel->relhasindex, frozenXid); + vac_update_pgclass_partitioned_table(partTable, partTable->rd_rel->relhasindex, frozenXid, multiXid); /* * report vacuum full stat to PgStatCollector. 
* For CStore table, we delete all invisible tuple, so dead tuple should be 0; and @@ -4000,7 +4068,7 @@ static void CopyCStoreData(Relation oldRel, Relation newRel, int freeze_min_age, * freeze_min_age to avoid having CLUSTER freeze tuples earlier than a * plain VACUUM would. **/ - vacuum_set_xid_limits(oldRel, freeze_min_age, freeze_table_age, &OldestXmin, &FreezeXid, NULL); + vacuum_set_xid_limits(oldRel, freeze_min_age, freeze_table_age, &OldestXmin, &FreezeXid, NULL, NULL); bool isNull = false; TransactionId relfrozenxid = InvalidTransactionId; Relation rel; @@ -4406,7 +4474,7 @@ static void RebuildCStoreRelation( LockRelationOid(tableOid, AccessExclusiveLock); /* swap relation files */ - finish_heap_swap(tableOid, oidNewHeap, false, swapToastByContent, false, frozenXid, mem_info); + finish_heap_swap(tableOid, oidNewHeap, false, swapToastByContent, false, frozenXid, InvalidMultiXactId, mem_info); /* * Report vacuum full stat to PgStatCollector. @@ -5293,7 +5361,8 @@ static Datum pgxc_parallel_execution(const char* query, ExecNodes* exec_nodes) * @return : None */ static void swap_partition_relfilenode( - Oid partitionOid1, Oid partitionOid2, bool swapToastByContent, TransactionId frozenXid, Oid* mappedTables) + Oid partitionOid1, Oid partitionOid2, bool swapToastByContent, TransactionId frozenXid, + MultiXactId multiXid, Oid* mappedTables) { Relation relRelation = NULL; HeapTuple reltup1 = NULL; @@ -5373,6 +5442,11 @@ static void swap_partition_relfilenode( replaces[Anum_pg_partition_relfrozenxid64 - 1] = true; values[Anum_pg_partition_relfrozenxid64 - 1] = TransactionIdGetDatum(frozenXid); +#ifndef ENABLE_MULTIPLE_NODES + replaces[Anum_pg_partition_relminmxid - 1] = true; + values[Anum_pg_partition_relminmxid - 1] = TransactionIdGetDatum(multiXid); +#endif + ntup = (HeapTuple) tableam_tops_modify_tuple(reltup1, RelationGetDescr(relRelation), values, nulls, replaces); relform1 = (Form_pg_partition)GETSTRUCT(ntup); @@ -5433,6 +5507,7 @@ static void swap_partition_relfilenode( false, swapToastByContent, InvalidTransactionId, + InvalidMultiXactId, mappedTables); /* Clean up. 
*/ @@ -5641,11 +5716,13 @@ static void PartitionRelfilenodeSwap( } Relation old_partRel = partitionGetRelation(oldHeap, old_partition); Relation new_partRel = partitionGetRelation(newHeap, new_partition); - TransactionId relfrozenxid = getPartitionRelfrozenxid(old_partRel); + TransactionId relfrozenxid = InvalidTransactionId; + MultiXactId relminmxid = InvalidMultiXactId; + getPartitionRelxids(old_partRel, &relfrozenxid, &relminmxid); /* Exchange two partition's meta information */ if (RelationIsIndex(oldHeap)) { - finishPartitionHeapSwap(old_partRel->rd_id, new_partRel->rd_id, false, relfrozenxid, true); + finishPartitionHeapSwap(old_partRel->rd_id, new_partRel->rd_id, false, relfrozenxid, relminmxid, true); } else { List* old_part_idx_list = PartitionGetPartIndexList(old_partition, true); List* new_part_idx_list = PartitionGetPartIndexList(new_partition, true); @@ -5771,6 +5848,7 @@ void relfilenode_swap(Oid OIDOldHeap, Oid OIDNewHeap, uint8 needSwitch) (OIDOldHeap == RelationRelationId), false, u_sess->utils_cxt.RecentGlobalXmin, + GetOldestMultiXactId(), mapped_tables); /* * Now we must remove any relation mapping entries that we set up for the diff --git a/src/gausskernel/optimizer/commands/dbcommands.cpp b/src/gausskernel/optimizer/commands/dbcommands.cpp index 04f6a68ec..8133b173c 100644 --- a/src/gausskernel/optimizer/commands/dbcommands.cpp +++ b/src/gausskernel/optimizer/commands/dbcommands.cpp @@ -31,6 +31,7 @@ #include "access/xact.h" #include "access/xloginsert.h" #include "access/xlogutils.h" +#include "access/multixact.h" #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/indexing.h" @@ -100,8 +101,8 @@ static void createdb_failure_callback(int code, Datum arg); static void movedb(const char* dbname, const char* tblspcname); static void movedb_failure_callback(int code, Datum arg); static bool get_db_info(const char* name, LOCKMODE lockmode, Oid* dbIdP, Oid* ownerIdP, int* encodingP, - bool* dbIsTemplateP, bool* dbAllowConnP, Oid* dbLastSysOidP, TransactionId* dbFrozenXidP, Oid* dbTablespace, - char** dbCollate, char** dbCtype, char** src_compatibility = NULL); + bool* dbIsTemplateP, bool* dbAllowConnP, Oid* dbLastSysOidP, TransactionId* dbFrozenXidP, MultiXactId *dbMinMultiP, + Oid* dbTablespace, char** dbCollate, char** dbCtype, char** src_compatibility = NULL); static void remove_dbtablespaces(Oid db_id); static bool check_db_file_conflict(Oid db_id); static void createdb_xact_callback(bool isCommit, const void* arg); @@ -152,6 +153,7 @@ void createdb(const CreatedbStmt* stmt) bool src_allowconn = false; Oid src_lastsysoid; TransactionId src_frozenxid; + MultiXactId src_minmxid; Oid src_deftablespace; volatile Oid dst_deftablespace; Relation pg_database_rel; @@ -324,6 +326,7 @@ void createdb(const CreatedbStmt* stmt) &src_allowconn, &src_lastsysoid, &src_frozenxid, + &src_minmxid, &src_deftablespace, &src_collate, &src_ctype, @@ -529,6 +532,9 @@ void createdb(const CreatedbStmt* stmt) */ new_record_nulls[Anum_pg_database_datacl - 1] = true; new_record[Anum_pg_database_datfrozenxid64 - 1] = TransactionIdGetDatum(src_frozenxid); +#ifndef ENABLE_MULTIPLE_NODES + new_record[Anum_pg_database_datminmxid - 1] = TransactionIdGetDatum(src_minmxid); +#endif tuple = heap_form_tuple(RelationGetDescr(pg_database_rel), new_record, new_record_nulls); HeapTupleSetOid(tuple, dboid); @@ -1001,7 +1007,8 @@ void dropdb(const char* dbname, bool missing_ok) pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info( - dbname, AccessExclusiveLock, 
&db_id, NULL, NULL, &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL)) { + dbname, AccessExclusiveLock, &db_id, NULL, NULL, &db_istemplate, + NULL, NULL, NULL, NULL, NULL, NULL, NULL)) { if (!missing_ok) { ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname))); } else { @@ -1211,7 +1218,7 @@ void RenameDatabase(const char* oldname, const char* newname) */ rel = heap_open(DatabaseRelationId, RowExclusiveLock); - if (!get_db_info(oldname, AccessExclusiveLock, &db_id, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) + if (!get_db_info(oldname, AccessExclusiveLock, &db_id, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", oldname))); /* Permission check. */ @@ -1356,7 +1363,7 @@ static void movedb(const char* dbname, const char* tblspcname) pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock); if (!get_db_info( - dbname, AccessExclusiveLock, &db_id, NULL, NULL, NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL)) + dbname, AccessExclusiveLock, &db_id, NULL, NULL, NULL, NULL, NULL, NULL, NULL, &src_tblspcoid, NULL, NULL)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname))); /* @@ -1925,8 +1932,8 @@ void AlterDatabaseOwner(const char* dbname, Oid newOwnerId) * return FALSE. */ static bool get_db_info(const char* name, LOCKMODE lockmode, Oid* dbIdP, Oid* ownerIdP, int* encodingP, - bool* dbIsTemplateP, bool* dbAllowConnP, Oid* dbLastSysOidP, TransactionId* dbFrozenXidP, Oid* dbTablespace, - char** dbCollate, char** dbCtype, char** dbcompatibility) + bool* dbIsTemplateP, bool* dbAllowConnP, Oid* dbLastSysOidP, TransactionId* dbFrozenXidP, MultiXactId *dbMinMultiP, + Oid* dbTablespace, char** dbCollate, char** dbCtype, char** dbcompatibility) { bool result = false; Relation relation; @@ -2019,6 +2026,15 @@ static bool get_db_info(const char* name, LOCKMODE lockmode, Oid* dbIdP, Oid* ow *dbFrozenXidP = datfrozenxid; } +#ifndef ENABLE_MULTIPLE_NODES + /* limit of frozen Multixacts */ + if (dbMinMultiP != NULL) { + bool isNull = false; + Datum minmxidDatum = + heap_getattr(tuple, Anum_pg_database_datminmxid, RelationGetDescr(relation), &isNull); + *dbMinMultiP = isNull ? FirstMultiXactId : DatumGetTransactionId(minmxidDatum); + } +#endif /* default tablespace for this database */ if (dbTablespace != NULL) *dbTablespace = dbform->dattablespace; diff --git a/src/gausskernel/optimizer/commands/matview.cpp b/src/gausskernel/optimizer/commands/matview.cpp index e36a84fa9..f842b4092 100755 --- a/src/gausskernel/optimizer/commands/matview.cpp +++ b/src/gausskernel/optimizer/commands/matview.cpp @@ -1068,7 +1068,7 @@ void ExecRefreshCtasMatViewAll(RefreshMatViewStmt *stmt, const char *queryString * Swap the physical files of the target and transient tables, then * rebuild the target's indexes and throw away the transient table. 
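+ * GetOldestMultiXactId() becomes the matview's new relminmxid here, the
+ * same starting value a freshly created row-store heap receives.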
*/ - finish_heap_swap(matviewOid, OIDNewHeap, false, false, true, u_sess->utils_cxt.RecentXmin); + finish_heap_swap(matviewOid, OIDNewHeap, false, false, true, u_sess->utils_cxt.RecentXmin, GetOldestMultiXactId()); RelationCacheInvalidateEntry(matviewOid); } diff --git a/src/gausskernel/optimizer/commands/sequence.cpp b/src/gausskernel/optimizer/commands/sequence.cpp index de1639170..e53031b63 100644 --- a/src/gausskernel/optimizer/commands/sequence.cpp +++ b/src/gausskernel/optimizer/commands/sequence.cpp @@ -1059,7 +1059,7 @@ void ResetSequence(Oid seq_relid) * Create a new storage file for the sequence. We want to keep the * sequence's relfrozenxid at 0, since it won't contain any unfrozen XIDs. */ - RelationSetNewRelfilenode(seq_rel, InvalidTransactionId); + RelationSetNewRelfilenode(seq_rel, InvalidTransactionId, InvalidMultiXactId); /* * Insert the modified tuple into the new storage file. @@ -1274,7 +1274,7 @@ void AlterSequence(AlterSeqStmt* stmt) * changes transactional. We want to keep the sequence's relfrozenxid * at 0, since it won't contain any unfrozen XIDs. */ - RelationSetNewRelfilenode(seqrel, InvalidTransactionId); + RelationSetNewRelfilenode(seqrel, InvalidTransactionId, InvalidMultiXactId); /* * Insert the modified tuple into the new storage file. */ @@ -1904,6 +1904,7 @@ static Form_pg_sequence read_seq_tuple(SeqTable elm, Relation rel, Buffer* buf, * bit update, ie, don't bother to WAL-log it, since we can certainly do * this again if the update gets lost. */ + Assert(!(seqtuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); if (HeapTupleGetRawXmax(seqtuple) != InvalidTransactionId) { HeapTupleSetXmax(seqtuple, InvalidTransactionId); seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED; diff --git a/src/gausskernel/optimizer/commands/tablecmds.cpp b/src/gausskernel/optimizer/commands/tablecmds.cpp index 699162033..2ad03f063 100644 --- a/src/gausskernel/optimizer/commands/tablecmds.cpp +++ b/src/gausskernel/optimizer/commands/tablecmds.cpp @@ -35,6 +35,7 @@ #include "access/tableam.h" #include "access/ustore/knl_uheap.h" #include "access/ustore/knl_uscan.h" +#include "access/multixact.h" #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/dfsstore_ctlg.h" @@ -197,7 +198,7 @@ #include "pgxc/redistrib.h" extern void vacuum_set_xid_limits(Relation rel, int64 freeze_min_age, int64 freeze_table_age, TransactionId* oldestXmin, - TransactionId* freezeLimit, TransactionId* freezeTableLimit); + TransactionId* freezeLimit, TransactionId* freezeTableLimit, MultiXactId* multiXactFrzLimit); /* * ON COMMIT action list @@ -679,7 +680,7 @@ static void AlterPartitionedSetWaitCleanGPI(bool alterGPI, Relation partTableRel static Oid AddTemporaryRangePartitionForAlterPartitions(const AlterTableCmd* cmd, Relation partTableRel, int sequence, bool* renameTargetPart); static void ExchangePartitionWithGPI(const AlterTableCmd* cmd, Relation partTableRel, Oid srcPartOid, - TransactionId frozenXid); + TransactionId frozenXid, MultiXactId multiXid); static void fastAddPartition(Relation partTableRel, List* destPartDefList, List** newPartOidList); static void readTuplesAndInsert(Relation tempTableRel, Relation partTableRel); static Oid createTempTableForPartition(Relation partTableRel, Partition part); @@ -3988,6 +3989,7 @@ void ExecuteTruncate(TruncateStmt* stmt) Oid heap_relid; Oid toast_relid; bool is_shared = rel->rd_rel->relisshared; + MultiXactId minmulti; /* * This effectively deletes all rows in the table, and may be done * in a serializable transaction. 
In that case we must record a @@ -4000,6 +4002,8 @@ void ExecuteTruncate(TruncateStmt* stmt) continue; } + minmulti = GetOldestMultiXactId(); + #ifdef ENABLE_MOT if (RelationIsForeignTable(rel) && isMOTFromTblOid(RelationGetRelid(rel))) { FdwRoutine* fdwroutine = GetFdwRoutineByRelId(RelationGetRelid(rel)); @@ -4038,7 +4042,9 @@ void ExecuteTruncate(TruncateStmt* stmt) } } - RelationSetNewRelfilenode(rel, u_sess->utils_cxt.RecentXmin, isDfsTruncate); + RelationSetNewRelfilenode(rel, u_sess->utils_cxt.RecentXmin, + RelationIsColStore(rel) ? InvalidMultiXactId : minmulti, + isDfsTruncate); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) heap_create_init_fork(rel); @@ -4051,7 +4057,7 @@ void ExecuteTruncate(TruncateStmt* stmt) */ if (OidIsValid(toast_relid)) { rel = relation_open(toast_relid, AccessExclusiveLock); - RelationSetNewRelfilenode(rel, u_sess->utils_cxt.RecentXmin); + RelationSetNewRelfilenode(rel, u_sess->utils_cxt.RecentXmin, minmulti); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) heap_create_init_fork(rel); heap_close(rel, NoLock); @@ -4082,13 +4088,14 @@ void ExecuteTruncate(TruncateStmt* stmt) Oid partOid = HeapTupleGetOid(tup); Partition p = partitionOpen(rel, partOid, AccessExclusiveLock); - PartitionSetNewRelfilenode(rel, p, u_sess->utils_cxt.RecentXmin); + PartitionSetNewRelfilenode(rel, p, u_sess->utils_cxt.RecentXmin, + RelationIsColStore(rel) ? InvalidMultiXactId : minmulti); /* process the toast table */ if (OidIsValid(toastOid)) { Assert(rel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED); toastRel = heap_open(toastOid, AccessExclusiveLock); - RelationSetNewRelfilenode(toastRel, u_sess->utils_cxt.RecentXmin); + RelationSetNewRelfilenode(toastRel, u_sess->utils_cxt.RecentXmin, minmulti); heap_close(toastRel, NoLock); } partitionClose(rel, p, NoLock); @@ -4097,7 +4104,9 @@ void ExecuteTruncate(TruncateStmt* stmt) pgstat_report_truncate(partOid, heap_relid, is_shared); } - RelationSetNewRelfilenode(rel, u_sess->utils_cxt.RecentXmin, isDfsTruncate); + RelationSetNewRelfilenode(rel, u_sess->utils_cxt.RecentXmin, + RelationIsColStore(rel) ? 
InvalidMultiXactId : minmulti, + isDfsTruncate); freePartList(partTupleList); pgstat_report_truncate( heap_relid, InvalidOid, is_shared); /* report truncate partitioned table to PgStatCollector */ @@ -19943,7 +19952,8 @@ static void destroyMergeingIndexes(Relation srcIndexRelation, List* merging_btre list_free_ext(merging_part_list); } -static void mergePartitionIndexSwap(List* indexRel, List* indexDestPartOid, List* indexDestOid, TransactionId FreezeXid) +static void mergePartitionIndexSwap(List* indexRel, List* indexDestPartOid, List* indexDestOid, TransactionId FreezeXid, + MultiXactId FreezeMultiXid) { ListCell* cell1 = NULL; ListCell* cell2 = NULL; @@ -19966,12 +19976,13 @@ static void mergePartitionIndexSwap(List* indexRel, List* indexDestPartOid, List getPartitionName(dstPartOid, false)))); } /* swap relfilenode between temp index relation and dest index partition */ - finishPartitionHeapSwap(dstPartOid, clonedIndexRelationId, false, FreezeXid); + finishPartitionHeapSwap(dstPartOid, clonedIndexRelationId, false, FreezeXid, FreezeMultiXid); partitionClose(currentIndex, dstPart, NoLock); } } -static void mergePartitionHeapSwap(Relation partTableRel, Oid destPartOid, Oid tempTableOid, TransactionId FreezeXid) +static void mergePartitionHeapSwap(Relation partTableRel, Oid destPartOid, Oid tempTableOid, TransactionId FreezeXid, + MultiXactId FreezeMultiXid) { Partition destPart; @@ -19986,7 +19997,7 @@ static void mergePartitionHeapSwap(Relation partTableRel, Oid destPartOid, Oid t getPartitionName(destPartOid, false)))); } - finishPartitionHeapSwap(destPartOid, tempTableOid, false, FreezeXid); + finishPartitionHeapSwap(destPartOid, tempTableOid, false, FreezeXid, FreezeMultiXid); partitionClose(partTableRel, destPart, NoLock); } @@ -20048,9 +20059,10 @@ static void mergePartitionBTreeIndexes(List* srcPartOids, List* srcPartMergeOffs } static void mergePartitionHeapData(Relation partTableRel, Relation tempTableRel, List* srcPartOids, List* indexRel_list, - List* indexDestOid_list, int2 bucketId, TransactionId* freezexid) + List* indexDestOid_list, int2 bucketId, TransactionId* freezexid, MultiXactId* freezeMultixid) { TransactionId FreezeXid = InvalidTransactionId; + MultiXactId FreezeMultiXid = InvalidMultiXactId; HTAB* chunkIdHashTable = NULL; ListCell* cell1 = NULL; List* mergeToastIndexes = NIL; @@ -20177,18 +20189,22 @@ static void mergePartitionHeapData(Relation partTableRel, Relation tempTableRel, Relation srcPartRel = NULL; char persistency; BlockNumber srcPartHeapBlocks = 0; - TransactionId relfrozenxid; + TransactionId relfrozenxid = InvalidTransactionId; + MultiXactId relminmxid = InvalidMultiXactId; srcPartition = partitionOpen(partTableRel, srcPartOid, ExclusiveLock, bucketId); // already ExclusiveLock // locked srcPartRel = partitionGetRelation(partTableRel, srcPartition); PartitionOpenSmgr(srcPartition); - relfrozenxid = getPartitionRelfrozenxid(srcPartRel); + getPartitionRelxids(srcPartRel, &relfrozenxid, &relminmxid); /* update final fronzenxid, we choose the least one */ if (!TransactionIdIsValid(FreezeXid) || TransactionIdPrecedes(relfrozenxid, FreezeXid)) FreezeXid = relfrozenxid; + if (!MultiXactIdIsValid(FreezeMultiXid) || MultiXactIdPrecedes(relminmxid, FreezeMultiXid)) + FreezeMultiXid = relminmxid; + /* Retry to open smgr in case it is cloesd when we process SI messages */ RelationOpenSmgr(tempTableRel); @@ -20271,6 +20287,9 @@ static void mergePartitionHeapData(Relation partTableRel, Relation tempTableRel, if (freezexid != NULL) *freezexid = FreezeXid; + + if 
(freezeMultixid != NULL) + *freezeMultixid = FreezeMultiXid; /* * 3.4 merge toast indexes and destroy chunkId hash table */ @@ -20368,6 +20387,7 @@ static void ATExecMergePartition(Relation partTableRel, AlterTableCmd* cmd) Relation tempTableRel = NULL; ObjectAddress object; TransactionId FreezeXid; + MultiXactId FreezeMultiXid; LOCKMODE lockMode = NoLock; srcPartitions = (List*)cmd->def; @@ -20587,7 +20607,7 @@ static void ATExecMergePartition(Relation partTableRel, AlterTableCmd* cmd) /* set new empty filenode for toast index */ Relation toastRel = relation_open(tempTableRel->rd_rel->reltoastrelid, AccessExclusiveLock); Relation toastIndexRel = index_open(toastRel->rd_rel->reltoastidxid, AccessExclusiveLock); - RelationSetNewRelfilenode(toastIndexRel, InvalidTransactionId); + RelationSetNewRelfilenode(toastIndexRel, InvalidTransactionId, InvalidMultiXactId); relation_close(toastRel, NoLock); index_close(toastIndexRel, NoLock); } @@ -20612,7 +20632,8 @@ static void ATExecMergePartition(Relation partTableRel, AlterTableCmd* cmd) indexRel_list, clonedIndexRelId_list, bucketlist->values[i], - &FreezeXid); + &FreezeXid, + &FreezeMultiXid); /* first bucket already merged into target cross bucket index. */ if (i != 0) { @@ -20622,7 +20643,8 @@ static void ATExecMergePartition(Relation partTableRel, AlterTableCmd* cmd) } } else { mergePartitionHeapData( - partTableRel, tempTableRel, srcPartOids, indexRel_list, clonedIndexRelId_list, InvalidBktId, &FreezeXid); + partTableRel, tempTableRel, srcPartOids, indexRel_list, clonedIndexRelId_list, InvalidBktId, &FreezeXid, + &FreezeMultiXid); } /* close temp relation */ @@ -20630,10 +20652,10 @@ static void ATExecMergePartition(Relation partTableRel, AlterTableCmd* cmd) heap_close(tempTableRel, NoLock); /* swap the index relfilenode*/ - mergePartitionIndexSwap(indexRel_list, indexDestPartOid_list, clonedIndexRelId_list, FreezeXid); + mergePartitionIndexSwap(indexRel_list, indexDestPartOid_list, clonedIndexRelId_list, FreezeXid, FreezeMultiXid); /* swap the heap relfilenode */ - mergePartitionHeapSwap(partTableRel, destPartOid, tempTableOid, FreezeXid); + mergePartitionHeapSwap(partTableRel, destPartOid, tempTableOid, FreezeXid, FreezeMultiXid); CommandCounterIncrement(); /*free index list*/ @@ -20907,6 +20929,7 @@ static void ATExecExchangePartition(Relation partTableRel, AlterTableCmd* cmd) List* partIndexList = NIL; List* ordIndexList = NIL; TransactionId relfrozenxid = InvalidTransactionId; + MultiXactId relminmxid = InvalidMultiXactId; ordTableOid = RangeVarGetRelid(cmd->exchange_with_rel, AccessExclusiveLock, false); @@ -20998,12 +21021,12 @@ static void ATExecExchangePartition(Relation partTableRel, AlterTableCmd* cmd) checkValidationForExchange(partTableRel, ordTableRel, partOid, cmd->exchange_verbose); } if (RelationIsPartition(ordTableRel)) - relfrozenxid = getPartitionRelfrozenxid(ordTableRel); + getPartitionRelxids(ordTableRel, &relfrozenxid, &relminmxid); else - relfrozenxid = getRelationRelfrozenxid(ordTableRel); + getRelationRelxids(ordTableRel, &relfrozenxid, &relminmxid); // Swap relfilenode of table and toast table - finishPartitionHeapSwap(partOid, ordTableRel->rd_id, false, relfrozenxid); + finishPartitionHeapSwap(partOid, ordTableRel->rd_id, false, relfrozenxid, relminmxid); // Swap relfilenode of index Assert(list_length(partIndexList) == list_length(ordIndexList)); @@ -21019,7 +21042,7 @@ static void ATExecExchangePartition(Relation partTableRel, AlterTableCmd* cmd) // Unusable Global Index ATUnusableGlobalIndex(partTableRel); } 
else { - ExchangePartitionWithGPI(cmd, partTableRel, partOid, relfrozenxid); + ExchangePartitionWithGPI(cmd, partTableRel, partOid, relfrozenxid, relminmxid); } } @@ -22272,7 +22295,7 @@ static void finishIndexSwap(List* partIndexList, List* ordIndexList) partOid = (Oid)lfirst_oid(cell1); ordOid = (Oid)lfirst_oid(cell2); - finishPartitionHeapSwap(partOid, ordOid, true, u_sess->utils_cxt.RecentGlobalXmin); + finishPartitionHeapSwap(partOid, ordOid, true, u_sess->utils_cxt.RecentGlobalXmin, GetOldestMultiXactId()); } } @@ -22469,7 +22492,7 @@ static void ATExecSplitPartition(Relation partTableRel, AlterTableCmd* cmd) // creat temp table and swap relfilenode with src partition tempTableOid = createTempTableForPartition(partTableRel, part); - finishPartitionHeapSwap(srcPartOid, tempTableOid, false, u_sess->utils_cxt.RecentXmin); + finishPartitionHeapSwap(srcPartOid, tempTableOid, false, u_sess->utils_cxt.RecentXmin, GetOldestMultiXactId()); CommandCounterIncrement(); @@ -22940,7 +22963,7 @@ static Oid AddTemporaryHashPartitionForAlterPartitions(const AlterTableCmd* cmd, * @in srcPartOid: current partition oid. */ static void ExchangePartitionWithGPI(const AlterTableCmd* cmd, Relation partTableRel, Oid srcPartOid, - TransactionId frozenXid) + TransactionId frozenXid, MultiXactId multiXid) { List* indexList = NIL; ListCell* cell = NULL; @@ -22994,14 +23017,14 @@ static void ExchangePartitionWithGPI(const AlterTableCmd* cmd, Relation partTabl } /* swap relfilenode between temp index relation and dest index partition */ - finishPartitionHeapSwap(indexDestPartOid, indexSrcPartOid, false, frozenXid, true); + finishPartitionHeapSwap(indexDestPartOid, indexSrcPartOid, false, frozenXid, multiXid, true); partitionClose(indexRel, dstPart, NoLock); relation_close(indexRel, RowExclusiveLock); } // Swap relfilenode of table and toast table CommandCounterIncrement(); - finishPartitionHeapSwap(srcPartOid, destPartOid, false, frozenXid, true); + finishPartitionHeapSwap(srcPartOid, destPartOid, false, frozenXid, multiXid, true); CommandCounterIncrement(); AlterPartitionedSetWaitCleanGPI(cmd->alterGPI, partTableRel, srcPartOid); @@ -24121,7 +24144,8 @@ static void ExecRewriteRowTable(AlteredTableInfo* tab, Oid NewTableSpace, LOCKMO * we never try to swap toast tables by content, since we have no * interest in letting this code work on system catalogs. 
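+ * As in the CLUSTER/VACUUM FULL path, the rewritten heap starts over
+ * with GetOldestMultiXactId() as its relminmxid.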
*/ - finish_heap_swap(tab->relid, OIDNewHeap, false, swapToastByContent, true, u_sess->utils_cxt.RecentXmin); + finish_heap_swap(tab->relid, OIDNewHeap, false, swapToastByContent, true, u_sess->utils_cxt.RecentXmin, + GetOldestMultiXactId()); /* clear all attrinitdefval */ clearAttrInitDefVal(tab->relid); @@ -24201,7 +24225,7 @@ static void ExecRewriteRowPartitionedTable(AlteredTableInfo* tab, Oid NewTableSp heap_close(newRel, NoLock); /* swap the temp table and partition */ - finishPartitionHeapSwap(oldRel->rd_id, OIDNewHeap, false, u_sess->utils_cxt.RecentXmin); + finishPartitionHeapSwap(oldRel->rd_id, OIDNewHeap, false, u_sess->utils_cxt.RecentXmin, GetOldestMultiXactId()); /* record the temp table oid for dropping */ tempTableOidList = lappend_oid(tempTableOidList, OIDNewHeap); diff --git a/src/gausskernel/optimizer/commands/trigger.cpp b/src/gausskernel/optimizer/commands/trigger.cpp index 1f4747a02..0ae3329c5 100644 --- a/src/gausskernel/optimizer/commands/trigger.cpp +++ b/src/gausskernel/optimizer/commands/trigger.cpp @@ -87,7 +87,7 @@ static void ConvertTriggerToFK(CreateTrigStmt* stmt, Oid funcoid); static void SetTriggerFlags(TriggerDesc* trigdesc, const Trigger* trigger); static HeapTuple GetTupleForTrigger(EState* estate, EPQState* epqstate, ResultRelInfo* relinfo, Oid targetPartitionOid, - int2 bucketid, ItemPointer tid, TupleTableSlot** newSlot); + int2 bucketid, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot** newSlot); static void ReleaseFakeRelation(Relation relation, Partition part, Relation* fakeRelation); static bool TriggerEnabled(EState* estate, ResultRelInfo* relinfo, Trigger* trigger, TriggerEvent event, const Bitmapset* modifiedCols, HeapTuple oldtup, HeapTuple newtup); @@ -2125,7 +2125,8 @@ bool ExecBRDeleteTriggers(EState* estate, EPQState* epqstate, ResultRelInfo* rel } else { /* On datanode, do the usual way */ #endif - trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, deletePartitionOid, bucketid, tupleid, &newSlot); + trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, deletePartitionOid, + bucketid, tupleid, LockTupleExclusive, &newSlot); #ifdef PGXC } #endif @@ -2192,7 +2193,8 @@ void ExecARDeleteTriggers(EState* estate, ResultRelInfo* relinfo, Oid deletePart } else { /* Do the usual PG-way for datanode */ #endif - trigtuple = GetTupleForTrigger(estate, NULL, relinfo, deletePartitionOid, bucketid, tupleid, NULL); + trigtuple = GetTupleForTrigger(estate, NULL, relinfo, deletePartitionOid, + bucketid, tupleid, LockTupleExclusive, NULL); #ifdef PGXC } #endif @@ -2346,6 +2348,25 @@ TupleTableSlot* ExecBRUpdateTriggers(EState* estate, EPQState* epqstate, ResultR TupleTableSlot* newSlot = NULL; int i; Bitmapset* updatedCols = NULL; + Bitmapset* keyCols = NULL; + LockTupleMode lockmode; + + /* + * Compute lock mode to use. If columns that are part of the key have not + * been modified, then we can use a weaker lock, allowing for better + * concurrency. 
+ */ + updatedCols = GET_ALL_UPDATED_COLUMNS(relinfo, estate); + keyCols = RelationGetIndexAttrBitmap(relinfo->ri_RelationDesc, INDEX_ATTR_BITMAP_KEY); +#ifndef ENABLE_MULTIPLE_NODES + if (!bms_overlap(keyCols, updatedCols)) { + lockmode = LockTupleNoKeyExclusive; + } else +#endif + { + lockmode = LockTupleExclusive; + } + #ifdef PGXC bool exec_all_triggers = false; @@ -2377,7 +2398,8 @@ TupleTableSlot* ExecBRUpdateTriggers(EState* estate, EPQState* epqstate, ResultR /* On datanode, do the usual way */ #endif /* get a copy of the on-disk tuple we are planning to update */ - trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, oldPartitionOid, bucketid, tupleid, &newSlot); + trigtuple = GetTupleForTrigger(estate, epqstate, relinfo, oldPartitionOid, + bucketid, tupleid, lockmode, &newSlot); if (trigtuple == NULL) return NULL; /* cancel the update action */ @@ -2401,8 +2423,6 @@ TupleTableSlot* ExecBRUpdateTriggers(EState* estate, EPQState* epqstate, ResultR } #endif - updatedCols = GET_ALL_UPDATED_COLUMNS(relinfo, estate); - LocTriggerData.type = T_TriggerData; LocTriggerData.tg_event = TRIGGER_EVENT_UPDATE | TRIGGER_EVENT_ROW | TRIGGER_EVENT_BEFORE; LocTriggerData.tg_relation = relinfo->ri_RelationDesc; @@ -2476,7 +2496,8 @@ void ExecARUpdateTriggers(EState* estate, ResultRelInfo* relinfo, Oid oldPartiti } else { /* Do the usual PG-way for datanode */ #endif - trigtuple = GetTupleForTrigger(estate, NULL, relinfo, oldPartitionOid, bucketid, tupleid, NULL); + trigtuple = GetTupleForTrigger(estate, NULL, relinfo, oldPartitionOid, + bucketid, tupleid, LockTupleExclusive, NULL); #ifdef PGXC } #endif @@ -2629,7 +2650,7 @@ void ExecASTruncateTriggers(EState* estate, ResultRelInfo* relinfo) } static HeapTuple GetTupleForTrigger(EState* estate, EPQState* epqstate, ResultRelInfo* relinfo, Oid targetPartitionOid, - int2 bucketid, ItemPointer tid, TupleTableSlot** newSlot) + int2 bucketid, ItemPointer tid, LockTupleMode lockmode, TupleTableSlot** newSlot) { Relation relation = relinfo->ri_RelationDesc; HeapTupleData tuple; @@ -2796,7 +2817,7 @@ ltrmark:; &tuple, &buffer, estate->es_output_cid, - LockTupleExclusive, + lockmode, false, &tmfd, false, // fake params below are for uheap implementation @@ -2838,7 +2859,8 @@ ltrmark:; TupleTableSlot* epqslot = NULL; epqslot = EvalPlanQual( - estate, epqstate, fakeRelation, relinfo->ri_RangeTableIndex, &tmfd.ctid, tmfd.xmax, false); + estate, epqstate, fakeRelation, relinfo->ri_RangeTableIndex, + lockmode, &tmfd.ctid, tmfd.xmax, false); if (!TupIsNull(epqslot)) { *tid = tmfd.ctid; *newSlot = epqslot; diff --git a/src/gausskernel/optimizer/commands/vacuum.cpp b/src/gausskernel/optimizer/commands/vacuum.cpp index 586d78e46..e062687a8 100644 --- a/src/gausskernel/optimizer/commands/vacuum.cpp +++ b/src/gausskernel/optimizer/commands/vacuum.cpp @@ -34,6 +34,7 @@ #include "access/transam.h" #include "access/xact.h" #include "access/tableam.h" +#include "access/multixact.h" #include "catalog/dfsstore_ctlg.h" #include "catalog/namespace.h" #include "catalog/gs_matview.h" @@ -127,7 +128,7 @@ static void DropEmptyPartitionDirectories(Oid relid); static THR_LOCAL BufferAccessStrategy vac_strategy; static THR_LOCAL int elevel = -1; -static void vac_truncate_clog(TransactionId frozenXID); +static void vac_truncate_clog(TransactionId frozenXID, MultiXactId frozenMulti); static bool vacuum_rel(Oid relid, VacuumStmt* vacstmt, bool do_toast); static void GPIVacuumMainPartition( Relation onerel, const VacuumStmt* vacstmt, LOCKMODE lockmode, BufferAccessStrategy 
bstrategy);
@@ -899,7 +900,7 @@ List* get_rel_oids(Oid relid, VacuumStmt* vacstmt)
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
 void vacuum_set_xid_limits(Relation rel, int64 freeze_min_age, int64 freeze_table_age, TransactionId* oldestXmin,
-    TransactionId* freezeLimit, TransactionId* freezeTableLimit)
+    TransactionId* freezeLimit, TransactionId* freezeTableLimit, MultiXactId* multiXactFrzLimit)
 {
     int64 freezemin;
     TransactionId limit;
@@ -990,6 +991,28 @@ void vacuum_set_xid_limits(Relation rel, int64 freeze_min_age, int64 freeze_tabl
 
         *freezeTableLimit = limit;
     }
+
+    if (multiXactFrzLimit != NULL) {
+#ifndef ENABLE_MULTIPLE_NODES
+        MultiXactId mxLimit;
+
+        /*
+         * simplistic multixactid freezing: use the same freezing policy as
+         * for Xids
+         */
+        mxLimit = GetOldestMultiXactId();
+        if (mxLimit > FirstMultiXactId + freezemin)
+            mxLimit -= freezemin;
+        else
+            mxLimit = FirstMultiXactId;
+
+        *multiXactFrzLimit = mxLimit;
+#else
+        *multiXactFrzLimit = InvalidMultiXactId;
+#endif
+    }
+
 }
 
 /*
@@ -1092,7 +1115,7 @@ static void debug_print_rows_and_pages(Relation relation, Form_pg_class pgcform)
 * This routine is shared by VACUUM and ANALYZE.
 */
 void vac_update_relstats(Relation relation, Relation classRel, RelPageType num_pages, double num_tuples,
-    BlockNumber num_all_visible_pages, bool hasindex, TransactionId frozenxid)
+    BlockNumber num_all_visible_pages, bool hasindex, TransactionId frozenxid, MultiXactId minmulti)
 {
     Oid relid = RelationGetRelid(relation);
     HeapTuple ctup;
@@ -1103,6 +1126,7 @@ void vac_update_relstats(Relation relation, Relation classRel, RelPageType num_p
     TransactionId relfrozenxid;
     Datum xid64datum;
     bool isGtt = false;
+    MultiXactId relminmxid = InvalidMultiXactId;
 
     /* global temp table remember relstats to localhash and rel->rd_rel, not catalog */
     if (RELATION_IS_GLOBAL_TEMP(relation)) {
@@ -1190,11 +1214,20 @@ void vac_update_relstats(Relation relation, Relation classRel, RelPageType num_p
         relfrozenxid = DatumGetTransactionId(xid64datum);
     }
 
-    if (TransactionIdIsNormal(frozenxid) && (TransactionIdPrecedes(relfrozenxid, frozenxid)
-#ifdef PGXC
-        || !IsPostmasterEnvironment)
+#ifndef ENABLE_MULTIPLE_NODES
+    Datum minmxidDatum = tableam_tops_tuple_getattr(ctup,
+        Anum_pg_class_relminmxid,
+        RelationGetDescr(classRel),
+        &isNull);
+    relminmxid = isNull ? InvalidMultiXactId : DatumGetTransactionId(minmxidDatum);
 #endif
-    ) {
+
+    if ((TransactionIdIsNormal(frozenxid) && (TransactionIdPrecedes(relfrozenxid, frozenxid)
+#ifdef PGXC
+        || !IsPostmasterEnvironment
+#endif
+        ))
+#ifndef ENABLE_MULTIPLE_NODES
+        || (MultiXactIdIsValid(minmulti) && (MultiXactIdPrecedes(relminmxid, minmulti) || !IsPostmasterEnvironment)) ||
+        isNull
+#endif
+        ) {
         Datum values[Natts_pg_class];
         bool nulls[Natts_pg_class];
@@ -1210,8 +1243,17 @@ void vac_update_relstats(Relation relation, Relation classRel, RelPageType num_p
         rc = memset_s(replaces, sizeof(replaces), false, sizeof(replaces));
         securec_check(rc, "", "");
 
-        replaces[Anum_pg_class_relfrozenxid64 - 1] = true;
-        values[Anum_pg_class_relfrozenxid64 - 1] = TransactionIdGetDatum(frozenxid);
+        if (TransactionIdIsNormal(frozenxid) && TransactionIdPrecedes(relfrozenxid, frozenxid)) {
+            replaces[Anum_pg_class_relfrozenxid64 - 1] = true;
+            values[Anum_pg_class_relfrozenxid64 - 1] = TransactionIdGetDatum(frozenxid);
+        }
+
+#ifndef ENABLE_MULTIPLE_NODES
+        if ((MultiXactIdIsValid(minmulti) && MultiXactIdPrecedes(relminmxid, minmulti)) || isNull) {
+            replaces[Anum_pg_class_relminmxid - 1] = true;
+            values[Anum_pg_class_relminmxid - 1] = TransactionIdGetDatum(minmulti);
+        }
+#endif
 
         nctup = (HeapTuple) tableam_tops_modify_tuple(ctup, RelationGetDescr(classRel), values, nulls, replaces);
         ctup = nctup;
@@ -1235,8 +1277,13 @@
 * vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB
 *
 * Update pg_database's datfrozenxid entry for our database to be the
- * minimum of the pg_class.relfrozenxid values. If we are able to
- * advance pg_database.datfrozenxid, also try to truncate pg_clog.
+ * minimum of the pg_class.relfrozenxid values.
+ *
+ * Similarly, update our datminmxid to be the minimum of the
+ * pg_class.relminmxid values.
+ *
+ * If we are able to advance either pg_database value, also try to
+ * truncate pg_clog and pg_multixact.
 *
 * We violate transaction semantics here by overwriting the database's
 * existing pg_database tuple with the new value. This is reasonably
@@ -1260,6 +1307,12 @@ void vac_update_datfrozenxid(void)
     TransactionId datfrozenxid;
     TransactionId relfrozenxid;
     Datum xid64datum;
+    MultiXactId newFrozenMulti = InvalidMultiXactId;
+#ifndef ENABLE_MULTIPLE_NODES
+    Datum minmxidDatum;
+    MultiXactId relminmxid = InvalidMultiXactId;
+#endif
+    MultiXactId datminmxid = InvalidMultiXactId;
 
     /* Don't update datfrozenxid when cluser is resizing */
     if (ClusterResizingInProgress()) {
@@ -1269,10 +1322,18 @@
     * Initialize the "min" calculation with GetOldestXmin, which is a
     * reasonable approximation to the minimum relfrozenxid for not-yet-
     * committed pg_class entries for new tables; see AddNewRelationTuple().
-    * Se we cannot produce a wrong minimum by starting with this.
+    * So we cannot produce a wrong minimum by starting with this.
     */
     newFrozenXid = GetOldestXmin(NULL);
 
+#ifndef ENABLE_MULTIPLE_NODES
+    /*
+     * Similarly, initialize the MultiXact "min" with the value that would
+     * be used on pg_class for new tables. See AddNewRelationTuple().
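+     * Like GetOldestXmin for relfrozenxid, this starting point cannot be
+     * newer than the relminmxid of any not-yet-committed pg_class entry,
+     * so the computed minimum stays safe.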
+ */ + newFrozenMulti = GetOldestMultiXactId(); +#endif + lastSaneFrozenXid = ReadNewTransactionId(); /* @@ -1342,6 +1403,15 @@ void vac_update_datfrozenxid(void) if (TransactionIdPrecedes(relfrozenxid, newFrozenXid)) newFrozenXid = relfrozenxid; + +#ifndef ENABLE_MULTIPLE_NODES + minmxidDatum = tableam_tops_tuple_getattr( + classTup, Anum_pg_class_relminmxid, RelationGetDescr(relation), &isNull); + relminmxid = isNull ? FirstMultiXactId : DatumGetTransactionId(minmxidDatum); + + if (MultiXactIdIsValid(relminmxid) && MultiXactIdPrecedes(relminmxid, newFrozenMulti)) + newFrozenMulti = relminmxid; +#endif } /* we're done with pg_class */ @@ -1425,11 +1495,16 @@ void vac_update_datfrozenxid(void) /* Consider frozenxid of objects in recyclebin. */ TrAdjustFrozenXid64(u_sess->proc_cxt.MyDatabaseId, &newFrozenXid); - if ((TransactionIdPrecedes(datfrozenxid, newFrozenXid)) -#ifdef PGXC - || !IsPostmasterEnvironment) +#ifndef ENABLE_MULTIPLE_NODES + minmxidDatum = tableam_tops_tuple_getattr(tuple, Anum_pg_database_datminmxid, RelationGetDescr(relation), &isNull); + datminmxid = isNull ? FirstMultiXactId : DatumGetTransactionId(minmxidDatum); #endif - { + + if ((TransactionIdPrecedes(datfrozenxid, newFrozenXid)) || (MultiXactIdPrecedes(datminmxid, newFrozenMulti)) +#ifdef PGXC + || !IsPostmasterEnvironment +#endif + ) { Datum values[Natts_pg_database]; bool nulls[Natts_pg_database]; bool replaces[Natts_pg_database]; @@ -1444,9 +1519,16 @@ void vac_update_datfrozenxid(void) rc = memset_s(replaces, sizeof(replaces), false, sizeof(replaces)); securec_check(rc, "", ""); - replaces[Anum_pg_database_datfrozenxid64 - 1] = true; - values[Anum_pg_database_datfrozenxid64 - 1] = TransactionIdGetDatum(newFrozenXid); - + if (TransactionIdPrecedes(datfrozenxid, newFrozenXid)) { + replaces[Anum_pg_database_datfrozenxid64 - 1] = true; + values[Anum_pg_database_datfrozenxid64 - 1] = TransactionIdGetDatum(newFrozenXid); + } +#ifndef ENABLE_MULTIPLE_NODES + if (MultiXactIdPrecedes(datminmxid, newFrozenMulti)) { + replaces[Anum_pg_database_datminmxid - 1] = true; + values[Anum_pg_database_datminmxid - 1] = TransactionIdGetDatum(newFrozenMulti); + } +#endif newtuple = (HeapTuple) tableam_tops_modify_tuple(tuple, RelationGetDescr(relation), values, nulls, replaces); dirty = true; } @@ -1470,7 +1552,7 @@ void vac_update_datfrozenxid(void) * this action will update that too. */ if (dirty || ForceTransactionIdLimitUpdate()) { - vac_truncate_clog(newFrozenXid); + vac_truncate_clog(newFrozenXid, newFrozenMulti); } } @@ -1486,14 +1568,17 @@ void vac_update_datfrozenxid(void) * This routine is only invoked when we've managed to change our * DB's datfrozenxid entry. 
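+ * With the multixact changes it is likewise reached after advancing the
+ * datminmxid entry.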
*/ -static void vac_truncate_clog(TransactionId frozenXID) +static void vac_truncate_clog(TransactionId frozenXID, MultiXactId frozenMulti) { Relation relation; TableScanDesc scan; HeapTuple tuple; - Oid oldest_datoid; - /* init oldest_datoid to sync with my frozenXID */ - oldest_datoid = u_sess->proc_cxt.MyDatabaseId; + Oid oldestxid_datoid; + Oid oldestmulti_datoid; + + /* init oldest datoids to sync with my frozen values */ + oldestxid_datoid = u_sess->proc_cxt.MyDatabaseId; + oldestmulti_datoid = u_sess->proc_cxt.MyDatabaseId; /* * Scan pg_database to compute the minimum datfrozenxid @@ -1518,6 +1603,7 @@ static void vac_truncate_clog(TransactionId frozenXID) scan = tableam_scan_begin(relation, SnapshotNow, 0, NULL); while ((tuple = (HeapTuple) tableam_scan_getnexttuple(scan, ForwardScanDirection)) != NULL) { volatile FormData_pg_database* dbform = (Form_pg_database)GETSTRUCT(tuple); + bool isNull = false; TransactionId datfrozenxid; Datum xid64datum = tableam_tops_tuple_getattr(tuple, Anum_pg_database_datfrozenxid64, RelationGetDescr(relation), &isNull); @@ -1534,8 +1620,20 @@ static void vac_truncate_clog(TransactionId frozenXID) if (TransactionIdPrecedes(datfrozenxid, frozenXID)) { frozenXID = datfrozenxid; - oldest_datoid = HeapTupleGetOid(tuple); + oldestxid_datoid = HeapTupleGetOid(tuple); } + +#ifndef ENABLE_MULTIPLE_NODES + Datum minmxidDatum = tableam_tops_tuple_getattr(tuple, Anum_pg_database_datminmxid, + RelationGetDescr(relation), &isNull); + MultiXactId datminmxid = isNull ? FirstMultiXactId : DatumGetTransactionId(minmxidDatum); + Assert(MultiXactIdIsValid(datminmxid)); + + if (MultiXactIdPrecedes(datminmxid, frozenMulti)) { + frozenMulti = datminmxid; + oldestmulti_datoid = HeapTupleGetOid(tuple); + } +#endif } heap_endscan(scan); @@ -1544,20 +1642,26 @@ static void vac_truncate_clog(TransactionId frozenXID) /* Truncate CLOG to the oldest frozenxid */ TruncateCLOG(frozenXID); +#ifndef ENABLE_MULTIPLE_NODES + TruncateMultiXact(frozenMulti); +#endif /* - * Update the wrap limit for GetNewTransactionId. Note: this function - * will also signal the postmaster for an(other) autovac cycle if needed. + * Update the wrap limit for GetNewTransactionId and creation of new + * MultiXactIds. Note: these functions will also signal the postmaster for + * an(other) autovac cycle if needed. XXX should we avoid possibly + * signalling twice? 
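+     * (Each of SetTransactionIdLimit() and MultiXactAdvanceOldest() can
+     * request an autovacuum cycle on its own, hence the possible double
+     * signal.)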
 */
-    SetTransactionIdLimit(frozenXID, oldest_datoid);
+    SetTransactionIdLimit(frozenXID, oldestxid_datoid);
+#ifndef ENABLE_MULTIPLE_NODES
+    MultiXactAdvanceOldest(frozenMulti, oldestmulti_datoid);
+#endif
 
     ereport(LOG,
-        (errmsg("In truncate clog: frozenXID:" XID_FMT ", oldest_datoid: %u, xid:" XID_FMT
+        (errmsg("In truncate clog: frozenXID:" XID_FMT ", oldestxid_datoid: %u, xid:" XID_FMT
                 ", pid: %lu, ShmemVariableCache: nextOid:%u, oldestXid:" XID_FMT ","
                 "nextXid:" XID_FMT ", xidVacLimit:%lu, oldestXidDB:%u, RecentXmin:" XID_FMT
                 ", RecentGlobalXmin:" XID_FMT ", OldestXmin:" XID_FMT ", FreezeLimit:%lu, useLocalSnapshot:%d.",
             frozenXID,
-            oldest_datoid,
+            oldestxid_datoid,
             t_thrd.pgxact->xid,
             t_thrd.proc->pid,
             t_thrd.xact_cxt.ShmemVariableCache->nextOid,
@@ -2516,7 +2620,7 @@ void vacuum_delay_point(void)
 }
 
 void vac_update_partstats(Partition part, BlockNumber num_pages, double num_tuples, BlockNumber num_all_visible_pages,
-    TransactionId frozenxid)
+    TransactionId frozenxid, MultiXactId minmulti)
 {
     Oid partid = PartitionGetPartid(part);
     Relation rd;
@@ -2527,6 +2631,7 @@ void vac_update_partstats(Partition part, BlockNumber num_pages, double num_tupl
     bool isNull = false;
     TransactionId relfrozenxid;
     Datum xid64datum;
+    MultiXactId relminmxid = MaxMultiXactId;
 
     rd = heap_open(PartitionRelationId, RowExclusiveLock);
 
@@ -2576,7 +2681,14 @@ void vac_update_partstats(Partition part, BlockNumber num_pages, double num_tupl
         relfrozenxid = DatumGetTransactionId(xid64datum);
     }
 
-    if (TransactionIdIsNormal(frozenxid) && TransactionIdPrecedes(relfrozenxid, frozenxid)) {
+#ifndef ENABLE_MULTIPLE_NODES
+    Datum minmxidDatum = tableam_tops_tuple_getattr(parttup, Anum_pg_partition_relminmxid,
+        RelationGetDescr(rd), &isNull);
+    relminmxid = isNull ? FirstMultiXactId : DatumGetTransactionId(minmxidDatum);
+#endif
+
+    if ((TransactionIdIsNormal(frozenxid) && TransactionIdPrecedes(relfrozenxid, frozenxid))
+#ifndef ENABLE_MULTIPLE_NODES
+        || (MultiXactIdIsValid(minmulti) && MultiXactIdPrecedes(relminmxid, minmulti))
+#endif
+        ) {
         Datum values[Natts_pg_partition];
         bool nulls[Natts_pg_partition];
         bool replaces[Natts_pg_partition];
@@ -2591,8 +2703,17 @@ void vac_update_partstats(Partition part, BlockNumber num_pages, double num_tupl
         rc = memset_s(replaces, sizeof(replaces), false, sizeof(replaces));
         securec_check(rc, "", "");
 
-        replaces[Anum_pg_partition_relfrozenxid64 - 1] = true;
-        values[Anum_pg_partition_relfrozenxid64 - 1] = TransactionIdGetDatum(frozenxid);
+        if (TransactionIdIsNormal(frozenxid) && TransactionIdPrecedes(relfrozenxid, frozenxid)) {
+            replaces[Anum_pg_partition_relfrozenxid64 - 1] = true;
+            values[Anum_pg_partition_relfrozenxid64 - 1] = TransactionIdGetDatum(frozenxid);
+        }
+
+#ifndef ENABLE_MULTIPLE_NODES
+        if (MultiXactIdIsValid(minmulti) && MultiXactIdPrecedes(relminmxid, minmulti)) {
+            replaces[Anum_pg_partition_relminmxid - 1] = true;
+            values[Anum_pg_partition_relminmxid - 1] = TransactionIdGetDatum(minmulti);
+        }
+#endif
 
         nparttup = (HeapTuple) tableam_tops_modify_tuple(parttup, RelationGetDescr(rd), values, nulls, replaces);
         parttup = nparttup;
@@ -2742,16 +2863,18 @@ void vac_close_part_indexes(
 }
 
 /* Scan pg_partition to get all the partitions of the partitioned table,
- * calculate all the pages, tuples, and the min frozenXid
+ * calculate all the pages, tuples, and the min frozenXid, multiXid
 */
 void CalculatePartitionedRelStats(_in_ Relation partitionRel, _in_ Relation partRel, _out_ BlockNumber* totalPages,
-    _out_ BlockNumber* totalVisiblePages, _out_ double* totalTuples, _out_ TransactionId* minFrozenXid)
+    _out_ BlockNumber* totalVisiblePages, _out_ double* totalTuples, _out_ TransactionId* minFrozenXid,
+    _out_ MultiXactId* minMultiXid)
 {
     ScanKeyData partKey[2];
     BlockNumber pages = 0;
     BlockNumber allVisiblePages = 0;
     double tuples = 0;
     TransactionId frozenXid;
+    MultiXactId multiXid = InvalidMultiXactId;
     Form_pg_partition partForm;
 
     Assert(partitionRel->rd_rel->parttype == PARTTYPE_PARTITIONED_RELATION);
@@ -2781,7 +2904,7 @@ void CalculatePartitionedRelStats(_in_ Relation partitionRel, _in_ Relation part
     SysScanDesc partScan = systable_beginscan(partRel, PartitionParentOidIndexId, true, NULL, 2, partKey);
     HeapTuple partTuple = NULL;
 
-    /* compute all pages, tuples and the minimum frozenXid */
+    /* compute all pages, tuples and the minimum frozenXid, multiXid */
     partTuple = systable_getnext(partScan);
     if (partTuple != NULL) {
         partForm = (Form_pg_partition)GETSTRUCT(partTuple);
@@ -2803,6 +2926,12 @@ void CalculatePartitionedRelStats(_in_ Relation partitionRel, _in_ Relation part
 
         frozenXid = relfrozenxid;
 
+#ifndef ENABLE_MULTIPLE_NODES
+        xid64datum = tableam_tops_tuple_getattr(partTuple, Anum_pg_partition_relminmxid,
+            RelationGetDescr(rel), &isNull);
+        multiXid = isNull ? FirstMultiXactId : DatumGetTransactionId(xid64datum);
+#endif
+
         do {
             partForm = (Form_pg_partition)GETSTRUCT(partTuple);
 
@@ -2826,11 +2955,22 @@ void CalculatePartitionedRelStats(_in_ Relation partitionRel, _in_ Relation part
             if (TransactionIdPrecedes(relfrozenxid, frozenXid)) {
                 frozenXid = relfrozenxid;
             }
+
+#ifndef ENABLE_MULTIPLE_NODES
+            xid64datum = tableam_tops_tuple_getattr(partTuple, Anum_pg_partition_relminmxid,
+                RelationGetDescr(rel), &isNull);
+            MultiXactId relminmxid = isNull ? FirstMultiXactId : DatumGetTransactionId(xid64datum);
+
+            if (MultiXactIdPrecedes(relminmxid, multiXid)) {
+                multiXid = relminmxid;
+            }
+#endif
         } while ((partTuple = systable_getnext(partScan)) != NULL);
 
         heap_close(rel, AccessShareLock);
     } else {
         frozenXid = InvalidTransactionId;
+        multiXid = InvalidMultiXactId;
     }
 
     systable_endscan(partScan);
@@ -2843,17 +2983,21 @@ void CalculatePartitionedRelStats(_in_ Relation partitionRel, _in_ Relation part
         *totalTuples = tuples;
     if (minFrozenXid != NULL)
         *minFrozenXid = frozenXid;
+    if (minMultiXid != NULL)
+        *minMultiXid = multiXid;
 }
 
 /*
 * After VACUUM or ANALYZE, update pg_class for the partitioned tables.
*/ -void vac_update_pgclass_partitioned_table(Relation partitionRel, bool hasIndex, TransactionId newFrozenXid) +void vac_update_pgclass_partitioned_table(Relation partitionRel, bool hasIndex, TransactionId newFrozenXid, + MultiXactId newMultiXid) { BlockNumber pages = 0; BlockNumber allVisiblePages = 0; double tuples = 0; TransactionId frozenXid = newFrozenXid; + MultiXactId multiXid = newMultiXid; Assert(partitionRel->rd_rel->parttype == PARTTYPE_PARTITIONED_RELATION); @@ -2864,8 +3008,8 @@ void vac_update_pgclass_partitioned_table(Relation partitionRel, bool hasIndex, */ Relation classRel = heap_open(RelationRelationId, RowExclusiveLock); Relation partRel = heap_open(PartitionRelationId, ShareUpdateExclusiveLock); - CalculatePartitionedRelStats(partitionRel, partRel, &pages, &allVisiblePages, &tuples, &frozenXid); - vac_update_relstats(partitionRel, classRel, pages, tuples, allVisiblePages, hasIndex, frozenXid); + CalculatePartitionedRelStats(partitionRel, partRel, &pages, &allVisiblePages, &tuples, &frozenXid, &multiXid); + vac_update_relstats(partitionRel, classRel, pages, tuples, allVisiblePages, hasIndex, frozenXid, multiXid); heap_close(partRel, ShareUpdateExclusiveLock); heap_close(classRel, RowExclusiveLock); } @@ -2884,7 +3028,7 @@ void CStoreVacUpdatePartitionRelStats(Relation partitionRel, TransactionId newFr */ Relation pgclassRel = heap_open(RelationRelationId, RowExclusiveLock); Relation pgPartitionRel = heap_open(PartitionRelationId, ShareUpdateExclusiveLock); - CalculatePartitionedRelStats(partitionRel, pgPartitionRel, NULL, NULL, NULL, &frozenXid); + CalculatePartitionedRelStats(partitionRel, pgPartitionRel, NULL, NULL, NULL, &frozenXid, NULL); CStoreVacUpdateNormalRelStats(RelationGetRelid(partitionRel), frozenXid, pgclassRel); heap_close(pgPartitionRel, ShareUpdateExclusiveLock); heap_close(pgclassRel, RowExclusiveLock); @@ -3149,7 +3293,8 @@ void merge_cu_relation(void* _info, VacuumStmt* stmt) getTuplesAndInsert(delta_rel, OIDNewHeap); /* swap relfile node */ - finish_heap_swap(deltaOid, OIDNewHeap, false, false, false, u_sess->utils_cxt.RecentGlobalXmin); + finish_heap_swap(deltaOid, OIDNewHeap, false, false, false, u_sess->utils_cxt.RecentGlobalXmin, + InvalidMultiXactId); /* close relation */ relation_close(delta_rel, NoLock); @@ -3191,7 +3336,8 @@ void merge_cu_relation(void* _info, VacuumStmt* stmt) getTuplesAndInsert(delta_rel, OIDNewHeap); /* swap relfile node */ - finish_heap_swap(deltaOid, OIDNewHeap, false, false, false, u_sess->utils_cxt.RecentGlobalXmin); + finish_heap_swap(deltaOid, OIDNewHeap, false, false, false, u_sess->utils_cxt.RecentGlobalXmin, + InvalidMultiXactId); /* close relation */ if (delta_rel != NULL) @@ -3939,7 +4085,8 @@ static void GPIVacuumMainPartition( cbi_set_enable_clean(iRel[i]); } vac_update_relstats( - iRel[i], classRel, indstats[i]->num_pages, indstats[i]->num_index_tuples, 0, false, InvalidTransactionId); + iRel[i], classRel, indstats[i]->num_pages, indstats[i]->num_index_tuples, 0, false, InvalidTransactionId, + InvalidMultiXactId); pfree_ext(indstats[i]); index_close(iRel[i], lockmode); } diff --git a/src/gausskernel/optimizer/commands/vacuumlazy.cpp b/src/gausskernel/optimizer/commands/vacuumlazy.cpp index 2c555681b..5fe65de5b 100644 --- a/src/gausskernel/optimizer/commands/vacuumlazy.cpp +++ b/src/gausskernel/optimizer/commands/vacuumlazy.cpp @@ -47,6 +47,7 @@ #include "access/transam.h" #include "access/visibilitymap.h" #include "access/xlog.h" +#include "access/multixact.h" #include "catalog/catalog.h" #include 
"catalog/storage.h" #include "catalog/pg_hashbucket_fn.h" @@ -181,6 +182,7 @@ void lazy_vacuum_rel(Relation onerel, VacuumStmt* vacstmt, BufferAccessStrategy double new_rel_tuples; BlockNumber new_rel_allvisible; TransactionId new_frozen_xid; + MultiXactId new_min_multi; Relation* indexrel = NULL; Partition* indexpart = NULL; uint32 statFlag = onerel->parentId; @@ -193,7 +195,8 @@ void lazy_vacuum_rel(Relation onerel, VacuumStmt* vacstmt, BufferAccessStrategy vacstmt->freeze_table_age, &u_sess->cmd_cxt.OldestXmin, &u_sess->cmd_cxt.FreezeLimit, - &freezeTableLimit); + &freezeTableLimit, + &u_sess->cmd_cxt.MultiXactFrzLimit); new_frozen_xid = u_sess->cmd_cxt.FreezeLimit; @@ -322,7 +325,8 @@ void lazy_vacuum_rel(Relation onerel, VacuumStmt* vacstmt, BufferAccessStrategy vacstmt->freeze_table_age, &u_sess->cmd_cxt.OldestXmin, &u_sess->cmd_cxt.FreezeLimit, - &freezeTableLimit); + &freezeTableLimit, + &u_sess->cmd_cxt.MultiXactFrzLimit); bool isNull = false; TransactionId relfrozenxid; @@ -488,10 +492,16 @@ void lazy_vacuum_rel(Relation onerel, VacuumStmt* vacstmt, BufferAccessStrategy new_frozen_xid = InvalidTransactionId; } + new_min_multi = u_sess->cmd_cxt.MultiXactFrzLimit; + if (vacrelstats->scanned_pages < vacrelstats->rel_pages || vacrelstats->hasKeepInvisbleTuples) { + new_min_multi = InvalidMultiXactId; + } + if (RelationIsPartition(onerel)) { Assert(vacstmt->onepart != NULL); - vac_update_partstats(vacstmt->onepart, new_rel_pages, new_rel_tuples, new_rel_allvisible, new_frozen_xid); + vac_update_partstats(vacstmt->onepart, new_rel_pages, new_rel_tuples, + new_rel_allvisible, new_frozen_xid, new_min_multi); /* * when vacuum partition, do not change the relhasindex field in pg_class * for partitioned table, as some partition may be altered as "all local @@ -500,7 +510,7 @@ void lazy_vacuum_rel(Relation onerel, VacuumStmt* vacstmt, BufferAccessStrategy * misdguge as hot update even if update indexes columns. 
*/ vac_update_pgclass_partitioned_table( - vacstmt->onepartrel, vacstmt->onepartrel->rd_rel->relhasindex, new_frozen_xid); + vacstmt->onepartrel, vacstmt->onepartrel->rd_rel->relhasindex, new_frozen_xid, new_min_multi); // update stats of local partition indexes for (int idx = 0; idx < nindexes - nindexesGlobal; idx++) { @@ -512,9 +522,10 @@ void lazy_vacuum_rel(Relation onerel, VacuumStmt* vacstmt, BufferAccessStrategy vacrelstats->new_idx_pages[idx], vacrelstats->new_idx_tuples[idx], 0, - InvalidTransactionId); + InvalidTransactionId, + InvalidMultiXactId); - vac_update_pgclass_partitioned_table(indexrel[idx], false, InvalidTransactionId); + vac_update_pgclass_partitioned_table(indexrel[idx], false, InvalidTransactionId, InvalidMultiXactId); } // update stats of global partition indexes @@ -531,13 +542,15 @@ void lazy_vacuum_rel(Relation onerel, VacuumStmt* vacstmt, BufferAccessStrategy vacrelstats->new_idx_tuples[idx], 0, false, - InvalidTransactionId); + InvalidTransactionId, + InvalidMultiXactId); } heap_close(classRel, RowExclusiveLock); } else { Relation classRel = heap_open(RelationRelationId, RowExclusiveLock); vac_update_relstats( - onerel, classRel, new_rel_pages, new_rel_tuples, new_rel_allvisible, vacrelstats->hasindex, new_frozen_xid); + onerel, classRel, new_rel_pages, new_rel_tuples, new_rel_allvisible, + vacrelstats->hasindex, new_frozen_xid, new_min_multi); for (int idx = 0; idx < nindexes; idx++) { /* update index status */ @@ -551,7 +564,8 @@ void lazy_vacuum_rel(Relation onerel, VacuumStmt* vacstmt, BufferAccessStrategy vacrelstats->new_idx_tuples[idx], 0, false, - InvalidTransactionId); + InvalidTransactionId, + InvalidMultiXactId); } heap_close(classRel, RowExclusiveLock); } @@ -1032,6 +1046,7 @@ static IndexBulkDeleteResult** lazy_scan_heap( bool all_visible = false; bool has_dead_tuples = false; TransactionId visibility_cutoff_xid = InvalidTransactionId; + bool changedMultiXid; /* IO collector and IO scheduler for vacuum */ if (ENABLE_WORKLOAD_CONTROL) @@ -1266,6 +1281,7 @@ static IndexBulkDeleteResult** lazy_scan_heap( all_visible = true; has_dead_tuples = false; nfrozen = 0; + changedMultiXid = false; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); @@ -1417,7 +1433,8 @@ static IndexBulkDeleteResult** lazy_scan_heap( * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ - if (heap_freeze_tuple(&tuple, u_sess->cmd_cxt.FreezeLimit)) + if (heap_freeze_tuple(&tuple, u_sess->cmd_cxt.FreezeLimit, u_sess->cmd_cxt.MultiXactFrzLimit, + &changedMultiXid)) frozen[nfrozen++] = offnum; } @@ -1435,7 +1452,9 @@ static IndexBulkDeleteResult** lazy_scan_heap( if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; - recptr = log_heap_freeze(onerel, buf, u_sess->cmd_cxt.FreezeLimit, frozen, nfrozen); + recptr = log_heap_freeze(onerel, buf, u_sess->cmd_cxt.FreezeLimit, + changedMultiXid ? 
u_sess->cmd_cxt.MultiXactFrzLimit : InvalidMultiXactId, + frozen, nfrozen); PageSetLSN(page, recptr); } END_CRIT_SECTION(); @@ -1782,7 +1801,7 @@ static bool lazy_check_needs_freeze(Buffer buf) HeapTupleCopyBaseFromPage(&tuple, page); ItemPointerSet(&(tuple.t_self), BufferGetBlockNumber(buf), offnum); - if (heap_tuple_needs_freeze(&tuple, u_sess->cmd_cxt.FreezeLimit, buf)) + if (heap_tuple_needs_freeze(&tuple, u_sess->cmd_cxt.FreezeLimit, u_sess->cmd_cxt.MultiXactFrzLimit, buf)) return true; } /* scan along page */ diff --git a/src/gausskernel/optimizer/plan/initsplan.cpp b/src/gausskernel/optimizer/plan/initsplan.cpp index 460a12afa..973d92b9b 100644 --- a/src/gausskernel/optimizer/plan/initsplan.cpp +++ b/src/gausskernel/optimizer/plan/initsplan.cpp @@ -885,11 +885,11 @@ static SpecialJoinInfo* make_outerjoininfo( "unexpected join type."); /* - * Presently the executor cannot support FOR UPDATE/SHARE marking of rels + * Presently the executor cannot support FOR [KEY] UPDATE/SHARE marking of rels * appearing on the nullable side of an outer join. (It's somewhat unclear * what that would mean, anyway: what should we mark when a result row is * generated from no element of the nullable relation?) So, complain if - * any nullable rel is FOR UPDATE/SHARE. + * any nullable rel is FOR [KEY] UPDATE/SHARE. * * You might be wondering why this test isn't made far upstream in the * parser. It's because the parser hasn't got enough info --- consider @@ -906,7 +906,8 @@ static SpecialJoinInfo* make_outerjoininfo( ereport(ERROR, (errmodule(MOD_OPT), errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("SELECT FOR UPDATE/SHARE cannot be applied to the nullable side of an outer join"))); + errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE cannot be applied to the nullable side " + "of an outer join"))); } } diff --git a/src/gausskernel/optimizer/plan/planner.cpp b/src/gausskernel/optimizer/plan/planner.cpp index 626db0e1c..c35af9ca2 100755 --- a/src/gausskernel/optimizer/plan/planner.cpp +++ b/src/gausskernel/optimizer/plan/planner.cpp @@ -1781,7 +1781,7 @@ Plan* subquery_planner(PlannerGlobal* glob, Query* parse, PlannerInfo* parent_ro returningLists = NIL; /* - * If there was a FOR UPDATE/SHARE clause, the LockRows node will + * If there was a FOR [KEY] UPDATE/SHARE clause, the LockRows node will * have dealt with fetching non-locked marked rows, else we need * to have ModifyTable do that. */ @@ -2298,7 +2298,7 @@ static Plan* inheritance_planner(PlannerInfo* root) root->simple_rel_array = save_rel_array; root->simple_rte_array = save_rte_array; /* - * If there was a FOR UPDATE/SHARE clause, the LockRows node will have + * If there was a FOR [KEY] UPDATE/SHARE clause, the LockRows node will have * dealt with fetching non-locked marked rows, else we need to have * ModifyTable do that. */ @@ -2649,13 +2649,14 @@ static Plan* grouping_planner(PlannerInfo* root, double tuple_fraction) tlist = postprocess_setop_tlist((List*)copyObject(result_plan->targetlist), tlist); /* - * Can't handle FOR UPDATE/SHARE here (parser should have checked + * Can't handle FOR [KEY] UPDATE/SHARE here (parser should have checked * already, but let's make sure). 
 */
        if (parse->rowMarks)
            ereport(ERROR,
                (errmodule(MOD_OPT),
                    errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                    errmsg("SELECT FOR UPDATE/SHARE is not allowed with UNION/INTERSECT/EXCEPT"),
+                    errmsg("SELECT FOR UPDATE/SHARE/NO KEY UPDATE/KEY SHARE is not allowed "
+                        "with UNION/INTERSECT/EXCEPT"),
                    errdetail("N/A"),
                    errcause("SQL uses unsupported feature."),
                    erraction("Modify SQL statement according to the manual.")));
@@ -4142,7 +4143,7 @@ static Plan* grouping_planner(PlannerInfo* root, double tuple_fraction)
    }

    /*
-     * If there is a FOR UPDATE/SHARE clause, add the LockRows node. (Note: we
+     * If there is a FOR [KEY] UPDATE/SHARE clause, add the LockRows node. (Note: we
     * intentionally test parse->rowMarks not root->rowMarks here. If there
     * are only non-locking rowmarks, they should be handled by the
     * ModifyTable node instead.)
@@ -4832,7 +4833,7 @@ static void preprocess_rowmarks(PlannerInfo* root)
    if (parse->rowMarks) {
        /*
-         * We've got trouble if FOR UPDATE/SHARE appears inside grouping,
+         * We've got trouble if FOR [KEY] UPDATE/SHARE appears inside grouping,
         * since grouping renders a reference to individual tuple CTIDs
         * invalid. This is also checked at parse time, but that's
         * insufficient because of rule substitution, query pullup, etc.
@@ -4840,7 +4841,7 @@ static void preprocess_rowmarks(PlannerInfo* root)
        CheckSelectLocking(parse);
    } else {
        /*
-         * We only need rowmarks for UPDATE, DELETE, MEREG INTO, or FOR UPDATE/SHARE.
+         * We only need rowmarks for UPDATE, DELETE, MERGE INTO, or FOR [KEY] UPDATE/SHARE.
         */
        if (parse->commandType != CMD_UPDATE && parse->commandType != CMD_DELETE &&
            (parse->commandType != CMD_MERGE || (u_sess->opt_cxt.is_stream == false && IS_SINGLE_NODE == false)))
@@ -4850,7 +4851,7 @@ static void preprocess_rowmarks(PlannerInfo* root)
    /*
     * We need to have rowmarks for all base relations except the target. We
     * make a bitmapset of all base rels and then remove the items we don't
-     * need or have FOR UPDATE/SHARE marks for.
+     * need or have FOR [KEY] UPDATE/SHARE marks for.
*/ rels = get_base_rel_indexes((Node*)parse->jointree); if (parse->resultRelation) @@ -4897,10 +4898,23 @@ static void preprocess_rowmarks(PlannerInfo* root) newrc = makeNode(PlanRowMark); newrc->rti = newrc->prti = rc->rti; newrc->rowmarkId = ++(root->glob->lastRowMarkId); - if (rc->forUpdate) - newrc->markType = ROW_MARK_EXCLUSIVE; - else - newrc->markType = ROW_MARK_SHARE; + switch (rc->strength) { + case LCS_FORUPDATE: + newrc->markType = ROW_MARK_EXCLUSIVE; + break; + case LCS_FORNOKEYUPDATE: + newrc->markType = ROW_MARK_NOKEYEXCLUSIVE; + break; + case LCS_FORSHARE: + newrc->markType = ROW_MARK_SHARE; + break; + case LCS_FORKEYSHARE: + newrc->markType = ROW_MARK_KEYSHARE; + break; + default: + ereport(ERROR, (errmsg("unknown lock type: %d", rc->strength))); + break; + } newrc->noWait = rc->noWait; newrc->isParent = false; newrc->bms_nodeids = ng_get_baserel_data_nodeids(rte->relid, rte->relkind); diff --git a/src/gausskernel/optimizer/rewrite/rewriteHandler.cpp b/src/gausskernel/optimizer/rewrite/rewriteHandler.cpp index 2e410e90c..2597d13c2 100644 --- a/src/gausskernel/optimizer/rewrite/rewriteHandler.cpp +++ b/src/gausskernel/optimizer/rewrite/rewriteHandler.cpp @@ -73,7 +73,7 @@ static TargetEntry* process_matched_tle(TargetEntry* src_tle, TargetEntry* prior static Node* get_assignment_input(Node* node); static void rewriteValuesRTE(RangeTblEntry* rte, Relation target_relation, List* attrnos); static void rewriteTargetListUD(Query* parsetree, RangeTblEntry* target_rte, Relation target_relation); -static void markQueryForLocking(Query* qry, Node* jtnode, bool forUpdate, bool noWait, bool pushedDown); +static void markQueryForLocking(Query* qry, Node* jtnode, LockClauseStrength strength, bool noWait, bool pushedDown); static List* matchLocks(CmdType event, RuleLock* rulelocks, int varno, Query* parsetree); static Query* fireRIRrules(Query* parsetree, List* activeRIRs, bool forUpdatePushedDown); @@ -152,7 +152,7 @@ void AcquireRewriteLocks(Query* parsetree, bool forUpdatePushedDown) * * If the relation is the query's result relation, then we * need RowExclusiveLock. Otherwise, check to see if the - * relation is accessed FOR UPDATE/SHARE or not. We can't + * relation is accessed FOR [KEY] UPDATE/SHARE or not. We can't * just grab AccessShareLock because then the executor would * be trying to upgrade the lock, leading to possible * deadlocks. @@ -1555,7 +1555,7 @@ static Query* ApplyRetrieveRule(Query* parsetree, RewriteRule* rule, int rt_inde } /* - * If FOR UPDATE/SHARE of view, be sure we get right initial lock on the + * If FOR [KEY] UPDATE/SHARE of view, be sure we get right initial lock on the * relations it references. */ rc = get_parse_rowmark(parsetree, rt_index); @@ -1607,21 +1607,21 @@ static Query* ApplyRetrieveRule(Query* parsetree, RewriteRule* rule, int rt_inde rte->extraUpdatedCols = NULL; /* - * If FOR UPDATE/SHARE of view, mark all the contained tables as implicit - * FOR UPDATE/SHARE, the same as the parser would have done if the view's + * If FOR [KEY] UPDATE/SHARE of view, mark all the contained tables as implicit + * FOR [KEY] UPDATE/SHARE, the same as the parser would have done if the view's * subquery had been written out explicitly. * * Note: we don't consider forUpdatePushedDown here; such marks will be * made by recursing from the upper level in markQueryForLocking. 
 */
    if (rc != NULL)
-        markQueryForLocking(rule_action, (Node*)rule_action->jointree, rc->forUpdate, rc->noWait, true);
+        markQueryForLocking(rule_action, (Node*)rule_action->jointree, rc->strength, rc->noWait, true);

    return parsetree;
}

/*
- * Recursively mark all relations used by a view as FOR UPDATE/SHARE.
+ * Recursively mark all relations used by a view as FOR [KEY] UPDATE/SHARE.
 *
 * This may generate an invalid query, eg if some sub-query uses an
 * aggregate. We leave it to the planner to detect that.
@@ -1631,7 +1631,7 @@ static Query* ApplyRetrieveRule(Query* parsetree, RewriteRule* rule, int rt_inde
 * OLD and NEW rels for updating. The best way to handle that seems to be
 * to scan the jointree to determine which rels are used.
 */
-static void markQueryForLocking(Query* qry, Node* jtnode, bool forUpdate, bool noWait, bool pushedDown)
+static void markQueryForLocking(Query* qry, Node* jtnode, LockClauseStrength strength, bool noWait, bool pushedDown)
{
    if (jtnode == NULL)
        return;
@@ -1640,12 +1640,12 @@ static void markQueryForLocking(Query* qry, Node* jtnode, bool forUpdate, bool n
        RangeTblEntry* rte = rt_fetch(rti, qry->rtable);

        if (rte->rtekind == RTE_RELATION) {
-            applyLockingClause(qry, rti, forUpdate, noWait, pushedDown);
+            applyLockingClause(qry, rti, strength, noWait, pushedDown);
            rte->requiredPerms |= ACL_SELECT_FOR_UPDATE;
        } else if (rte->rtekind == RTE_SUBQUERY) {
-            applyLockingClause(qry, rti, forUpdate, noWait, pushedDown);
-            /* FOR UPDATE/SHARE of subquery is propagated to subquery's rels */
-            markQueryForLocking(rte->subquery, (Node*)rte->subquery->jointree, forUpdate, noWait, true);
+            applyLockingClause(qry, rti, strength, noWait, pushedDown);
+            /* FOR [KEY] UPDATE/SHARE of subquery is propagated to subquery's rels */
+            markQueryForLocking(rte->subquery, (Node*)rte->subquery->jointree, strength, noWait, true);
        }
        /* other RTE types are unaffected by FOR UPDATE */
    } else if (IsA(jtnode, FromExpr)) {
        FromExpr* f = (FromExpr*)jtnode;
        ListCell* l = NULL;

        foreach (l, f->fromlist)
-            markQueryForLocking(qry, (Node*)lfirst(l), forUpdate, noWait, pushedDown);
+            markQueryForLocking(qry, (Node*)lfirst(l), strength, noWait, pushedDown);
    } else if (IsA(jtnode, JoinExpr)) {
        JoinExpr* j = (JoinExpr*)jtnode;

-        markQueryForLocking(qry, j->larg, forUpdate, noWait, pushedDown);
-        markQueryForLocking(qry, j->rarg, forUpdate, noWait, pushedDown);
+        markQueryForLocking(qry, j->larg, strength, noWait, pushedDown);
+        markQueryForLocking(qry, j->rarg, strength, noWait, pushedDown);
    } else
        ereport(ERROR,
            (errmodule(MOD_OPT_REWRITE),
diff --git a/src/gausskernel/process/postmaster/autovacuum.cpp b/src/gausskernel/process/postmaster/autovacuum.cpp
index 1e4b92d17..84c9a3751 100755
--- a/src/gausskernel/process/postmaster/autovacuum.cpp
+++ b/src/gausskernel/process/postmaster/autovacuum.cpp
@@ -81,6 +81,7 @@
 #include "access/twophase.h"
 #include "access/transam.h"
 #include "access/xact.h"
+#include "access/multixact.h"
 #include "catalog/dependency.h"
 #include "catalog/namespace.h"
 #include "catalog/pg_database.h"
@@ -137,6 +138,7 @@ typedef struct avw_dbase {
     Oid adw_datid;
     char* adw_name;
     TransactionId adw_frozenxid;
+    MultiXactId adw_frozenmulti;
     PgStat_StatDBEntry* adw_entry;
 } avw_dbase;

@@ -863,7 +865,11 @@ static Oid do_start_worker(void)
     List* dblist = NIL;
     ListCell* cell = NULL;
     TransactionId xidForceLimit;
+#ifndef ENABLE_MULTIPLE_NODES
+    MultiXactId multiForceLimit;
+#endif
     bool for_xid_wrap = false;
+    bool 
for_multi_wrap = false; avw_dbase* avdb = NULL; TimestampTz current_time; bool skipit = false; @@ -906,6 +912,16 @@ static Oid do_start_worker(void) else xidForceLimit = FirstNormalTransactionId; +#ifndef ENABLE_MULTIPLE_NODES + /* Also determine the oldest datminmxid we will consider. */ + t_thrd.autovacuum_cxt.recentMulti = ReadNextMultiXactId(); + if (t_thrd.autovacuum_cxt.recentMulti > + FirstMultiXactId + g_instance.attr.attr_storage.autovacuum_freeze_max_age) + multiForceLimit = t_thrd.autovacuum_cxt.recentMulti - g_instance.attr.attr_storage.autovacuum_freeze_max_age; + else + multiForceLimit = FirstMultiXactId; +#endif + /* * Choose a database to connect to. We pick the database that was least * recently auto-vacuumed, or one that needs vacuuming to recycle clog. @@ -923,6 +939,7 @@ static Oid do_start_worker(void) */ avdb = NULL; for_xid_wrap = false; + for_multi_wrap = false; current_time = GetCurrentTimestamp(); foreach (cell, dblist) { avw_dbase* tmp = (avw_dbase*)lfirst(cell); @@ -936,6 +953,15 @@ static Oid do_start_worker(void) continue; } else if (for_xid_wrap) continue; /* ignore not-at-risk DBs */ +#ifndef ENABLE_MULTIPLE_NODES + else if (MultiXactIdPrecedes(tmp->adw_frozenmulti, multiForceLimit)) { + if (avdb == NULL || MultiXactIdPrecedes(tmp->adw_frozenmulti, avdb->adw_frozenmulti)) + avdb = tmp; + for_multi_wrap = true; + continue; + } else if (for_multi_wrap) + continue; /* ignore not-at-risk DBs */ +#endif /* Find pgstat entry if any */ tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid); @@ -1359,6 +1385,7 @@ NON_EXEC_STATIC void AutoVacWorkerMain() /* And do an appropriate amount of work */ t_thrd.autovacuum_cxt.recentXid = ReadNewTransactionId(); + t_thrd.autovacuum_cxt.recentMulti = ReadNextMultiXactId(); do_autovacuum(); } @@ -1576,7 +1603,11 @@ static List* get_database_list(void) datfrozenxid = FirstNormalTransactionId; } else datfrozenxid = DatumGetTransactionId(xid64datum); - +#ifndef ENABLE_MULTIPLE_NODES + Datum mxidDatum = heap_getattr(tup, Anum_pg_database_datminmxid, RelationGetDescr(rel), &isNull); + MultiXactId datminmxid = isNull ? FirstMultiXactId : DatumGetTransactionId(mxidDatum); + avdb->adw_frozenmulti = datminmxid; +#endif avdb->adw_frozenxid = datfrozenxid; /* this gets set later: */ avdb->adw_entry = NULL; @@ -2873,7 +2904,7 @@ static autovac_table* table_recheck_autovac( */ static void determine_vacuum_params(float4& vac_scale_factor, int& vac_base_thresh, float4& anl_scale_factor, int& anl_base_thresh, int64& freeze_max_age, bool& av_enabled, TransactionId& xidForceLimit, - const AutoVacOpts* relopts) + MultiXactId& multiForceLimit, const AutoVacOpts* relopts) { /* -1 in autovac setting means use plain vacuum_cost_delay */ vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0) ? 
relopts->vacuum_scale_factor @@ -2899,6 +2930,15 @@ static void determine_vacuum_params(float4& vac_scale_factor, int& vac_base_thre xidForceLimit = t_thrd.autovacuum_cxt.recentXid - freeze_max_age; else xidForceLimit = FirstNormalTransactionId; + +#ifndef ENABLE_MULTIPLE_NODES + if (t_thrd.autovacuum_cxt.recentMulti > + FirstMultiXactId + g_instance.attr.attr_storage.autovacuum_freeze_max_age) + multiForceLimit = t_thrd.autovacuum_cxt.recentMulti - + g_instance.attr.attr_storage.autovacuum_freeze_max_age; + else + multiForceLimit = FirstMultiXactId; +#endif } /* @@ -2968,18 +3008,18 @@ static void relation_needs_vacanalyze(Oid relid, AutoVacOpts* relopts, Form_pg_c /* freeze parameters */ int64 freeze_max_age = 0; TransactionId xidForceLimit = InvalidTransactionId; + MultiXactId multiForceLimit = InvalidMultiXactId; AssertArg(classForm != NULL); AssertArg(OidIsValid(relid)); determine_vacuum_params(vac_scale_factor, vac_base_thresh, anl_scale_factor, anl_base_thresh, freeze_max_age, - av_enabled, xidForceLimit, relopts); + av_enabled, xidForceLimit, multiForceLimit, relopts); bool isNull = false; TransactionId relfrozenxid = InvalidTransactionId; Relation rel = heap_open(RelationRelationId, AccessShareLock); Datum xid64datum = heap_getattr(tuple, Anum_pg_class_relfrozenxid64, RelationGetDescr(rel), &isNull); - heap_close(rel, AccessShareLock); if (isNull) { relfrozenxid = classForm->relfrozenxid; @@ -2993,6 +3033,14 @@ static void relation_needs_vacanalyze(Oid relid, AutoVacOpts* relopts, Form_pg_c } force_vacuum = (TransactionIdIsNormal(relfrozenxid) && TransactionIdPrecedes(relfrozenxid, xidForceLimit)); +#ifndef ENABLE_MULTIPLE_NODES + if (!force_vacuum) { + Datum mxidDatum = heap_getattr(tuple, Anum_pg_class_relminmxid, RelationGetDescr(rel), &isNull); + MultiXactId relminmxid = isNull ? FirstMultiXactId : DatumGetTransactionId(mxidDatum); + force_vacuum = (MultiXactIdIsValid(relminmxid) && MultiXactIdPrecedes(relminmxid, multiForceLimit)); + } +#endif + heap_close(rel, AccessShareLock); *need_freeze = force_vacuum; AUTOVAC_LOG(DEBUG2, "vac \"%s\": need freeze is %s", NameStr(classForm->relname), force_vacuum ? 
"true" : "false"); @@ -3366,6 +3414,7 @@ static void partition_needs_vacanalyze(Oid partid, AutoVacOpts* relopts, Form_pg /* freeze parameters */ int64 freeze_max_age = 0; TransactionId xidForceLimit = InvalidTransactionId; + MultiXactId multiForceLimit = InvalidMultiXactId; char* relname = NULL; Oid nameSpaceOid = InvalidOid; @@ -3373,7 +3422,7 @@ static void partition_needs_vacanalyze(Oid partid, AutoVacOpts* relopts, Form_pg AssertArg(partForm != NULL && OidIsValid(partid)); determine_vacuum_params(vac_scale_factor, vac_base_thresh, anl_scale_factor, anl_base_thresh, freeze_max_age, - av_enabled, xidForceLimit, relopts); + av_enabled, xidForceLimit, multiForceLimit, relopts); /* Force vacuum if table need freeze the old tuple to recycle clog */ if (t_thrd.autovacuum_cxt.recentXid > FirstNormalTransactionId + freeze_max_age) xidForceLimit = t_thrd.autovacuum_cxt.recentXid - freeze_max_age; @@ -3384,7 +3433,6 @@ static void partition_needs_vacanalyze(Oid partid, AutoVacOpts* relopts, Form_pg TransactionId relfrozenxid = InvalidTransactionId; Relation rel = heap_open(PartitionRelationId, AccessShareLock); Datum xid64datum = heap_getattr(partTuple, Anum_pg_partition_relfrozenxid64, RelationGetDescr(rel), &isNull); - heap_close(rel, AccessShareLock); if (isNull) { relfrozenxid = partForm->relfrozenxid; @@ -3397,6 +3445,15 @@ static void partition_needs_vacanalyze(Oid partid, AutoVacOpts* relopts, Form_pg relfrozenxid = DatumGetTransactionId(xid64datum); } +#ifndef ENABLE_MULTIPLE_NODES + if (!force_vacuum) { + Datum mxidDatum = heap_getattr(partTuple, Anum_pg_partition_relminmxid, RelationGetDescr(rel), &isNull); + MultiXactId relminmxid = isNull ? FirstMultiXactId : DatumGetTransactionId(mxidDatum); + force_vacuum = (MultiXactIdIsValid(relminmxid) && MultiXactIdPrecedes(relminmxid, multiForceLimit)); + } +#endif + heap_close(rel, AccessShareLock); + force_vacuum = (TransactionIdIsNormal(relfrozenxid) && TransactionIdPrecedes(relfrozenxid, xidForceLimit)); *need_freeze = force_vacuum; diff --git a/src/gausskernel/process/tcop/utility.cpp b/src/gausskernel/process/tcop/utility.cpp index c7b7016cb..d314a1450 100755 --- a/src/gausskernel/process/tcop/utility.cpp +++ b/src/gausskernel/process/tcop/utility.cpp @@ -343,7 +343,7 @@ bool CommandIsReadOnly(Node* parse_tree) switch (stmt->commandType) { case CMD_SELECT: if (stmt->rowMarks != NIL) - return false; /* SELECT FOR UPDATE/SHARE */ + return false; /* SELECT FOR [KEY] UPDATE/SHARE */ else if (stmt->hasModifyingCTE) return false; /* data-modifying CTE */ else @@ -8522,10 +8522,23 @@ const char* CreateCommandTag(Node* parse_tree) tag = "DECLARE CURSOR"; } else if (stmt->rowMarks != NIL) { /* not 100% but probably close enough */ - if (((PlanRowMark*)linitial(stmt->rowMarks))->markType == ROW_MARK_EXCLUSIVE) - tag = "SELECT FOR UPDATE"; - else - tag = "SELECT FOR SHARE"; + switch (((PlanRowMark *)linitial(stmt->rowMarks))->markType) { + case ROW_MARK_EXCLUSIVE: + tag = "SELECT FOR UPDATE"; + break; + case ROW_MARK_NOKEYEXCLUSIVE: + tag = "SELECT FOR NO KEY UPDATE"; + break; + case ROW_MARK_SHARE: + tag = "SELECT FOR SHARE"; + break; + case ROW_MARK_KEYSHARE: + tag = "SELECT FOR KEY SHARE"; + break; + default: + tag = "SELECT"; + break; + } } else tag = "SELECT"; break; @@ -8566,10 +8579,23 @@ const char* CreateCommandTag(Node* parse_tree) tag = "DECLARE CURSOR"; } else if (stmt->rowMarks != NIL) { /* not 100% but probably close enough */ - if (((RowMarkClause*)linitial(stmt->rowMarks))->forUpdate) - tag = "SELECT FOR UPDATE"; - else - tag = 
"SELECT FOR SHARE"; + switch (((RowMarkClause *)linitial(stmt->rowMarks))->strength) { + case LCS_FORKEYSHARE: + tag = "SELECT FOR KEY SHARE"; + break; + case LCS_FORSHARE: + tag = "SELECT FOR SHARE"; + break; + case LCS_FORNOKEYUPDATE: + tag = "SELECT FOR NO KEY UPDATE"; + break; + case LCS_FORUPDATE: + tag = "SELECT FOR UPDATE"; + break; + default: + tag = "?\?\?"; + break; + } } else tag = "SELECT"; break; diff --git a/src/gausskernel/runtime/executor/execMain.cpp b/src/gausskernel/runtime/executor/execMain.cpp index 8092904a3..eeaee0331 100755 --- a/src/gausskernel/runtime/executor/execMain.cpp +++ b/src/gausskernel/runtime/executor/execMain.cpp @@ -317,7 +317,7 @@ void standard_ExecutorStart(QueryDesc *queryDesc, int eflags) switch (queryDesc->operation) { case CMD_SELECT: /* - * SELECT FOR UPDATE/SHARE and modifying CTEs need to mark tuples + * SELECT FOR [KEY] UPDATE/SHARE and modifying CTEs need to mark tuples */ if (queryDesc->plannedstmt->rowMarks != NIL || queryDesc->plannedstmt->hasModifyingCTE) { estate->es_output_cid = GetCurrentCommandId(true); @@ -1241,7 +1241,7 @@ void InitPlan(QueryDesc *queryDesc, int eflags) } /* - * Similarly, we have to lock relations selected FOR UPDATE/FOR SHARE + * Similarly, we have to lock relations selected FOR [KEY] UPDATE/SHARE * before we initialize the plan tree, else we'd be risking lock upgrades. * While we are at it, build the ExecRowMark list. */ @@ -1264,7 +1264,9 @@ void InitPlan(QueryDesc *queryDesc, int eflags) */ switch (rc->markType) { case ROW_MARK_EXCLUSIVE: + case ROW_MARK_NOKEYEXCLUSIVE: case ROW_MARK_SHARE: + case ROW_MARK_KEYSHARE: if (IS_PGXC_COORDINATOR || u_sess->pgxc_cxt.PGXCNodeId < 0 || bms_is_member(u_sess->pgxc_cxt.PGXCNodeId, rc->bms_nodeids)) { relid = getrelid(rc->rti, rangeTable); @@ -1979,7 +1981,7 @@ static void ExecEndPlan(PlanState *planstate, EState *estate) } /* - * close any relations selected FOR UPDATE/FOR SHARE, again keeping locks + * close any relations selected FOR [KEY] UPDATE/SHARE, again keeping locks */ foreach (l, estate->es_rowMarks) { ExecRowMark *erm = (ExecRowMark *)lfirst(l); @@ -2773,6 +2775,7 @@ TupleTableSlot *EvalPlanQualUHeap(EState *estate, EPQState *epqstate, Relation r * epqstate - state for EvalPlanQual rechecking * relation - table containing tuple * rti - rangetable index of table containing tuple + * lockmode - requested tuple lock mode * *tid - t_ctid from the outdated tuple (ie, next updated version) * priorXmax - t_xmax from the outdated tuple * @@ -2781,9 +2784,12 @@ TupleTableSlot *EvalPlanQualUHeap(EState *estate, EPQState *epqstate, Relation r * * Returns a slot containing the new candidate update/delete tuple, or * NULL if we determine we shouldn't process the row. + * + * Note: properly, lockmode should be declared as enum LockTupleMode, + * but we use "int" to avoid having to include heapam.h in executor.h. */ -TupleTableSlot *EvalPlanQual(EState *estate, EPQState *epqstate, Relation relation, Index rti, ItemPointer tid, - TransactionId priorXmax, bool partRowMoveUpdate) +TupleTableSlot *EvalPlanQual(EState *estate, EPQState *epqstate, Relation relation, Index rti, int lockmode, + ItemPointer tid, TransactionId priorXmax, bool partRowMoveUpdate) { TupleTableSlot *slot = NULL; Tuple copyTuple; @@ -2793,7 +2799,7 @@ TupleTableSlot *EvalPlanQual(EState *estate, EPQState *epqstate, Relation relati /* * Get and lock the updated version of the row; if fail, return NULL. 
 */
-    copyTuple = tableam_tuple_lock_updated(estate->es_output_cid, relation, LockTupleExclusive, tid, priorXmax,
+    copyTuple = tableam_tuple_lock_updated(estate->es_output_cid, relation, lockmode, tid, priorXmax,
        estate->es_snapshot);

    if (copyTuple == NULL) {
@@ -3073,7 +3079,7 @@ HeapTuple heap_lock_updated(CommandId cid, Relation relation, int lockmode, Item
        /* updated, so look at the updated row */
        tuple.t_self = tuple.t_data->t_ctid;
        /* updated row should have xmin matching this xmax */
-        priorXmax = HeapTupleGetRawXmax(&tuple);
+        priorXmax = HeapTupleGetUpdateXid(&tuple);
        ReleaseBuffer(buffer);
        /* loop back to fetch next in chain */
    }
diff --git a/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp b/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp
index b337b817a..833f532cc 100644
--- a/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp
+++ b/src/gausskernel/runtime/executor/nodeBitmapHeapscan.cpp
@@ -835,7 +835,8 @@ BitmapHeapScanState* ExecInitBitmapHeapScan(BitmapHeapScan* node, EState* estate
     * occurred after taking the snapshot. Skip for explain only commands.
     */
    if (isUstoreRel && !(eflags & EXEC_FLAG_EXPLAIN_ONLY)) {
-        TransactionId relfrozenxid64 = getPartitionRelfrozenxid(partitiontrel);
+        TransactionId relfrozenxid64 = InvalidTransactionId;
+        getPartitionRelxids(partitiontrel, &relfrozenxid64);
        if (TransactionIdPrecedes(FirstNormalTransactionId, scanSnap->xmax) &&
            !TransactionIdIsCurrentTransactionId(relfrozenxid64) &&
            TransactionIdPrecedes(scanSnap->xmax, relfrozenxid64)) {
@@ -859,7 +860,8 @@ BitmapHeapScanState* ExecInitBitmapHeapScan(BitmapHeapScan* node, EState* estate
     * occurred after taking the snapshot. Skip for explain only commands.
     */
    if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY)) {
-        TransactionId relfrozenxid64 = getRelationRelfrozenxid(currentRelation);
+        TransactionId relfrozenxid64 = InvalidTransactionId;
+        getRelationRelxids(currentRelation, &relfrozenxid64);
        if (TransactionIdPrecedes(FirstNormalTransactionId, scanSnap->xmax) &&
            !TransactionIdIsCurrentTransactionId(relfrozenxid64) &&
            TransactionIdPrecedes(scanSnap->xmax, relfrozenxid64)) {
diff --git a/src/gausskernel/runtime/executor/nodeIndexonlyscan.cpp b/src/gausskernel/runtime/executor/nodeIndexonlyscan.cpp
index 9d9e57b62..7e84aa38b 100644
--- a/src/gausskernel/runtime/executor/nodeIndexonlyscan.cpp
+++ b/src/gausskernel/runtime/executor/nodeIndexonlyscan.cpp
@@ -711,7 +711,8 @@ IndexOnlyScanState* ExecInitIndexOnlyScan(IndexOnlyScan* node, EState* estate, i
     * occurred after taking the snapshot.
     */
    if (RelationIsUstoreFormat(indexstate->ss.ss_currentPartition)) {
-        TransactionId relfrozenxid64 = getPartitionRelfrozenxid(indexstate->ss.ss_currentPartition);
+        TransactionId relfrozenxid64 = InvalidTransactionId;
+        getPartitionRelxids(indexstate->ss.ss_currentPartition, &relfrozenxid64);
        if (TransactionIdPrecedes(FirstNormalTransactionId, scanSnap->xmax) &&
            !TransactionIdIsCurrentTransactionId(relfrozenxid64) &&
            TransactionIdPrecedes(scanSnap->xmax, relfrozenxid64)) {
@@ -735,7 +736,8 @@ IndexOnlyScanState* ExecInitIndexOnlyScan(IndexOnlyScan* node, EState* estate, i
     * occurred after taking the snapshot. 
 */
    if (RelationIsUstoreFormat(currentRelation)) {
-        TransactionId relfrozenxid64 = getRelationRelfrozenxid(currentRelation);
+        TransactionId relfrozenxid64 = InvalidTransactionId;
+        getRelationRelxids(currentRelation, &relfrozenxid64);
        if (TransactionIdPrecedes(FirstNormalTransactionId, scanSnap->xmax) &&
            !TransactionIdIsCurrentTransactionId(relfrozenxid64) &&
            TransactionIdPrecedes(scanSnap->xmax, relfrozenxid64)) {
diff --git a/src/gausskernel/runtime/executor/nodeIndexscan.cpp b/src/gausskernel/runtime/executor/nodeIndexscan.cpp
index a4b379a21..2422050c0 100644
--- a/src/gausskernel/runtime/executor/nodeIndexscan.cpp
+++ b/src/gausskernel/runtime/executor/nodeIndexscan.cpp
@@ -708,7 +708,8 @@ IndexScanState* ExecInitIndexScan(IndexScan* node, EState* estate, int eflags)
     * occurred after taking the snapshot.
     */
    if (RelationIsUstoreFormat(index_state->ss.ss_currentPartition)) {
-        TransactionId relfrozenxid64 = getPartitionRelfrozenxid(index_state->ss.ss_currentPartition);
+        TransactionId relfrozenxid64 = InvalidTransactionId;
+        getPartitionRelxids(index_state->ss.ss_currentPartition, &relfrozenxid64);
        if (TransactionIdPrecedes(FirstNormalTransactionId, scanSnap->xmax) &&
            !TransactionIdIsCurrentTransactionId(relfrozenxid64) &&
            TransactionIdPrecedes(scanSnap->xmax, relfrozenxid64)) {
@@ -733,7 +734,8 @@ IndexScanState* ExecInitIndexScan(IndexScan* node, EState* estate, int eflags)
     * occurred after taking the snapshot.
     */
    if (RelationIsUstoreFormat(current_relation)) {
-        TransactionId relfrozenxid64 = getRelationRelfrozenxid(current_relation);
+        TransactionId relfrozenxid64 = InvalidTransactionId;
+        getRelationRelxids(current_relation, &relfrozenxid64);
        if (TransactionIdPrecedes(FirstNormalTransactionId, scanSnap->xmax) &&
            !TransactionIdIsCurrentTransactionId(relfrozenxid64) &&
            TransactionIdPrecedes(scanSnap->xmax, relfrozenxid64)) {
diff --git a/src/gausskernel/runtime/executor/nodeLockRows.cpp b/src/gausskernel/runtime/executor/nodeLockRows.cpp
index 9656a3c64..75487e2f4 100755
--- a/src/gausskernel/runtime/executor/nodeLockRows.cpp
+++ b/src/gausskernel/runtime/executor/nodeLockRows.cpp
@@ -181,15 +181,34 @@ lnext:
            searchHBucketFakeRelation(estate->esfRelations, estate->es_query_cxt, target_rel, bucket_id, bucket_rel);
        }

        /* okay, try to lock the tuple */
-        if (erm->markType == ROW_MARK_EXCLUSIVE)
-            lock_mode = LockTupleExclusive;
-        else
-            lock_mode = LockTupleShared;
+        switch (erm->markType) {
+            case ROW_MARK_EXCLUSIVE:
+                lock_mode = LockTupleExclusive;
+                break;
+            case ROW_MARK_NOKEYEXCLUSIVE:
+                lock_mode = LockTupleNoKeyExclusive;
+                break;
+            case ROW_MARK_SHARE:
+                lock_mode = LockTupleShared;
+                break;
+            case ROW_MARK_KEYSHARE:
+                lock_mode = LockTupleKeyShare;
+                break;
+            default:
+                elog(ERROR, "unsupported rowmark type");
+                lock_mode = LockTupleNoKeyExclusive; /* keep compiler quiet */
+                break;
+        }

        /* Need to merge the ustore logic with AM logic */
        test = tableam_tuple_lock(bucket_rel, &tuple, &buffer, estate->es_output_cid, lock_mode, erm->noWait, &tmfd,
+#ifdef ENABLE_MULTIPLE_NODES
            false, false, false, estate->es_snapshot, NULL, true);
+#else
+            false, true, false, estate->es_snapshot, NULL, true);
+#endif
+
        ReleaseBuffer(buffer);

        switch (test) {
@@ -262,8 +281,6 @@ lnext:
                        errmsg("could not serialize access due to concurrent update")));

                /* Tuple was deleted, so don't return it */
-                Assert(ItemPointerEquals(&tmfd.ctid, &tuple.t_self));
-
                if (rowMovement) {
                    /*
                     * there may be a row movement update action which deletes the tuple from the original
diff --git 
a/src/gausskernel/runtime/executor/nodeModifyTable.cpp b/src/gausskernel/runtime/executor/nodeModifyTable.cpp index 418534135..b0a9ba78b 100644 --- a/src/gausskernel/runtime/executor/nodeModifyTable.cpp +++ b/src/gausskernel/runtime/executor/nodeModifyTable.cpp @@ -1315,7 +1315,7 @@ ldelete: errmsg("concurrent update under Stream mode is not yet supported"))); } TupleTableSlot *epqslot = EvalPlanQual(estate, epqstate, fake_relation, - result_rel_info->ri_RangeTableIndex, &tmfd.ctid, tmfd.xmax, false); + result_rel_info->ri_RangeTableIndex, LockTupleExclusive, &tmfd.ctid, tmfd.xmax, false); if (!TupIsNull(epqslot)) { *tupleid = tmfd.ctid; goto ldelete; @@ -1652,6 +1652,7 @@ TupleTableSlot* ExecUpdate(ItemPointer tupleid, tuple = tableam_tslot_get_tuple_from_slot(result_relation_desc, slot); } else { bool update_indexes = false; + LockTupleMode lockmode; /* * Compute stored generated columns @@ -1721,7 +1722,8 @@ lreplace: /* add para 2 for heap_update */ result = tableam_tuple_update(fake_relation, parent_relation, tupleid, tuple, estate->es_output_cid, estate->es_crosscheck_snapshot, estate->es_snapshot, true, // wait for commit - &oldslot, &tmfd, &update_indexes, &modifiedIdxAttrs, allow_update_self, allowInplaceUpdate); + &oldslot, &tmfd, &update_indexes, &modifiedIdxAttrs, allow_update_self, + allowInplaceUpdate, &lockmode); switch (result) { case TM_SelfUpdated: case TM_SelfModified: @@ -1794,7 +1796,7 @@ lreplace: } TupleTableSlot *epq_slot = EvalPlanQual(estate, epqstate, fake_relation, - result_rel_info->ri_RangeTableIndex, &tmfd.ctid, tmfd.xmax, false); + result_rel_info->ri_RangeTableIndex, lockmode, &tmfd.ctid, tmfd.xmax, false); if (!TupIsNull(epq_slot)) { *tupleid = tmfd.ctid; @@ -1977,7 +1979,8 @@ lreplace: &update_indexes, &modifiedIdxAttrs, allow_update_self, - allowInplaceUpdate); + allowInplaceUpdate, + &lockmode); switch (result) { case TM_SelfUpdated: case TM_SelfModified: @@ -2036,7 +2039,7 @@ lreplace: } TupleTableSlot *epq_slot = EvalPlanQual(estate, epqstate, fake_relation, - result_rel_info->ri_RangeTableIndex, &tmfd.ctid, tmfd.xmax, + result_rel_info->ri_RangeTableIndex, lockmode, &tmfd.ctid, tmfd.xmax, result_relation_desc->rd_rel->relrowmovement); if (!TupIsNull(epq_slot)) { @@ -2220,6 +2223,7 @@ ldelete: epqstate, old_fake_relation, result_rel_info->ri_RangeTableIndex, + LockTupleExclusive, &tmfd.ctid, tmfd.xmax, result_relation_desc->rd_rel->relrowmovement); diff --git a/src/gausskernel/runtime/executor/nodeSeqscan.cpp b/src/gausskernel/runtime/executor/nodeSeqscan.cpp index 400cc8b64..3594ce864 100644 --- a/src/gausskernel/runtime/executor/nodeSeqscan.cpp +++ b/src/gausskernel/runtime/executor/nodeSeqscan.cpp @@ -423,7 +423,8 @@ void InitScanRelation(SeqScanState* node, EState* estate, int eflags) if (!node->isPartTbl) { /* add qual for redis */ - TransactionId relfrozenxid64 = getRelationRelfrozenxid(current_relation); + TransactionId relfrozenxid64 = InvalidTransactionId; + getRelationRelxids(current_relation, &relfrozenxid64); current_scan_desc = BeginScanRelation(node, current_relation, relfrozenxid64, eflags); } else { plan = (SeqScan*)node->ps.plan; @@ -491,7 +492,8 @@ void InitScanRelation(SeqScanState* node, EState* estate, int eflags) node->ss_currentPartition = current_part_rel; /* add qual for redis */ - TransactionId relfrozenxid64 = getPartitionRelfrozenxid(current_part_rel); + TransactionId relfrozenxid64 = InvalidTransactionId; + getPartitionRelxids(current_part_rel, &relfrozenxid64); current_scan_desc = BeginScanRelation(node, current_part_rel, 
relfrozenxid64, eflags);
        } else {
            node->ss_currentPartition = NULL;
diff --git a/src/gausskernel/storage/access/common/reloptions.cpp b/src/gausskernel/storage/access/common/reloptions.cpp
index 460c57337..a1d80c4ef 100644
--- a/src/gausskernel/storage/access/common/reloptions.cpp
+++ b/src/gausskernel/storage/access/common/reloptions.cpp
@@ -2863,3 +2863,11 @@ bool is_contain_crossbucket(List *defList)

    return false;
}
+
+bool is_cstore_option(char relkind, Datum reloptions)
+{
+    StdRdOptions* std_opt = (StdRdOptions*)heap_reloptions(relkind, reloptions, false);
+    bool result = std_opt != NULL && pg_strcasecmp(ORIENTATION_COLUMN,
+        StdRdOptionsGetStringData(std_opt, orientation, ORIENTATION_ROW)) == 0;
+    pfree_ext(std_opt);
+    return result;
+}
diff --git a/src/gausskernel/storage/access/heap/README.tuplock b/src/gausskernel/storage/access/heap/README.tuplock
new file mode 100644
index 000000000..247a41e6a
--- /dev/null
+++ b/src/gausskernel/storage/access/heap/README.tuplock
@@ -0,0 +1,132 @@
+Locking tuples
+--------------
+
+Locking tuples is not as easy as locking tables or other database objects.
+The problem is that transactions might want to lock large numbers of tuples at
+any one time, so it's not possible to keep the lock objects in shared memory.
+To work around this limitation, we use a two-level mechanism.  The first level
+is implemented by storing locking information in the tuple header: a tuple is
+marked as locked by setting the current transaction's XID as its XMAX, and
+setting additional infomask bits to distinguish this case from the more normal
+case of having deleted the tuple.  When multiple transactions concurrently
+lock a tuple, a MultiXact is used; see below.  This mechanism can accommodate
+arbitrarily large numbers of tuples being locked simultaneously.
+
+When it is necessary to wait for a tuple-level lock to be released, the basic
+delay is provided by XactLockTableWait or MultiXactIdWait on the contents of
+the tuple's XMAX.  However, that mechanism will release all waiters
+concurrently, so there would be a race condition as to which waiter gets the
+tuple, potentially leading to indefinite starvation of some waiters.  The
+possibility of share-locking makes the problem much worse --- a steady stream
+of share-lockers can easily block an exclusive locker forever.  To provide
+more reliable semantics about who gets a tuple-level lock first, we use the
+standard lock manager, which implements the second level mentioned above.  The
+protocol for waiting for a tuple-level lock is really
+
+     LockTuple()
+     XactLockTableWait()
+     mark tuple as locked by me
+     UnlockTuple()
+
+When there are multiple waiters, arbitration of who is to get the lock next
+is provided by LockTuple().  However, at most one tuple-level lock will
+be held or awaited per backend at any time, so we don't risk overflow
+of the lock table.  Note that incoming share-lockers are required to
+do LockTuple as well, if there is any conflict, to ensure that they don't
+starve out waiting exclusive-lockers.  However, if there is not any active
+conflict for a tuple, we don't incur any extra overhead.
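+
+For illustration only, the protocol above can be modeled by the toy fragment
+below.  All Toy* names are hypothetical stand-ins, not the kernel's real API;
+the real logic lives in heap_lock_tuple() and handles many more cases:
+
+    typedef unsigned long ToyTxnId;
+
+    /* hypothetical stand-ins for the lock manager and XID-wait primitives */
+    extern void ToyLockTuple(void *rel, void *tid);
+    extern void ToyUnlockTuple(void *rel, void *tid);
+    extern void ToyXactLockTableWait(ToyTxnId xid);
+
+    static void ToyAcquireTupleLock(void *rel, void *tid, ToyTxnId holder)
+    {
+        ToyLockTuple(rel, tid);       /* level 1: arbitrate among waiters */
+        ToyXactLockTableWait(holder); /* level 2: wait for the holder's XID */
+        /* ... set XMAX and lock infomask bits in the tuple header ... */
+        ToyUnlockTuple(rel, tid);     /* the tuple header now carries the lock */
+    }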
+
+We provide four levels of tuple locking strength: SELECT FOR KEY UPDATE is
+super-exclusive locking (used to delete tuples and more generally to update
+tuples modifying the values of the columns that make up the key of the tuple);
+SELECT FOR UPDATE is a standards-compliant exclusive lock; SELECT FOR SHARE
+implements shared locks; and finally SELECT FOR KEY SHARE is a super-weak mode
+that does not conflict with exclusive mode, but conflicts with SELECT FOR KEY
+UPDATE.  This last mode implements a mode just strong enough to implement RI
+checks, i.e. it ensures that tuples do not go away from under a check, without
+blocking other transactions that want to update the tuple without changing
+its key.
+
+The conflict table is:
+
+                  KEY UPDATE   UPDATE       SHARE        KEY SHARE
+KEY UPDATE        conflict     conflict     conflict     conflict
+UPDATE            conflict     conflict     conflict
+SHARE             conflict     conflict
+KEY SHARE         conflict
+
+When there is a single locker in a tuple, we can just store the locking info
+in the tuple itself.  We do this by storing the locker's Xid in XMAX, and
+setting infomask bits specifying the locking strength.  There is one exception
+here: since infomask space is limited, we do not provide a separate bit
+for SELECT FOR SHARE, so we have to use the extended info in a MultiXact in
+that case.  (The other cases, SELECT FOR UPDATE and SELECT FOR KEY SHARE, are
+presumably more commonly used due to being the standards-mandated locking
+mechanism, or heavily used by the RI code, so we want to provide fast paths
+for those.)
+
+MultiXacts
+----------
+
+A tuple header provides very limited space for storing information about tuple
+locking and updates: there is room only for a single Xid and a small number of
+infomask bits.  Whenever we need to store more than one lock, we replace the
+first locker's Xid with a new MultiXactId.  Each MultiXact provides extended
+locking data; it comprises an array of Xids plus some flags bits for each one.
+The flags are currently used to store the locking strength of each member
+transaction.  (The flags also distinguish a pure locker from an updater.)
+
+In earlier releases, a MultiXact always meant that the tuple was
+locked in shared mode by multiple transactions.  This is no longer the case; a
+MultiXact may contain an update or delete Xid.  (Keep in mind that tuple locks
+in a transaction do not conflict with other tuple locks in the same
+transaction, so it's possible to have otherwise conflicting locks in a
+MultiXact if they belong to the same transaction).
+
+Note that each lock is attributed to the subtransaction that acquires it.
+This means that a subtransaction that aborts is seen as though it releases the
+locks it acquired; concurrent transactions can then proceed without having to
+wait for the main transaction to finish.  It also means that a subtransaction
+can upgrade to a stronger lock level than an earlier transaction had, and if
+the subxact aborts, the earlier, weaker lock is kept.
+
+The possibility of having an update within a MultiXact means that they must
+persist across crashes and restarts: a future reader of the tuple needs to
+figure out whether the update committed or aborted.  So we have a requirement
+that pg_multixact needs to retain pages of its data until we're certain that
+the MultiXacts in them are no longer of interest.
+
+Infomask Bits
+-------------
+
+The following infomask bits are applicable:
+
+- HEAP_XMAX_INVALID
+  Any tuple with this bit set does not have a valid value stored in XMAX.
+ +- HEAP_XMAX_IS_MULTI + This bit is set if the tuple's Xmax is a MultiXactId (as opposed to a + regular TransactionId). + +- HEAP_XMAX_LOCK_ONLY + This bit lives in t_infomask2. This bit is set when the XMAX is a locker only; + that is, if it's a multixact, it does not contain an update among its members. + It's set when the XMAX is a plain Xid that locked the tuple, as well. + +- HEAP_XMAX_KEYSHR_LOCK +- HEAP_XMAX_EXCL_LOCK + These bits indicate the strength of the lock acquired; they are useful when + the XMAX is not a MultiXactId. If it's a multi, the info is to be found in + the member flags. If HEAP_XMAX_IS_MULTI is not set and HEAP_XMAX_LOCK_ONLY + is set, then one of these *must* be set as well. + Note there is no infomask bit for a SELECT FOR SHARE lock. Also there is no + separate bit for a SELECT FOR KEY UPDATE lock; this is implemented by the + HEAP_KEYS_UPDATED bit. + +- HEAP_KEYS_UPDATED + This bit lives in t_infomask2. If set, indicates that the XMAX updated + this tuple and changed the key values, or it deleted the tuple. + It's set regardless of whether the XMAX is a TransactionId or a MultiXactId. + +We currently never set the HEAP_XMAX_COMMITTED when the HEAP_XMAX_IS_MULTI bit +is set. diff --git a/src/gausskernel/storage/access/heap/heapam.cpp b/src/gausskernel/storage/access/heap/heapam.cpp index 3c30a6d80..e35f31353 100755 --- a/src/gausskernel/storage/access/heap/heapam.cpp +++ b/src/gausskernel/storage/access/heap/heapam.cpp @@ -121,10 +121,10 @@ static void HeapParallelscanStartblockInit(HeapScanDesc scan); static BlockNumber HeapParallelscanNextpage(HeapScanDesc scan); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, CommandId cid, int options); -static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, const ItemPointer from, Buffer newbuf, HeapTuple newtup, - HeapTuple old_key_tup, bool all_visible_cleared, bool new_all_visible_cleared); -static void HeapSatisfiesHOTUpdate(Relation relation, Bitmapset* hot_attrs, Bitmapset* id_attrs, bool* satisfies_hot, - bool* satisfies_id, HeapTuple oldtup, HeapTuple newtup, char* page); +static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, HeapTuple oldtup, Buffer newbuf, HeapTuple newtup, + HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared); +static void HeapSatisfiesHOTUpdate(Relation relation, Bitmapset* hot_attrs, Bitmapset *key_attrs, Bitmapset* id_attrs, + bool* satisfies_hot, bool *satisfies_key, bool* satisfies_id, HeapTuple oldtup, HeapTuple newtup, char* page); static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified, bool* copy); static void SkipToNewPage( HeapScanDesc scan, ScanDirection dir, BlockNumber page, bool* finished, bool* isValidRelationPage); @@ -191,10 +191,6 @@ static inline void InitScanBlocks(HeapScanDesc scan, RangeScanInRedis rangeScanI static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, uint32 flags, ParallelHeapScanDesc parallel_scan, RangeScanInRedis rangeScanInRedis = {false, 0, 0}); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, CommandId cid, int options); -static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, const ItemPointer from, Buffer newbuf, HeapTuple newtup, - HeapTuple old_key_tup, bool all_visible_cleared, bool new_all_visible_cleared); -static void HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs, Bitmapset *id_attrs, bool *satisfies_hot, - bool *satisfies_id, HeapTuple oldtup, 
HeapTuple newtup, char *page); static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified, bool *copy); static void SkipToNewPage(HeapScanDesc scan, ScanDirection dir, BlockNumber page, bool* finished, bool* is_valid_relation_page); @@ -202,7 +198,16 @@ static bool VerifyHeapGetTup(HeapScanDesc scan, ScanDirection dir); static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); extern void Start_Prefetch(TableScanDesc scan, SeqScanAccessor *pAccessor, ScanDirection dir); extern void vacuum_set_xid_limits(Relation rel, int64 freeze_min_age, int64 freeze_table_age, TransactionId *oldestXmin, - TransactionId *freezeLimit, TransactionId *freezeTableLimit); + TransactionId *freezeLimit, TransactionId *freezeTableLimit, MultiXactId* multiXactFrzLimit); +static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, TransactionId xid, + LockTupleMode mode); +static void ComputeNewXmaxInfomask(TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, + TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, + uint16 *result_infomask, uint16 *result_infomask2); + +static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2); +static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask, uint16 t_infomask2); +static bool DoesMultiXactIdConflict(MultiXactId multi, LockTupleMode lockmode); /* ---------------- * initscan - scan code common to heap_beginscan and heap_rescan @@ -2400,7 +2405,7 @@ bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, S Assert(ItemPointerGetBlockNumber(&heap_tuple->t_data->t_ctid) == ItemPointerGetBlockNumber(tid)); offnum = ItemPointerGetOffsetNumber(&heap_tuple->t_data->t_ctid); at_chain_start = false; - prev_xmax = HeapTupleGetRawXmax(heap_tuple); + prev_xmax = HeapTupleGetUpdateXid(heap_tuple); } else { break; /* end of chain */ } @@ -2592,14 +2597,14 @@ void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid) /* * If there's a valid t_ctid link, follow it, else we're done. */ - if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) || + if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || HeapTupleIsOnlyLocked(&tp) || ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) { UnlockReleaseBuffer(buffer); break; } ctid = tp.t_data->t_ctid; - priorXmax = HeapTupleGetRawXmax(&tp); + priorXmax = HeapTupleGetUpdateXid(&tp); UnlockReleaseBuffer(buffer); } /* end of loop */ } @@ -2611,15 +2616,22 @@ void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid) * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will * be set on exit. If the transaction committed, we set the XMAX_COMMITTED * hint bit if possible --- but beware that that may not yet be possible, - * if the transaction committed asynchronously. Hence callers should look - * only at XMAX_INVALID. + * if the transaction committed asynchronously. + * + * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID + * even if it commits. + * + * Hence callers should look only at XMAX_INVALID. + * + * Note this is not allowed for tuples whose xmax is a multixact. 
 */
static void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
{
 Assert(TransactionIdEquals(HeapTupleHeaderGetXmax(BufferGetPage(buffer), tuple), xid));
+ Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));

 if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID))) {
- if (TransactionIdDidCommit(xid)) {
+ if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2) && TransactionIdDidCommit(xid)) {
 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, xid);
 } else {
 if (!LatestFetchTransactionIdDidAbort(xid)) {
@@ -2938,6 +2950,51 @@ Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid, int options, Bu
 return HeapTupleGetOid(tup);
 }

+/*
+ * Given infomask/infomask2, compute the bits that must be saved in the
+ * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock WAL records.
+ *
+ * See FixInfomaskFromInfobits.
+ */
+static uint8 ComputeInfobits(uint16 infomask, uint16 infomask2)
+{
+ return ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
+ (HEAP_XMAX_IS_LOCKED_ONLY(infomask, infomask2) ? XLHL_XMAX_LOCK_ONLY : 0) |
+ (HEAP_XMAX_IS_EXCL_LOCKED(infomask) ? XLHL_XMAX_EXCL_LOCK : 0) |
+ /* note we ignore HEAP_XMAX_SHR_LOCK here */
+ (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ? XLHL_XMAX_KEYSHR_LOCK : 0) |
+ ((infomask2 & HEAP_KEYS_UPDATED) != 0 ? XLHL_KEYS_UPDATED : 0);
+}
+
+/*
+ * Given an "infobits" field from an XLog record, set the correct bits in the
+ * given infomask and infomask2 for the tuple touched by the record.
+ *
+ * (This is the reverse of ComputeInfobits.)
+ */
+void FixInfomaskFromInfobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
+{
+ *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
+ *infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED);
+
+ if (infobits & XLHL_XMAX_IS_MULTI) {
+ *infomask |= HEAP_XMAX_IS_MULTI;
+ }
+ if (infobits & XLHL_XMAX_LOCK_ONLY) {
+ *infomask2 |= HEAP_XMAX_LOCK_ONLY;
+ }
+ if (infobits & XLHL_XMAX_EXCL_LOCK) {
+ *infomask |= HEAP_XMAX_EXCL_LOCK;
+ }
+ /* note HEAP_XMAX_SHR_LOCK isn't considered here */
+ if (infobits & XLHL_XMAX_KEYSHR_LOCK) {
+ *infomask |= HEAP_XMAX_KEYSHR_LOCK;
+ }
+ if (infobits & XLHL_KEYS_UPDATED) {
+ *infomask2 |= HEAP_KEYS_UPDATED;
+ }
+}
+
 /*
 * heap_abort_speculative - kill a speculatively inserted tuple
 *
@@ -3023,8 +3080,8 @@ void heap_abort_speculative(Relation relation, HeapTuple tuple)
 PageSetPrunable(page, xid);

 /* store transaction information of xact deleting the tuple */
- tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI |
- HEAP_IS_LOCKED | HEAP_MOVED);
+ tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+ tp.t_data->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED);

 /*
 * Set the tuple header xmin and xmax to FrozenTransactionId.
@@ -3044,16 +3101,25 @@ void heap_abort_speculative(Relation relation, HeapTuple tuple)
 if (RelationNeedsWAL(relation)) {
 xl_heap_delete xlrec;
 XLogRecPtr recptr;
+ bool useOldXlog;

 xlrec.flags = XLH_DELETE_IS_SUPER;
 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
+ xlrec.xmax = FrozenTransactionId;
+ xlrec.infobits_set = ComputeInfobits(tp.t_data->t_infomask, tp.t_data->t_infomask2);
+ useOldXlog = t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM ||
+ !(xlrec.infobits_set & XLHL_XMAX_IS_MULTI);
+#ifdef ENABLE_MULTIPLE_NODES
+ useOldXlog = true;
+#endif

 XLogBeginInsert();
- XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
+ XLogRegisterData((char *)&xlrec, useOldXlog ? 
SizeOfOldHeapDelete : SizeOfHeapDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); /* No replica identity & replication origin logged */ - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + recptr = XLogInsert(RM_HEAP_ID, useOldXlog ? XLOG_HEAP_DELETE : + XLOG_HEAP_DELETE | XLOG_TUPLE_LOCK_UPGRADE_FLAG); PageSetLSN(page, recptr); } @@ -3244,10 +3310,12 @@ static int freeze_single_heap_page(Relation relation, Buffer buffer) TransactionId oldest_xmin = InvalidTransactionId; TransactionId freeze_xid = InvalidTransactionId; bool useLocalSnapshot_change = false; + MultiXactId freeze_mxid = InvalidMultiXactId; + bool changedMultiXid = false; gstrace_entry(GS_TRC_ID_freeze_single_heap_page); - vacuum_set_xid_limits(relation, 0, 0, &oldest_xmin, &freeze_xid, NULL); + vacuum_set_xid_limits(relation, 0, 0, &oldest_xmin, &freeze_xid, NULL, &freeze_mxid); /* since xid_base must be adjusted, heap_page_prune needs to be done, * so t_thrd.xact_cxt.useLocalSnapshot should be set to false */ @@ -3311,7 +3379,7 @@ static int freeze_single_heap_page(Relation relation, Buffer buffer) * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ - if (heap_freeze_tuple(&tuple, freeze_xid)) { + if (heap_freeze_tuple(&tuple, freeze_xid, freeze_mxid, &changedMultiXid)) { frozen[nfrozen++] = offnum; } } /* scan along page */ @@ -3327,7 +3395,9 @@ static int freeze_single_heap_page(Relation relation, Buffer buffer) MarkBufferDirty(buffer); /* Now WAL-log freezing if necessary */ if (RelationNeedsWAL(relation)) { - XLogRecPtr recptr = log_heap_freeze(relation, buffer, freeze_xid, frozen, nfrozen); + XLogRecPtr recptr = log_heap_freeze(relation, buffer, freeze_xid, + changedMultiXid ? freeze_mxid : InvalidMultiXactId, + frozen, nfrozen); PageSetLSN(page, recptr); } @@ -3375,6 +3445,9 @@ bool heap_page_prepare_for_xid(Relation relation, Buffer buffer, TransactionId x ereport(LOG, (errmsg("new page, the xid base is not correct, base is %lu, reset the xid_base to %lu", base, xid_base))); + ereport(LOG, (errmsg("relation is %s, prepare xid %lu, page min xid: %lu, page max xid: %lu", + RelationGetRelationName(relation), xid, base + FirstNormalTransactionId, + base + MaxShortTransactionId))); phdr->pd_xid_base = xid_base; return false; } @@ -4097,6 +4170,28 @@ Oid simple_heap_insert(Relation relation, HeapTuple tup) return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL); } +/* + * Given two versions of the same t_infomask for a tuple, compare them and + * return whether the relevant status for a tuple Xmax has changed. This is + * used after a buffer lock has been released and reacquired: we want to ensure + * that the tuple state continues to be the same it was when we previously + * examined it. + * + * Note the Xmax field itself must be compared separately. 
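+ *
+ * A minimal usage sketch, mirroring the callers below:
+ *
+ *     infomask = tup->t_data->t_infomask;      (saved before unlocking)
+ *     infomask2 = tup->t_data->t_infomask2;
+ *     ... release and reacquire the buffer lock ...
+ *     if (xmax_infomask_changed(tup->t_data->t_infomask, tup->t_data->t_infomask2,
+ *                               infomask, infomask2) ||
+ *         !TransactionIdEquals(HeapTupleGetRawXmax(tup), xwait))
+ *         goto l1;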
+ */ +static inline bool xmax_infomask_changed(uint16 new_infomask, uint16 new_infomask2, uint16 old_infomask, + uint16 old_infomask2) +{ + const uint16 interesting = HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK; + const uint16 interesting2 = HEAP_XMAX_LOCK_ONLY; + + if (((new_infomask & interesting) != (old_infomask & interesting)) || + ((new_infomask2 & interesting2) != (old_infomask2 & interesting2))) + return true; + + return false; +} + /* * heap_delete - delete a tuple * @@ -4133,6 +4228,9 @@ TM_Result heap_delete(Relation relation, ItemPointer tid, CommandId cid, BlockNumber block; Buffer buffer; Buffer vmbuffer = InvalidBuffer; + TransactionId new_xmax; + uint16 new_infomask; + uint16 new_infomask2; bool have_tuple_lock = false; bool is_combo = false; bool all_visible_cleared = false; @@ -4222,13 +4320,13 @@ l1: } else if (result == TM_BeingModified && wait) { TransactionId xwait; uint16 infomask; + uint16 infomask2; /* must copy state data before unlocking buffer */ HeapTupleCopyBaseFromPage(&tp, BufferGetPage(buffer)); xwait = HeapTupleGetRawXmax(&tp); infomask = tp.t_data->t_infomask; - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + infomask2 = tp.t_data->t_infomask2; if (!u_sess->attr.attr_common.allow_concurrent_tuple_update) { ereport(ERROR, @@ -4236,37 +4334,42 @@ l1: } /* - * Acquire tuple lock to establish our priority for the tuple (see - * heap_lock_tuple). LockTuple will release us when we are - * next-in-line for the tuple. + * Sleep until concurrent transaction ends -- except when there's a single + * locker and it's our own transaction. Note we don't care + * which lock mode the locker has, because we need the strongest one. + * + * Before sleeping, we need to acquire tuple lock to establish our + * priority for the tuple (see heap_lock_tuple). LockTuple will + * release us when we are next-in-line for the tuple. * * If we are forced to "start over" below, we keep the tuple lock; * this arranges that we stay at the head of the line while rechecking * tuple state. */ - if (!have_tuple_lock) { - LockTuple(relation, &(tp.t_self), ExclusiveLock, true); - have_tuple_lock = true; - } - - /* - * Sleep until concurrent transaction ends. Note that we don't care - * if the locker has an exclusive or shared lock, because we need - * exclusive. - */ if (infomask & HEAP_XMAX_IS_MULTI) { /* wait for multixact */ - MultiXactIdWait((MultiXactId)xwait, true); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + if (DoesMultiXactIdConflict((MultiXactId)xwait, LockTupleExclusive)) { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - /* - * If xwait had just locked the tuple then some other xact could - * update this tuple before we get to this point. Check for xmax - * change, and start over if so. - */ - if (!(tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleGetRawXmax(&tp), xwait)) { - goto l1; + /* acquire tuple lock, if necessary */ + if (!have_tuple_lock) { + LOCK_TUPLE_TUP_LOCK(relation, &(tp.t_self), LockTupleExclusive); + have_tuple_lock = true; + } + + /* wait for multixact */ + MultiXactIdWait((MultiXactId)xwait, MultiXactStatusUpdate, NULL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * If xwait had just locked the tuple then some other xact could + * update this tuple before we get to this point. Check for xmax + * change, and start over if so. 
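+ *
+ * Both checks are needed: a new locker joining a MultiXactId installs a
+ * brand-new multi (the raw xmax changes while the infomask bits do not),
+ * whereas a locker turning into the updater changes the infomask bits
+ * instead.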
+ */
+ if (xmax_infomask_changed(tp.t_data->t_infomask, tp.t_data->t_infomask2, infomask, infomask2) ||
+ !TransactionIdEquals(HeapTupleGetRawXmax(&tp), xwait)) {
+ goto l1;
+ }
 }

 /*
@@ -4278,8 +4381,16 @@ l1:
 * exclusive). We don't bother changing the on-disk hint bits
 * since we are about to overwrite the xmax altogether.
 */
- } else {
- /* wait for regular transaction to end */
+ } else if (!TransactionIdIsCurrentTransactionId(xwait)) {
+ /*
+ * Wait for regular transaction to end; but first, acquire
+ * tuple lock.
+ */
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ if (!have_tuple_lock) {
+ LOCK_TUPLE_TUP_LOCK(relation, &(tp.t_self), LockTupleExclusive);
+ have_tuple_lock = true;
+ }
 XactLockTableWait(xwait, true);
 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

@@ -4288,7 +4399,8 @@ l1:
 * other xact could update this tuple before we get to this point.
 * Check for xmax change, and start over if so.
 */
- if ((tp.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || !TransactionIdEquals(HeapTupleGetRawXmax(&tp), xwait)) {
+ if (xmax_infomask_changed(tp.t_data->t_infomask, tp.t_data->t_infomask2, infomask, infomask2) ||
+ !TransactionIdEquals(HeapTupleGetRawXmax(&tp), xwait)) {
 goto l1;
 }

@@ -4300,7 +4412,9 @@ l1:
 * We may overwrite if previous xmax aborted, or if it committed but
 * only locked the tuple without updating it.
 */
- if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) {
+ if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
+ HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask, tp.t_data->t_infomask2) ||
+ HeapTupleIsOnlyLocked(&tp)) {
 result = TM_Ok;
 } else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) {
 result = TM_Updated;
@@ -4323,7 +4437,7 @@ l1:
 Assert(result != TM_Updated || !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
 tmfd->ctid = tp.t_data->t_ctid;
- tmfd->xmax = HeapTupleGetRawXmax(&tp);
+ tmfd->xmax = HeapTupleGetUpdateXid(&tp);
 if (result == TM_SelfModified) {
 tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data, page);
 } else {
@@ -4331,7 +4445,7 @@ l1:
 }
 UnlockReleaseBuffer(buffer);
 if (have_tuple_lock) {
- UnlockTuple(relation, &(tp.t_self), ExclusiveLock);
+ UNLOCK_TUPLE_TUP_LOCK(relation, &(tp.t_self), LockTupleExclusive);
 }
 if (vmbuffer != InvalidBuffer) {
 ReleaseBuffer(vmbuffer);
@@ -4357,6 +4471,32 @@ l1:
 */
 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);

+ /*
+ * If this is the first possibly-multixact-able operation in the
+ * current transaction, set my per-backend OldestMemberMXactId setting.
+ * We can be certain that the transaction will never become a member of
+ * any older MultiXactIds than that. (We have to do this even if we
+ * end up just using our own TransactionId below, since some other
+ * backend could incorporate our XID into a MultiXact immediately
+ * afterwards.)
+ */
+ MultiXactIdSetOldestMember();
+
+ ComputeNewXmaxInfomask(HeapTupleGetRawXmax(&tp), tp.t_data->t_infomask, tp.t_data->t_infomask2,
+ xid, LockTupleExclusive, true, &new_xmax, &new_infomask, &new_infomask2);
+
+ if (t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM) {
+ if (!TransactionIdEquals(new_xmax, xid) || new_infomask != 0) {
+ ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("New MultiXact feature isn't supported in this version. 
Please upgrade to version: %d", + ENHANCED_TUPLE_LOCK_VERSION_NUM))); + } + } + + if (new_infomask & HEAP_XMAX_IS_MULTI) { + (void)heap_page_prepare_for_xid(relation, buffer, new_xmax, true); + } + START_CRIT_SECTION(); /* @@ -4375,10 +4515,12 @@ l1: } /* store transaction information of xact deleting the tuple */ - tp.t_data->t_infomask &= - ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + tp.t_data->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED); + tp.t_data->t_infomask |= new_infomask; + tp.t_data->t_infomask2 |= new_infomask2; HeapTupleHeaderClearHotUpdated(tp.t_data); - HeapTupleHeaderSetXmax(page, tp.t_data, xid); + HeapTupleHeaderSetXmax(page, tp.t_data, new_xmax); HeapTupleHeaderSetCmax(tp.t_data, cid, is_combo); /* Make sure there is no forward chain link in t_ctid */ @@ -4391,6 +4533,7 @@ l1: xl_heap_delete xlrec; XLogRecPtr recptr; xl_heap_header xlhdr; + bool useOldXlog; /* For logical decode we need combocids to properly decode the catalog */ if (RelationIsAccessibleInLogicalDecoding(relation)) { @@ -4399,6 +4542,13 @@ l1: xlrec.flags = all_visible_cleared ? XLH_DELETE_ALL_VISIBLE_CLEARED : 0; xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); + xlrec.xmax = new_xmax; + xlrec.infobits_set = ComputeInfobits(tp.t_data->t_infomask, tp.t_data->t_infomask2); + useOldXlog = t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM || + !(xlrec.infobits_set & XLHL_XMAX_IS_MULTI); +#ifdef ENABLE_MULTIPLE_NODES + useOldXlog = true; +#endif if (old_key_tuple != NULL) { bool is_null = false; @@ -4428,7 +4578,7 @@ l1: } XLogBeginInsert(); - XLogRegisterData((char*)&xlrec, SizeOfHeapDelete); + XLogRegisterData((char *)&xlrec, useOldXlog ? SizeOfOldHeapDelete : SizeOfHeapDelete); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); /* @@ -4446,8 +4596,8 @@ l1: /* filtering by origin on a row level is much more efficient */ XLogIncludeOrigin(); - - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); + recptr = XLogInsert(RM_HEAP_ID, useOldXlog ? XLOG_HEAP_DELETE : + XLOG_HEAP_DELETE | XLOG_TUPLE_LOCK_UPGRADE_FLAG); PageSetLSN(page, recptr); } @@ -4487,7 +4637,7 @@ l1: * Release the lmgr tuple lock, if we had it. */ if (have_tuple_lock) { - UnlockTuple(relation, &(tp.t_self), ExclusiveLock); + UNLOCK_TUPLE_TUP_LOCK(relation, &(tp.t_self), LockTupleExclusive); } pgstat_count_heap_delete(relation); @@ -4561,6 +4711,7 @@ void simple_heap_delete(Relation relation, ItemPointer tid, int options, bool al * cmax/cmin if successful) * crosscheck - if not InvalidSnapshot, also check old tuple against this * wait - true if should wait for any conflicting update to commit/abort + * lockmode - output parameter, filled with lock mode acquired on tuple * * Normal, successful return value is HeapTupleMayBeUpdated, which * actually means we *did* update it. Failure return codes are @@ -4579,11 +4730,13 @@ void simple_heap_delete(Relation relation, ItemPointer tid, int options, bool al * (t_xmax is needed to verify that the replacement tuple matches.) 
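 *
+ * The mode reported through *lockmode is LockTupleExclusive when some "key"
+ * column changes and LockTupleNoKeyExclusive otherwise; callers that need to
+ * re-lock the row after a concurrent update are expected to reuse this same
+ * strength (a sketch of intended usage; see simple_heap_update for a minimal
+ * caller that merely supplies the output argument).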
 */
TM_Result heap_update(Relation relation, Relation parentRelation, ItemPointer otid, HeapTuple newtup,
- CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool allow_update_self)
+ CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode,
+ bool allow_update_self)
 {
 TM_Result result;
 TransactionId xid = GetCurrentTransactionId();
 Bitmapset* hot_attrs = NULL;
+ Bitmapset *key_attrs = NULL;
 Bitmapset* id_attrs = NULL;
 ItemId lp;
 HeapTupleData oldtup;
@@ -4592,6 +4745,7 @@ TM_Result heap_update(Relation relation, Relation parentRelation, ItemPointer ot
 bool old_key_copied = false;
 Page page, newpage;
 BlockNumber block;
+ MultiXactStatus mxact_status;
 Buffer buffer = InvalidBuffer;
 Buffer newbuf = InvalidBuffer;
 Buffer vmbuffer = InvalidBuffer;
@@ -4602,14 +4756,25 @@ TM_Result heap_update(Relation relation, Relation parentRelation, ItemPointer ot
 bool have_tuple_lock = false;
 bool is_combo = false;
 bool satisfies_hot = false;
+ bool satisfies_key = false;
 bool satisfies_id = false;
 bool use_hot_update = false;
+ bool key_intact;
 bool all_visible_cleared = false;
 bool all_visible_cleared_new = false;
+ bool checked_lockers;
+ bool locker_remains;
+ TransactionId xmax_new_tuple;
+ TransactionId xmax_old_tuple;
+ uint16 infomask_old_tuple;
+ uint16 infomask2_old_tuple;
+ uint16 infomask_new_tuple;
+ uint16 infomask2_new_tuple;
 int options = 0;
 bool rel_in_redis = RelationInClusterResizing(relation);
 OffsetNumber maxoff;
 BlockNumber rel_end_block = InvalidBlockNumber;
+ LockTupleMode mode;

 Assert(ItemPointerIsValid(otid));
 /* Don't allow any write/lock operator in stream. */
@@ -4643,9 +4808,11 @@ TM_Result heap_update(Relation relation, Relation parentRelation, ItemPointer ot
 */
 Assert(RELATION_IS_PARTITIONED(parentRelation) || RELATION_OWN_BUCKET(parentRelation));
 hot_attrs = RelationGetIndexAttrBitmap(parentRelation, INDEX_ATTR_BITMAP_ALL);
+ key_attrs = RelationGetIndexAttrBitmap(parentRelation, INDEX_ATTR_BITMAP_KEY);
 id_attrs = RelationGetIndexAttrBitmap(parentRelation, INDEX_ATTR_BITMAP_IDENTITY_KEY);
 } else {
 hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
+ key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
 id_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_IDENTITY_KEY);
 }

@@ -4686,12 +4853,57 @@ TM_Result heap_update(Relation relation, Relation parentRelation, ItemPointer ot
 oldtup.t_self = *otid;
 oldtup.t_tableOid = RelationGetRelid(relation);
 oldtup.t_bucketId = RelationGetBktid(relation);
- HeapSatisfiesHOTUpdate(relation, hot_attrs, id_attrs, &satisfies_hot, &satisfies_id, &oldtup, newtup, page);
+ HeapSatisfiesHOTUpdate(relation, hot_attrs, key_attrs, id_attrs, &satisfies_hot,
+ &satisfies_key, &satisfies_id, &oldtup, newtup, page);
 tmfd->xmin = HeapTupleHeaderGetXmin(page, oldtup.t_data);

+#ifndef ENABLE_MULTIPLE_NODES
+ /*
+ * If we're not updating any "key" column, we can grab a weaker lock type.
+ * This allows for more concurrency when we are running simultaneously with
+ * foreign key checks.
+ *
+ * Note that if a column gets detoasted while executing the update, but the
+ * value ends up being the same, this test will fail and we will use the
+ * stronger lock. This is acceptable; the important case to optimize is
+ * updates that don't manipulate key columns, not those that
+ * serendipitously arrive at the same key values. 
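+ *
+ * As a concrete example (schema hypothetical): an UPDATE that changes only
+ * a non-key column takes LockTupleNoKeyExclusive, which does not conflict
+ * with the FOR KEY SHARE locks that concurrent foreign-key checks take on
+ * the referenced row, so those checks no longer block behind the update.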
+ */ + if (satisfies_key && t_thrd.proc->workingVersionNum >= ENHANCED_TUPLE_LOCK_VERSION_NUM) { + mode = LockTupleNoKeyExclusive; + mxact_status = MultiXactStatusNoKeyUpdate; + key_intact = true; + + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId setting. + * We can be certain that the transaction will never become a member of + * any older MultiXactIds than that. (We have to do this even if we + * end up just using our own TransactionId below, since some other + * backend could incorporate our XID into a MultiXact immediately + * afterwards.) + */ + MultiXactIdSetOldestMember(); + } else +#endif + { + mode = LockTupleExclusive; + mxact_status = MultiXactStatusUpdate; + key_intact = false; + } + if (lockmode != NULL) { + *lockmode = mode; + } + l2: + checked_lockers = false; + locker_remains = false; HeapTupleCopyBaseFromPage(&oldtup, BufferGetPage(buffer)); result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer, allow_update_self); + + /* see below about the "no wait" case */ + Assert(result != TM_BeingModified || wait); + if (result == TM_Invisible) { UnlockReleaseBuffer(buffer); ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("attempted to update invisible tuple"))); @@ -4703,13 +4915,26 @@ l2: } else if (result == TM_BeingModified && wait) { TransactionId xwait; uint16 infomask; + uint16 infomask2; + bool can_continue = false; + + /* + * XXX note that we don't consider the "no wait" case here. This + * isn't a problem currently because no caller uses that case, but it + * should be fixed if such a caller is introduced. It wasn't a problem + * previously because this code would always wait, but now that some + * tuple locks do not conflict with one of the lock modes we use, it is + * possible that this case is interesting to handle specially. + * + * This may cause failures with third-party code that calls heap_update + * directly. + */ /* must copy state data before unlocking buffer */ HeapTupleCopyBaseFromPage(&oldtup, BufferGetPage(buffer)); xwait = HeapTupleGetRawXmax(&oldtup); infomask = oldtup.t_data->t_infomask; - - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + infomask2 = oldtup.t_data->t_infomask2; if (!u_sess->attr.attr_common.allow_concurrent_tuple_update) { ereport(ERROR, @@ -4717,50 +4942,117 @@ l2: } /* - * Acquire tuple lock to establish our priority for the tuple (see - * heap_lock_tuple). LockTuple will release us when we are - * next-in-line for the tuple. + * Now we have to do something about the existing locker. If it's a + * multi, sleep on it; we might be awakened before it is completely + * gone (or even not sleep at all in some cases); we need to preserve + * it as locker, unless it is gone completely. + * + * If it's not a multi, we need to check for sleeping conditions before + * actually going to sleep. If the update doesn't conflict with the + * locks, we just continue without sleeping (but making sure it is + * preserved). + * + * Before sleeping, we need to acquire tuple lock to establish our + * priority for the tuple (see heap_lock_tuple). LockTuple will + * release us when we are next-in-line for the tuple. Note we must not + * acquire the tuple lock until we're sure we're going to sleep; + * otherwise we're open for race conditions with other transactions + * holding the tuple lock which sleep on us. * * If we are forced to "start over" below, we keep the tuple lock; * this arranges that we stay at the head of the line while rechecking * tuple state. 
 */
- if (!have_tuple_lock) {
- LockTuple(relation, &(oldtup.t_self), ExclusiveLock, true);
- have_tuple_lock = true;
- }
-
- /*
- * Sleep until concurrent transaction ends. Note that we don't care
- * if the locker has an exclusive or shared lock, because we need
- * exclusive.
- */
 if (infomask & HEAP_XMAX_IS_MULTI) {
- /* wait for multixact */
- MultiXactIdWait((MultiXactId)xwait, true);
- LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ TransactionId update_xact;
+ int remain;

- /*
- * If xwait had just locked the tuple then some other xact could
- * update this tuple before we get to this point. Check for xmax
- * change, and start over if so.
- */
- if (!(oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
- !TransactionIdEquals(HeapTupleGetRawXmax(&oldtup), xwait)) {
- goto l2;
+ /* sleep only if the multixact actually conflicts with the lock we want */
+ if (DoesMultiXactIdConflict((MultiXactId)xwait, mode)) {
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+ /* acquire tuple lock, if necessary */
+ if (!have_tuple_lock) {
+ LOCK_TUPLE_TUP_LOCK(relation, &(oldtup.t_self), mode);
+ have_tuple_lock = true;
+ }
+
+ /* wait for multixact */
+ MultiXactIdWait((MultiXactId)xwait, mxact_status, &remain);
+ checked_lockers = true;
+ locker_remains = remain != 0;
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /*
+ * If xwait had just locked the tuple then some other xact could
+ * update this tuple before we get to this point. Check for xmax
+ * change, and start over if so.
+ */
+ if (xmax_infomask_changed(oldtup.t_data->t_infomask, oldtup.t_data->t_infomask2, infomask, infomask2) ||
+ !TransactionIdEquals(HeapTupleGetRawXmax(&oldtup), xwait)) {
+ goto l2;
+ }
 }

 /*
- * You might think the multixact is necessarily done here, but not
- * so: it could have surviving members, namely our own xact or
- * other subxacts of this backend. It is legal for us to update
- * the tuple in either case, however (the latter case is
- * essentially a situation of upgrading our former shared lock to
- * exclusive). We don't bother changing the on-disk hint bits
- * since we are about to overwrite the xmax altogether.
+ * Note that the multixact may not be done by now. It could have
+ * surviving members; our own xact or other subxacts of this
+ * backend, and also any other concurrent transaction that locked
+ * the tuple with LockTupleKeyShare if we only got LockTupleNoKeyExclusive.
+ * If this is the case, we have to be careful to mark the updated tuple
+ * with the surviving members in Xmax.
+ *
+ * Note that there could have been another update in the MultiXact.
+ * In that case, we need to check whether it committed or aborted.
+ * If it aborted we are safe to update it again; otherwise there is
+ * an update conflict, and we have to return TM_Updated
+ * below.
+ *
+ * In the LockTupleExclusive case, we still need to preserve the
+ * surviving members: those would include the tuple locks we had
+ * before this one, which are important to keep in case this
+ * subxact aborts.
 */
+ if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask, oldtup.t_data->t_infomask2)) {
+ update_xact = HeapTupleMultiXactGetUpdateXid(&oldtup);
+ } else
+ update_xact = InvalidTransactionId;
+
+ /*
+ * There was no UPDATE in the MultiXact; or it aborted. No
+ * TransactionIdIsInProgress() call needed here, since we called
+ * MultiXactIdWait() above. 
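+ *
+ * For instance (xids hypothetical), with xmax = a multi whose members
+ * are {100 key-share, 150 no-key update}: if xid 150 aborted, update_xact
+ * comes back as 150 but TransactionIdDidAbort() holds and we may proceed;
+ * if it committed, we must report the update conflict instead.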
+ */ + if (!TransactionIdIsValid(update_xact) || TransactionIdDidAbort(update_xact)) { + can_continue = true; + } + } else if (TransactionIdIsCurrentTransactionId(xwait)) { + /* + * The only locker is ourselves; we can avoid grabbing the tuple + * lock here, but must preserve our locking information. + */ + checked_lockers = true; + locker_remains = true; + can_continue = true; + } else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact) { + /* + * If it's just a key-share locker, and we're not changing the + * key columns, we don't need to wait for it to end; but we + * need to preserve it as locker. + */ + checked_lockers = true; + locker_remains = true; + can_continue = true; } else { - /* wait for regular transaction to end */ + /* + * Wait for regular transaction to end; but first, acquire + * tuple lock. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + if (!have_tuple_lock) { + LOCK_TUPLE_TUP_LOCK(relation, &(oldtup.t_self), mode); + have_tuple_lock = true; + } XactLockTableWait(xwait, true); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); @@ -4769,20 +5061,22 @@ l2: * other xact could update this tuple before we get to this point. * Check for xmax change, and start over if so. */ - if ((oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + if (xmax_infomask_changed(oldtup.t_data->t_infomask, oldtup.t_data->t_infomask2, infomask, infomask2) || !TransactionIdEquals(HeapTupleGetRawXmax(&oldtup), xwait)) { goto l2; } /* Otherwise check if it committed or aborted */ UpdateXmaxHintBits(oldtup.t_data, buffer, xwait); + if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) + can_continue = true; } /* * We may overwrite if previous xmax aborted, or if it committed but * only locked the tuple without updating it. */ - if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) { + if (can_continue) { result = TM_Ok; ereport(DEBUG1, (errmsg("heap maybe updated ctid (%u,%d) cur_xid " @@ -4815,7 +5109,7 @@ l2: Assert(result != TM_Updated || !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)); tmfd->ctid = oldtup.t_data->t_ctid; - tmfd->xmax = HeapTupleGetRawXmax(&oldtup); + tmfd->xmax = HeapTupleGetUpdateXid(&oldtup); if (result == TM_SelfModified) { tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data, page); } else { @@ -4823,12 +5117,13 @@ l2: } UnlockReleaseBuffer(buffer); if (have_tuple_lock) { - UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + UNLOCK_TUPLE_TUP_LOCK(relation, &(oldtup.t_self), mode); } if (vmbuffer != InvalidBuffer) { ReleaseBuffer(vmbuffer); } bms_free(hot_attrs); + bms_free(key_attrs); bms_free(id_attrs); return result; } @@ -4868,9 +5163,48 @@ l2: Assert(!(newtup->t_data->t_infomask & HEAP_HASOID)); } + /* + * If the tuple we're updating is locked, we need to preserve the locking + * info in the old tuple's Xmax. Prepare a new Xmax value for this. 
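+ *
+ * E.g. if xid 100 still holds a key-share lock and we (xid 200) are the
+ * updater, ComputeNewXmaxInfomask() folds both into a fresh MultiXactId
+ * with members {100 key-share, 200 update} and returns infomask bits with
+ * HEAP_XMAX_IS_MULTI set (xids hypothetical).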
+ */
+ ComputeNewXmaxInfomask(HeapTupleGetRawXmax(&oldtup), oldtup.t_data->t_infomask, oldtup.t_data->t_infomask2,
+ xid, mode, true, &xmax_old_tuple, &infomask_old_tuple, &infomask2_old_tuple);
+
+ /* And also prepare an Xmax value for the new copy of the tuple */
+ if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) || (checked_lockers && !locker_remains)) {
+ xmax_new_tuple = InvalidTransactionId;
+ } else {
+ xmax_new_tuple = HeapTupleGetRawXmax(&oldtup);
+ }
+
+ if (!TransactionIdIsValid(xmax_new_tuple)) {
+ infomask_new_tuple = HEAP_XMAX_INVALID;
+ infomask2_new_tuple = 0;
+ } else {
+ if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) {
+ GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple, &infomask2_new_tuple);
+ } else {
+ infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK;
+ infomask2_new_tuple = HEAP_XMAX_LOCK_ONLY;
+ }
+ }
+
+ if (t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM) {
+ /* if the only locker of the old tuple is ourselves, xmax_new_tuple may be our xid and thus valid */
+ if (TransactionIdIsValid(xmax_new_tuple) || !TransactionIdEquals(xmax_old_tuple, xid)) {
+ ereport(WARNING, (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("New MultiXact feature isn't supported in this version. Please upgrade to version: %d",
+ ENHANCED_TUPLE_LOCK_VERSION_NUM)));
+ }
+ xmax_new_tuple = 0;
+ infomask2_new_tuple &= ~(HEAP_KEYS_UPDATED | HEAP_XMAX_LOCK_ONLY);
+ }
+
+ /*
+ * Prepare the new tuple with the appropriate initial values of Xmin and
+ * Xmax, as well as initial infomask bits as computed above.
+ */
 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
- newtup->t_data->t_infomask |= (HEAP_XMAX_INVALID | HEAP_UPDATED);

 /* Unset the HEAP_HAS_REDIS_COLUMNS bit in the new tuple to make sure hidden
 * columns added by redis (if any) are removed from the tuple.
@@ -4879,11 +5213,17 @@ l2:
 HeapTupleHeaderUnsetRedisColumns(newtup->t_data);

 heap_page_prepare_for_xid(relation, buffer, xid, false);
+ if (TransactionIdIsNormal(xmax_old_tuple)) {
+ heap_page_prepare_for_xid(relation, buffer, xmax_old_tuple,
+ (infomask_old_tuple & HEAP_XMAX_IS_MULTI) ? true : false);
+ }
 HeapTupleCopyBaseFromPage(newtup, page);
 HeapTupleSetXmin(newtup, xid);
 HeapTupleHeaderSetCmin(newtup->t_data, cid);
- HeapTupleHeaderSetXmax(page, newtup->t_data, 0); /* for cleanliness */
+ newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
+ newtup->t_data->t_infomask2 |= infomask2_new_tuple;
+ HeapTupleHeaderSetXmax(page, newtup->t_data, xmax_new_tuple); /* for cleanliness */
 newtup->t_tableOid = RelationGetRelid(relation);
 newtup->t_bucketId = RelationGetBktid(relation);
#ifdef PGXC
@@ -4926,11 +5266,14 @@ l2:
 new_tup_size = MAXALIGN(newtup->t_len);
 if (need_toast || new_tup_size > pagefree || rel_in_redis) {
 /* Clear obsolete visibility flags ... */
- oldtup.t_data->t_infomask &=
- ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED);
+ oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+ oldtup.t_data->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED);
+ Assert(TransactionIdIsValid(xmax_old_tuple));
+ oldtup.t_data->t_infomask |= infomask_old_tuple;
+ oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
 HeapTupleClearHotUpdated(&oldtup);
 /* ... 
and store info about transaction updating this tuple */ - HeapTupleHeaderSetXmax(page, oldtup.t_data, xid); + HeapTupleHeaderSetXmax(page, oldtup.t_data, xmax_old_tuple); HeapTupleHeaderSetCmax(oldtup.t_data, cid, is_combo); /* temporarily make it look not-updated */ oldtup.t_data->t_ctid = oldtup.t_self; @@ -5043,8 +5386,13 @@ l2: if (newbuf != buffer) { /* Prepare new page for xids */ (void)heap_page_prepare_for_xid(relation, newbuf, xid, false); - HeapTupleCopyBaseFromPage(heaptup, newpage); } + if (TransactionIdIsNormal(xmax_new_tuple)) { + (void)heap_page_prepare_for_xid(relation, newbuf, xmax_new_tuple, + (infomask_new_tuple & HEAP_XMAX_IS_MULTI) ? true : false); + } + HeapTupleCopyBaseFromPage(heaptup, newpage); + HeapTupleHeaderSetXmax(newpage, heaptup->t_data, xmax_new_tuple); /* NO EREPORT(ERROR) from here till changes are logged */ START_CRIT_SECTION(); @@ -5083,10 +5431,13 @@ l2: if (!already_marked) { /* Clear obsolete visibility flags ... */ - oldtup.t_data->t_infomask &= - ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); + oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); + oldtup.t_data->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED); + Assert(TransactionIdIsValid(xmax_old_tuple)); + oldtup.t_data->t_infomask |= infomask_old_tuple; + oldtup.t_data->t_infomask2 |= infomask2_old_tuple; /* ... and store info about transaction updating this tuple */ - HeapTupleHeaderSetXmax(page, oldtup.t_data, xid); + HeapTupleHeaderSetXmax(page, oldtup.t_data, xmax_old_tuple); HeapTupleHeaderSetCmax(oldtup.t_data, cid, is_combo); } @@ -5123,7 +5474,7 @@ l2: } recptr = log_heap_update(relation, buffer, - &(oldtup.t_self), + &oldtup, newbuf, heaptup, old_key_tuple, @@ -5169,7 +5520,7 @@ l2: * Release the lmgr tuple lock, if we had it. */ if (have_tuple_lock) { - UnlockTuple(relation, &(oldtup.t_self), ExclusiveLock); + UNLOCK_TUPLE_TUP_LOCK(relation, &(oldtup.t_self), mode); } pgstat_count_heap_update(relation, use_hot_update); @@ -5188,6 +5539,7 @@ l2: } bms_free(hot_attrs); + bms_free(key_attrs); bms_free(id_attrs); return TM_Ok; @@ -5252,7 +5604,7 @@ static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup) * inserted. */ if ((hdr->t_infomask & HEAP_XMAX_INVALID) || - (hdr->t_infomask & HEAP_IS_LOCKED)) { + HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask, hdr->t_infomask2)) { xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr); xlrec.cmax = InvalidCommandId; } else { @@ -5444,12 +5796,14 @@ static bool heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum, HeapTuple tup * * Returns true if safe to do HOT update. 
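+ * (The three verdicts are reported through the satisfies_* output flags; the
+ * attribute sets are walked below in one merged pass in increasing
+ * attribute-number order, so each changed column is fetched and compared at
+ * most once.)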
*/ -static void HeapSatisfiesHOTUpdate(Relation relation, Bitmapset* hot_attrs, Bitmapset* id_attrs, bool* satisfies_hot, - bool* satisfies_id, HeapTuple oldtup, HeapTuple newtup, char* page) +static void HeapSatisfiesHOTUpdate(Relation relation, Bitmapset* hot_attrs, Bitmapset *key_attrs, Bitmapset* id_attrs, + bool* satisfies_hot, bool *satisfies_key, bool* satisfies_id, HeapTuple oldtup, HeapTuple newtup, char* page) { int next_hot_attnum; + int next_key_attum; int next_id_attnum; bool hot_result = true; + bool key_result = true; bool id_result = true; /* @@ -5460,6 +5814,8 @@ static void HeapSatisfiesHOTUpdate(Relation relation, Bitmapset* hot_attrs, Bitm */ next_hot_attnum = bms_first_member(hot_attrs); next_hot_attnum += FirstLowInvalidHeapAttributeNumber; + next_key_attum = bms_first_member(key_attrs); + next_key_attum += FirstLowInvalidHeapAttributeNumber; next_id_attnum = bms_first_member(id_attrs); next_id_attnum += FirstLowInvalidHeapAttributeNumber; @@ -5474,6 +5830,8 @@ static void HeapSatisfiesHOTUpdate(Relation relation, Bitmapset* hot_attrs, Bitm */ if (hot_result && next_hot_attnum > FirstLowInvalidHeapAttributeNumber) { check_now = next_hot_attnum; + } else if (key_result && next_key_attum > FirstLowInvalidHeapAttributeNumber) { + check_now = next_key_attum; } else if (id_result && next_id_attnum > FirstLowInvalidHeapAttributeNumber) { check_now = next_id_attnum; } else { @@ -5486,12 +5844,15 @@ static void HeapSatisfiesHOTUpdate(Relation relation, Bitmapset* hot_attrs, Bitm if (check_now == next_hot_attnum) { hot_result = false; } + if (check_now == next_key_attum) { + key_result = false; + } if (check_now == next_id_attnum) { id_result = false; } /* if all are false now, we can stop checking */ - if (!hot_result && !id_result) { + if (!hot_result && !key_result && !id_result) { break; } } @@ -5508,12 +5869,17 @@ static void HeapSatisfiesHOTUpdate(Relation relation, Bitmapset* hot_attrs, Bitm next_hot_attnum = bms_first_member(hot_attrs); next_hot_attnum += FirstLowInvalidHeapAttributeNumber; } + if (key_result && check_now == next_key_attum) { + next_key_attum = bms_first_member(key_attrs); + next_key_attum += FirstLowInvalidHeapAttributeNumber; + } if (id_result && check_now == next_id_attnum) { next_id_attnum = bms_first_member(id_attrs); next_id_attnum += FirstLowInvalidHeapAttributeNumber; } } *satisfies_hot = hot_result; + *satisfies_key = key_result; *satisfies_id = id_result; } @@ -5529,6 +5895,7 @@ void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) { TM_Result result; TM_FailureData tmfd; + LockTupleMode lockmode; /* All built-in functions are hard coded, and thus they should not be updated */ if (u_sess->attr.attr_common.IsInplaceUpgrade == false && IsProcRelation(relation) && @@ -5545,6 +5912,7 @@ void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) InvalidSnapshot, true /* wait for commit */, &tmfd, + &lockmode, false); switch (result) { case TM_SelfModified: @@ -5571,6 +5939,72 @@ void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) } } +/* + * Return the MultiXactStatus corresponding to the given tuple lock mode. + */ +MultiXactStatus GetMXactStatusForLock(LockTupleMode mode, bool isUpdate) +{ + MultiXactStatus retval; + + if (isUpdate) { + retval = TupleLockExtraInfo[mode].updstatus; + } else { + retval = TupleLockExtraInfo[mode].lockstatus; + } + + if (retval == (MultiXactStatus)-1) { + ereport(ERROR, (errmsg("invalid lock tuple mode %d/%s", mode, isUpdate ? 
"true" : "false"))); + } + + return retval; +} + +/* + * Check the tuple lock compatibility before upgrade commit. + */ +static LockTupleMode CheckTupleLockCompatilibilty(LockTupleMode mode) +{ + switch (mode) { + case LockTupleKeyShare: + ereport(WARNING, + (errmsg("For Key Share is not support in this version and changed to For Share. " + "You can Upgrade to vesrion %d to use it.", ENHANCED_TUPLE_LOCK_VERSION_NUM))); + return LockTupleShared; + case LockTupleNoKeyExclusive: + ereport(WARNING, + (errmsg("For No Key Update is not support in this version and changed to For Update. " + "You can Upgrade to vesrion %d to use it.", ENHANCED_TUPLE_LOCK_VERSION_NUM))); + return LockTupleExclusive; + default: + return mode; + } +} + +/* + * Check the infomask compatibility before upgrade commit. + */ +static void CheckInfomaskCompatilibilty(TransactionId xid, uint16 infomask) +{ + if (infomask & HEAP_XMAX_IS_MULTI) { + /* In earlier version, MultiXact only have shared lock */ + MultiXactMember *members = NULL; + int nmembers = GetMultiXactIdMembers((MultiXactId)xid, &members); + for (int i = 0; i < nmembers; ++i) { + if (members[i].status != MultiXactStatusForShare) { + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("New MultiXact feature isn't support in this version. Please upgrade to version: %d", + ENHANCED_TUPLE_LOCK_VERSION_NUM))); + } + } + pfree_ext(members); + } else if (!HEAP_XMAX_IS_SHR_LOCKED(infomask) && !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { + /* Only a transaction lock the tuple, the lockmode must be share or exclusive */ + ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("New MultiXact feature isn't support in this version. Please upgrade to version: %d", + ENHANCED_TUPLE_LOCK_VERSION_NUM))); + } +} + /* * heap_lock_tuple - lock a tuple in shared or exclusive mode * @@ -5583,6 +6017,7 @@ void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) * tuple's cmax if lock is successful) * mode: indicates if shared or exclusive tuple lock is desired * nowait: if true, ereport rather than blocking if lock not available + * follow_updates: if true, follow the update chain to also lock descendant tuples. * * Output parameters: * *tuple: all fields filled in @@ -5600,38 +6035,10 @@ void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) * tuple was updated, and t_ctid is the location of the replacement tuple. * (t_xmax is needed to verify that the replacement tuple matches.) * - * - * NOTES: because the shared-memory lock table is of finite size, but users - * could reasonably want to lock large numbers of tuples, we do not rely on - * the standard lock manager to store tuple-level locks over the long term. - * Instead, a tuple is marked as locked by setting the current transaction's - * XID as its XMAX, and setting additional infomask bits to distinguish this - * usage from the more normal case of having deleted the tuple. When - * multiple transactions concurrently share-lock a tuple, the first locker's - * XID is replaced in XMAX with a MultiTransactionId representing the set of - * XIDs currently holding share-locks. - * - * When it is necessary to wait for a tuple-level lock to be released, the - * basic delay is provided by XactLockTableWait or MultiXactIdWait on the - * contents of the tuple's XMAX. However, that mechanism will release all - * waiters concurrently, so there would be a race condition as to which - * waiter gets the tuple, potentially leading to indefinite starvation of - * some waiters. 
The possibility of share-locking makes the problem much
- * worse --- a steady stream of share-lockers can easily block an exclusive
- * locker forever. To provide more reliable semantics about who gets a
- * tuple-level lock first, we use the standard lock manager. The protocol
- * for waiting for a tuple-level lock is really
- *     LockTuple()
- *     XactLockTableWait()
- *     mark tuple as locked by me
- *     UnlockTuple()
- * When there are multiple waiters, arbitration of who is to get the lock next
- * is provided by LockTuple(). However, at most one tuple-level lock will
- * be held or awaited per backend at any time, so we don't risk overflow
- * of the lock table. Note that incoming share-lockers are required to
- * do LockTuple as well, if there is any conflict, to ensure that they don't
- * starve out waiting exclusive-lockers. However, if there is not any active
- * conflict for a tuple, we don't incur any extra overhead.
+ * See README.tuplock for a thorough explanation of this mechanism.
 */
-TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer* buffer,
- CommandId cid, LockTupleMode mode, bool nowait, TM_FailureData *tmfd, bool allow_lock_self)
+TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer* buffer, CommandId cid,
+ LockTupleMode mode, bool nowait, bool follow_updates, TM_FailureData *tmfd, bool allow_lock_self)
 {
 TM_Result result;
 ItemPointer tid = &(tuple->t_self);
@@ -5641,7 +6048,8 @@ TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer* buffer,
 TransactionId xmax;
 uint16 old_infomask;
 uint16 new_infomask;
- LOCKMODE tuple_lock_type;
+ uint16 new_infomask2;
+ bool first_time = true;
 bool have_tuple_lock = false;
 Buffer vmbuffer = InvalidBuffer;
 BlockNumber block;
@@ -5654,7 +6062,10 @@ TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer* buffer,
 nowait = true;
 }

- tuple_lock_type = (mode == LockTupleShared) ? ShareLock : ExclusiveLock;
+ if (t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM) {
+ mode = CheckTupleLockCompatilibilty(mode);
+ follow_updates = false;
+ }

 block = ItemPointerGetBlockNumber(tid);
 *buffer = ReadBuffer(relation, block);
@@ -5733,131 +6144,326 @@ l3:
 } else if (result == TM_BeingModified) {
 TransactionId xwait;
 uint16 infomask;
+ uint16 infomask2;
+ bool require_sleep;
+ ItemPointerData t_ctid;

 /* must copy state data before unlocking buffer */
 xwait = HeapTupleGetRawXmax(tuple);
 infomask = tuple->t_data->t_infomask;
+ infomask2 = tuple->t_data->t_infomask2;
+ ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);

 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);

- /*
- * If we wish to acquire share lock, and the tuple is already
- * share-locked by a multixact that includes any subtransaction of the
- * current top transaction, then we effectively hold the desired lock
- * already. We *must* succeed without trying to take the tuple lock,
- * else we will deadlock against anyone waiting to acquire exclusive
- * lock. We don't need to make any state changes in this case.
- */
- if (mode == LockTupleShared && (infomask & HEAP_XMAX_IS_MULTI) && MultiXactIdIsCurrent((MultiXactId)xwait)) {
- Assert(infomask & HEAP_XMAX_SHARED_LOCK);
+ /*
+ * If any subtransaction of the current top transaction already holds a
+ * lock as strong as or stronger than what we're requesting, we
+ * effectively hold the desired lock already. We *must* succeed
+ * without trying to take the tuple lock, else we will deadlock against
+ * anyone wanting to acquire a stronger lock. 
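+ * For example, if some subtransaction of our top transaction already
+ * holds FOR UPDATE on this row, a later FOR SHARE request returns TM_Ok
+ * here without touching the lock manager at all.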
+ * + * Note we only do this the first time we loop on the TM_Result; + * there is no point in testing in subsequent passes, because + * evidently our own transaction cannot have acquired a new lock after + * the first time we checked. + */ + if (first_time) { + first_time = false; - result = TM_Ok; - goto out_unlocked; + if (infomask & HEAP_XMAX_IS_MULTI) { + MultiXactMember *members = NULL; + int nmembers = GetMultiXactIdMembers(xwait, &members); + + for (int i = 0; i < nmembers; i++) { + /* only consider members of our own transaction */ + if (!TransactionIdIsCurrentTransactionId(members[i].xid)) + continue; + + if (TUPLOCK_FROM_MXSTATUS(members[i].status) >= mode) { + pfree(members); + result = TM_Ok; + goto out_unlocked; + } + } + pfree_ext(members); + } else if (TransactionIdIsCurrentTransactionId(xwait)) { + switch (mode) { + case LockTupleKeyShare: + Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) || HEAP_XMAX_IS_SHR_LOCKED(infomask) || + HEAP_XMAX_IS_EXCL_LOCKED(infomask)); + result = TM_Ok; + goto out_unlocked; + case LockTupleShared: + if (HEAP_XMAX_IS_SHR_LOCKED(infomask) || HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { + result = TM_Ok; + goto out_unlocked; + } + break; + case LockTupleNoKeyExclusive: + if (HEAP_XMAX_IS_EXCL_LOCKED(infomask)) { + result = TM_Ok; + goto out_unlocked; + } + break; + case LockTupleExclusive: + if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) && (infomask2 & HEAP_KEYS_UPDATED)) { + result = TM_Ok; + goto out_unlocked; + } + break; + } + } } /* - * Acquire tuple lock to establish our priority for the tuple. - * LockTuple will release us when we are next-in-line for the tuple. - * We must do this even if we are share-locking. + * Initially assume that we will have to wait for the locking + * transaction(s) to finish. We check various cases below in which + * this can be turned off. + */ + require_sleep = true; + if (mode == LockTupleKeyShare) { + /* + * If we're requesting KeyShare, and there's no update present, we + * don't need to wait. Even if there is an update, we can still + * continue if the key hasn't been modified. + * + * However, if there are updates, we need to walk the update chain + * to mark future versions of the row as locked, too. That way, if + * somebody deletes that future version, we're protected against + * the key going away. This locking of future versions could block + * momentarily, if a concurrent transaction is deleting a key; or + * it could return a value to the effect that the transaction + * deleting the key has already committed. So we do this before + * re-locking the buffer; otherwise this would be prone to + * deadlocks. + * + * Note that the TID we're locking was grabbed before we unlocked + * the buffer. For it to change while we're not looking, the other + * properties we're testing for below after re-locking the buffer + * would also change, in which case we would restart this loop + * above. + */ + if (!(infomask2 & HEAP_KEYS_UPDATED)) { + bool updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask, infomask2); + + /* + * If there are updates, follow the update chain; bail out + * if that cannot be done. + */ + if (follow_updates && updated) { + TM_Result res = heap_lock_updated_tuple(relation, tuple, &t_ctid, GetCurrentTransactionId(), mode); + if (res != TM_Ok) { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } + } + + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + + /* + * Make sure it's still an appropriate lock, else start over. 
+ * Also, if it wasn't updated before we released the lock, but
+ * is updated now, we start over too; the reason is that we now
+ * need to follow the update chain to lock the new versions.
+ */
+ if (!HeapTupleIsOnlyLocked(tuple) &&
+ ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) || !updated))
+ goto l3;
+
+ /* Things look okay, so we can skip sleeping */
+ require_sleep = false;
+
+ /*
+ * Note we allow Xmax to change here; other updaters/lockers
+ * could have modified it before we grabbed the buffer lock.
+ * However, this is not a problem, because with the recheck we
+ * just did we ensure that they still don't conflict with the
+ * lock we want.
+ */
+ }
+ } else if (mode == LockTupleShared) {
+ /*
+ * If we're requesting Share, we can similarly avoid sleeping if
+ * there's no update and no exclusive lock present.
+ */
+ if (HEAP_XMAX_IS_LOCKED_ONLY(infomask, infomask2) && !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) {
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ /*
+ * Make sure it's still an appropriate lock, else start over.
+ * See above about allowing xmax to change.
+ */
+ if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask, tuple->t_data->t_infomask2) ||
+ HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
+ goto l3;
+ require_sleep = false;
+ }
+ } else if (mode == LockTupleNoKeyExclusive) {
+ /*
+ * If we're requesting NoKeyExclusive, we might also be able to
+ * avoid sleeping; just ensure that there is no conflicting lock
+ * already acquired.
+ */
+ if (infomask & HEAP_XMAX_IS_MULTI) {
+ if (!DoesMultiXactIdConflict((MultiXactId)xwait, mode)) {
+ /*
+ * No conflict, but if the xmax changed under us in the
+ * meantime, start over.
+ */
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+ if (xmax_infomask_changed(tuple->t_data->t_infomask, tuple->t_data->t_infomask2,
+ infomask, infomask2) || !TransactionIdEquals(HeapTupleGetRawXmax(tuple), xwait))
+ goto l3;
+
+ /* otherwise, we're good */
+ require_sleep = false;
+ }
+ } else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) {
+ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
+
+ /* if the xmax changed in the meantime, start over */
+ if (xmax_infomask_changed(tuple->t_data->t_infomask, tuple->t_data->t_infomask2, infomask, infomask2) ||
+ !TransactionIdEquals(HeapTupleGetRawXmax(tuple), xwait))
+ goto l3;
+ /* otherwise, we're good */
+ require_sleep = false;
+ }
+ }
+
+ /*
+ * As a check independent from those above, we can also avoid sleeping
+ * if the current transaction is the sole locker of the tuple. Note
+ * that the strength of the lock already held is irrelevant; this is
+ * not about recording the lock in Xmax (which will be done regardless
+ * of this optimization, below). Also, note that the cases where we
+ * hold a lock stronger than we are requesting are already handled
+ * above by not doing anything.
+ *
+ * Note we only deal with the non-multixact case here; MultiXactIdWait
+ * is well equipped to deal with this situation on its own. 
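+ *
+ * Concretely: a transaction that share-locked the row earlier and now
+ * requests FOR UPDATE must not sleep waiting for its own xid; we fall
+ * through and simply record the stronger lock in the new Xmax below.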
*/ - if (!have_tuple_lock) { - if (nowait) { - if (!ConditionalLockTuple(relation, tid, tuple_lock_type)) { - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg( - "could not obtain lock on row in relation \"%s\"", RelationGetRelationName(relation)))); - } - } else { - LockTuple(relation, tid, tuple_lock_type, true); - } - have_tuple_lock = true; + if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) && TransactionIdIsCurrentTransactionId(xwait)) { + /* ... but if the xmax changed in the meantime, start over */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + if (xmax_infomask_changed(tuple->t_data->t_infomask, tuple->t_data->t_infomask2, infomask, infomask2) || + !TransactionIdEquals(HeapTupleGetRawXmax(tuple), xwait)) + goto l3; + Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask, tuple->t_data->t_infomask2)); + require_sleep = false; } - if (mode == LockTupleShared && (infomask & HEAP_XMAX_SHARED_LOCK)) { - /* - * Acquiring sharelock when there's at least one sharelocker - * already. We need not wait for him/them to complete. - */ - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + /* + * By here, we either have already acquired the buffer exclusive lock, + * or we must wait for the locking transaction or multixact; so below + * we ensure that we grab buffer lock after the sleep. + */ + if (require_sleep) { /* - * Make sure it's still a shared lock, else start over. (It's OK - * if the ownership of the shared lock has changed, though.) + * Acquire tuple lock to establish our priority for the tuple, or + * die trying. LockTuple will release us when we are next-in-line + * for the tuple. We must do this even if we are share-locking. + * + * If we are forced to "start over" below, we keep the tuple lock; + * this arranges that we stay at the head of the line while rechecking + * tuple state. */ - if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK)) { - goto l3; - } - } else if (infomask & HEAP_XMAX_IS_MULTI) { - /* wait for multixact to end */ - if (nowait) { - if (!ConditionalMultiXactIdWait((MultiXactId)xwait)) { - ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg("could not obtain lock on row in relation \"%s\"", RelationGetRelationName(relation)))); + if (!have_tuple_lock) { + if (nowait) { + if (!ConditionalLockTupleTuplock(relation, tid, mode)) { + ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("could not obtain lock on row in relation \"%s\"", + RelationGetRelationName(relation)))); + } + } else { + LOCK_TUPLE_TUP_LOCK(relation, tid, mode); } + have_tuple_lock = true; + } + if (infomask & HEAP_XMAX_IS_MULTI) { + MultiXactStatus status = GetMXactStatusForLock(mode, false); + + /* We only ever lock tuples, never update them */ + if (status >= MultiXactStatusNoKeyUpdate) + ereport(ERROR, (errmsg("invalid lock mode in heap_lock_tuple"))); + + /* wait for multixact to end */ + if (nowait) { + if (!ConditionalMultiXactIdWait((MultiXactId)xwait, status, NULL)) + ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg( + "could not obtain lock on row in relation \"%s\"", RelationGetRelationName(relation)))); + } else { + MultiXactIdWait((MultiXactId)xwait, status, NULL); + } + + /* + * Of course, the multixact might not be done here: if we're + * requesting a light lock mode, other transactions with light + * locks could still be alive, as well as locks owned by our + * own xact or other subxacts of this backend. We need to + * preserve the surviving MultiXact members. 
Note that it + * isn't absolutely necessary in the latter case, but doing so + * is simpler. + */ } else { - MultiXactIdWait((MultiXactId)xwait, true); + /* wait for regular transaction to end */ + if (nowait) { + if (!ConditionalXactLockTableWait(xwait)) + ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg( + "could not obtain lock on row in relation \"%s\"", RelationGetRelationName(relation)))); + } else { + XactLockTableWait(xwait, true); + } + } + + /* if there are updates, follow the update chain */ + if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask, infomask2)) { + TM_Result res = heap_lock_updated_tuple(relation, tuple, &t_ctid, GetCurrentTransactionId(), mode); + if (res != TM_Ok) { + result = res; + /* recovery code expects to have buffer lock held */ + LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); + goto failed; + } } LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); /* - * If xwait had just locked the tuple then some other xact could - * update this tuple before we get to this point. Check for xmax - * change, and start over if so. + * xwait is done, but if xwait had just locked the tuple then + * some other xact could update this tuple before we get to + * this point. Check for xmax change, and start over if so. */ - if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || + if (xmax_infomask_changed(tuple->t_data->t_infomask, tuple->t_data->t_infomask2, infomask, infomask2) || !TransactionIdEquals(HeapTupleGetRawXmax(tuple), xwait)) { goto l3; } - /* - * You might think the multixact is necessarily done here, but not - * so: it could have surviving members, namely our own xact or - * other subxacts of this backend. It is legal for us to lock the - * tuple in either case, however. We don't bother changing the - * on-disk hint bits since we are about to overwrite the xmax - * altogether. - */ - } else { - /* wait for regular transaction to end */ - if (nowait) { - if (!ConditionalXactLockTableWait(xwait)) - ereport(ERROR, - (errcode(ERRCODE_LOCK_NOT_AVAILABLE), - errmsg( - "could not obtain lock on row in relation \"%s\"", RelationGetRelationName(relation)))); - } else { - XactLockTableWait(xwait, true); + if (!(infomask & HEAP_XMAX_IS_MULTI)) { + /* + * Otherwise check if it committed or aborted. Note we cannot + * be here if the tuple was only locked by somebody who didn't + * conflict with us; that should have been handled above. So + * that transaction must necessarily be gone by now. But don't + * check for this in the multixact case, because some locker + * transactions might still be running. + */ + UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); } - - LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); - - /* - * xwait is done, but if xwait had just locked the tuple then some - * other xact could update this tuple before we get to this point. - * Check for xmax change, and start over if so. - */ - if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) || - !TransactionIdEquals(HeapTupleGetRawXmax(tuple), xwait)) { - goto l3; - } - - /* Otherwise check if it committed or aborted */ - UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); } + /* By here, we're certain that we hold buffer exclusive lock again */ + /* * We may lock if previous xmax aborted, or if it committed but only - * locked the tuple without updating it. The case where we didn't - * wait because we are joining an existing shared lock is correctly - * handled, too. + * locked the tuple without updating it; or if we didn't have to wait + * at all for whatever reason. 
*/ - if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) { + if (!require_sleep || (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || + HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask, tuple->t_data->t_infomask2) || + HeapTupleIsOnlyLocked(tuple)) { result = TM_Ok; } else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)){ result = TM_Updated; @@ -5866,13 +6472,14 @@ l3: } } +failed: if (result != TM_Ok) { Assert(result == TM_SelfModified || result == TM_Updated || result == TM_Deleted); Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); Assert(result != TM_Updated || !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)); tmfd->ctid = tuple->t_data->t_ctid; - tmfd->xmax = HeapTupleGetRawXmax(tuple); + tmfd->xmax = HeapTupleGetUpdateXid(tuple); if (result == TM_SelfModified) { tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data, page); } else { @@ -5881,105 +6488,6 @@ l3: goto out_locked; } - /* - * We might already hold the desired lock (or stronger), possibly under a - * different subtransaction of the current top transaction. If so, there - * is no need to change state or issue a WAL record. We already handled - * the case where this is true for xmax being a MultiXactId, so now check - * for cases where it is a plain TransactionId. - * - * Note in particular that this covers the case where we already hold - * exclusive lock on the tuple and the caller only wants shared lock. It - * would certainly not do to give up the exclusive lock. - */ - xmax = HeapTupleGetRawXmax(tuple); - old_infomask = tuple->t_data->t_infomask; - - if (!(old_infomask & (HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED | HEAP_XMAX_IS_MULTI)) && - (mode == LockTupleShared ? (old_infomask & HEAP_IS_LOCKED) : (old_infomask & HEAP_XMAX_EXCL_LOCK)) && - TransactionIdIsCurrentTransactionId(xmax)) { - LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); - /* Probably can't hold tuple lock here, but may as well check */ - if (have_tuple_lock) { - UnlockTuple(relation, tid, tuple_lock_type); - } - result = TM_Ok; - goto out_unlocked; - } - - /* - * Compute the new xmax and infomask to store into the tuple. Note we do - * not modify the tuple just yet, because that would leave it in the wrong - * state if multixact.c elogs. - */ - xid = GetCurrentTransactionId(); - - new_infomask = - old_infomask & ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED); - - if (mode == LockTupleShared) { - /* - * If this is the first acquisition of a shared lock in the current - * transaction, set my per-backend OldestMemberMXactId setting. We can - * be certain that the transaction will never become a member of any - * older MultiXactIds than that. (We have to do this even if we end - * up just using our own TransactionId below, since some other backend - * could incorporate our XID into a MultiXact immediately afterwards.) - */ - MultiXactIdSetOldestMember(); - - new_infomask |= HEAP_XMAX_SHARED_LOCK; - - /* - * Check to see if we need a MultiXactId because there are multiple - * lockers. - * - * HeapTupleSatisfiesUpdate will have set the HEAP_XMAX_INVALID bit if - * the xmax was a MultiXactId but it was not running anymore. There is - * a race condition, which is that the MultiXactId may have finished - * since then, but that uncommon case is handled within - * MultiXactIdExpand. - * - * There is a similar race condition possible when the old xmax was a - * regular TransactionId. 
We test TransactionIdIsInProgress again - * just to narrow the window, but it's still possible to end up - * creating an unnecessary MultiXactId. Fortunately this is harmless. - */ - if (!(old_infomask & (HEAP_XMAX_INVALID | HEAP_XMAX_COMMITTED))) { - if (old_infomask & HEAP_XMAX_IS_MULTI) { - /* - * If the XMAX is already a MultiXactId, then we need to - * expand it to include our own TransactionId. - */ - xid = MultiXactIdExpand((MultiXactId)xmax, xid); - new_infomask |= HEAP_XMAX_IS_MULTI; - } else if (TransactionIdIsInProgress(xmax)) { - /* - * If the XMAX is a valid TransactionId, then we need to - * create a new MultiXactId that includes both the old locker - * and our own TransactionId. - */ - xid = MultiXactIdCreate(xmax, xid); - new_infomask |= HEAP_XMAX_IS_MULTI; - } else { - /* - * Can get here iff HeapTupleSatisfiesUpdate saw the old xmax - * as running, but it finished before - * TransactionIdIsInProgress() got to run. Treat it like - * there's no locker in the tuple. - */ - } - } else { - /* - * There was no previous locker, so just insert our own - * TransactionId. - */ - } - } else { - /* We want an exclusive lock on the tuple */ - new_infomask |= HEAP_XMAX_EXCL_LOCK; - } - /* * If we didn't pin the visibility map page and the page has become all * visible while we were busy locking the buffer, or during some @@ -5996,6 +6504,33 @@ l3: goto l3; } + xmax = HeapTupleGetRawXmax(tuple); + old_infomask = tuple->t_data->t_infomask; + + /* + * If this is the first possibly-multixact-able operation in the + * current transaction, set my per-backend OldestMemberMXactId setting. + * We can be certain that the transaction will never become a member of + * any older MultiXactIds than that. (We have to do this even if we + * end up just using our own TransactionId below, since some other + * backend could incorporate our XID into a MultiXact immediately + * afterwards.) + */ + MultiXactIdSetOldestMember(); + + /* + * Compute the new xmax and infomask to store into the tuple. Note we do + * not modify the tuple just yet, because that would leave it in the wrong + * state if multixact.c elogs. + */ + ComputeNewXmaxInfomask(xmax, old_infomask, tuple->t_data->t_infomask2, GetCurrentTransactionId(), + mode, false, &xid, &new_infomask, &new_infomask2); + + if (t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM) { + CheckInfomaskCompatilibilty(xid, new_infomask); + new_infomask2 &= ~(HEAP_KEYS_UPDATED | HEAP_XMAX_LOCK_ONLY); + } + if (TransactionIdIsNormal(xid)) { (void)heap_page_prepare_for_xid(relation, *buffer, xid, (new_infomask & HEAP_XMAX_IS_MULTI) ? true : false); } @@ -6008,13 +6543,31 @@ l3: * Store transaction information of xact locking the tuple. * * Note: Cmax is meaningless in this context, so don't set it; this avoids - * possibly generating a useless combo CID. + * possibly generating a useless combo CID. Moreover, if we're locking a + * previously updated tuple, it's important to preserve the Cmax. + * + * Also reset the HOT UPDATE bit, but only if there's no update; otherwise + * we would break the HOT chain. 
*/ - tuple->t_data->t_infomask = new_infomask; - HeapTupleHeaderClearHotUpdated(tuple->t_data); + tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS; + tuple->t_data->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED); + tuple->t_data->t_infomask |= new_infomask; + tuple->t_data->t_infomask2 |= new_infomask2; + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask, new_infomask2)) { + HeapTupleHeaderClearHotUpdated(tuple->t_data); + } HeapTupleHeaderSetXmax(page, tuple->t_data, xid); - /* Make sure there is no forward chain link in t_ctid */ - tuple->t_data->t_ctid = *tid; + + /* + * Make sure there is no forward chain link in t_ctid. Note that in the + * cases where the tuple has been updated, we must not overwrite t_ctid, + * because it was set by the updater. Moreover, if the tuple has been + * updated, we need to follow the update chain to lock the new versions + * of the tuple as well. + */ + if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask, new_infomask2)) { + tuple->t_data->t_ctid = *tid; + } /* Clear bit on visibility map if needed */ if (PageIsAllVisible(BufferGetPage(*buffer))) { @@ -6038,18 +6591,25 @@ l3: if (RelationNeedsWAL(relation)) { xl_heap_lock xlrec; XLogRecPtr recptr; + bool useOldXlog; xlrec.locking_xid = xid; xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0); xlrec.shared_lock = (mode == LockTupleShared); + xlrec.infobits_set = ComputeInfobits(new_infomask, tuple->t_data->t_infomask2); + xlrec.lock_updated = false; + useOldXlog = t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM || + !(xlrec.infobits_set & XLHL_XMAX_IS_MULTI); +#ifdef ENABLE_MULTIPLE_NODES + useOldXlog = true; +#endif XLogBeginInsert(); - XLogRegisterData((char*)&xlrec, SizeOfHeapLock); + XLogRegisterData((char *)&xlrec, useOldXlog ? SizeOfOldHeapLock : SizeOfHeapLock); XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); - recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); - + recptr = XLogInsert(RM_HEAP_ID, useOldXlog ? XLOG_HEAP_LOCK : XLOG_HEAP_LOCK | XLOG_TUPLE_LOCK_UPGRADE_FLAG); PageSetLSN(page, recptr); } @@ -6073,12 +6633,602 @@ out_unlocked: * release the lmgr tuple lock, if we had it. */ if (have_tuple_lock) { - UnlockTuple(relation, tid, tuple_lock_type); + UNLOCK_TUPLE_TUP_LOCK(relation, tid, mode); } return result; } +/* + * Given an original set of Xmax and infomask, and a transaction (identified by + * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and + * corresponding infomasks to use on the tuple. + * + * Note that this might have side effects such as creating a new MultiXactId. + * + * Most callers will have called HeapTupleSatisfiesUpdate before this function; + * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId + * but it was not running anymore. There is a race condition, which is that the + * MultiXactId may have finished since then, but that uncommon case is handled + * either here, or within MultiXactIdExpand. + * + * There is a similar race condition possible when the old xmax was a regular + * TransactionId. We test TransactionIdIsInProgress again just to narrow the + * window, but it's still possible to end up creating an unnecessary + * MultiXactId. Fortunately this is harmless. 
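+ *
+ * A sketch of one possible path through the cases below (assumed inputs,
+ * for illustration only): a single in-progress share-locker S, and our
+ * transaction X acquiring LockTupleExclusive with is_update = false:
+ *
+ *   in:  xmax = S, old_infomask has HEAP_XMAX_SHARED_LOCK
+ *   out: *result_xmax = MultiXactIdCreate(S, ForShare, X, ForUpdate);
+ *        *result_infomask gets HEAP_XMAX_IS_MULTI | HEAP_XMAX_EXCL_LOCK;
+ *        *result_infomask2 gets HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED
+ *        (via GetMultiXactIdHintBits, since no member is an update).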
+ */ +static void ComputeNewXmaxInfomask(TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, + TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, + uint16 *result_infomask, uint16 *result_infomask2) +{ + TransactionId new_xmax; + uint16 new_infomask; + uint16 new_infomask2; + + Assert(TransactionIdIsCurrentTransactionId(add_to_xmax)); + +l5: + new_infomask = 0; + new_infomask2 = 0; + if (old_infomask & HEAP_XMAX_INVALID) { + /* + * No previous locker; we just insert our own TransactionId. + * + * Note that it's critical that this case be the first one checked, + * because there are several blocks below that come back to this one + * to implement certain optimizations; old_infomask might contain + * other dirty bits in those cases, but we don't really care. + */ + if (is_update) { + new_xmax = add_to_xmax; + if (mode == LockTupleExclusive) + new_infomask2 |= HEAP_KEYS_UPDATED; + } else { + new_infomask2 |= HEAP_XMAX_LOCK_ONLY; + switch (mode) { + case LockTupleKeyShare: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_KEYSHR_LOCK; + break; + case LockTupleShared: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_SHARED_LOCK; + break; + case LockTupleNoKeyExclusive: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_EXCL_LOCK; + break; + case LockTupleExclusive: + new_xmax = add_to_xmax; + new_infomask |= HEAP_XMAX_EXCL_LOCK; + new_infomask2 |= HEAP_KEYS_UPDATED; + break; + default: + new_xmax = InvalidTransactionId; /* silence compiler */ + ereport(ERROR, (errmsg("invalid lock mode"))); + } + } + } else if (old_infomask & HEAP_XMAX_IS_MULTI) { + MultiXactStatus new_status; + + /* + * Currently we don't allow XMAX_COMMITTED to be set for multis, + * so cross-check. + */ + Assert(!(old_infomask & HEAP_XMAX_COMMITTED)); + + /* + * If the XMAX is already a MultiXactId, then we need to expand it to + * include add_to_xmax; but if all the members were lockers and are all + * gone, we can do away with the IS_MULTI bit and just set add_to_xmax + * as the only locker/updater. If all lockers are gone and we have an + * updater that aborted, we can also do without a multi. + * + * The cost of doing GetMultiXactIdMembers would be paid by + * MultiXactIdExpand if we weren't to do this, so this check is not + * incurring extra work anyhow. + */ + if (!MultiXactIdIsRunning(xmax)) { + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask, old_infomask2) || + !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax, old_infomask, old_infomask2))) { + /* + * Reset these bits and restart; otherwise fall through to + * create a new multi below. + */ + old_infomask &= ~HEAP_XMAX_IS_MULTI; + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + } + + new_status = GetMXactStatusForLock(mode, is_update); + + new_xmax = MultiXactIdExpand((MultiXactId)xmax, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } else if (old_infomask & HEAP_XMAX_COMMITTED) { + /* + * It's a committed update, so we need to preserve him as updater of + * the tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; + + if (old_infomask2 & HEAP_KEYS_UPDATED) { + status = MultiXactStatusUpdate; + } else { + status = MultiXactStatusNoKeyUpdate; + } + + new_status = GetMXactStatusForLock(mode, is_update); + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. 
+ */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } else if (TransactionIdIsInProgress(xmax)) { + /* + * If the XMAX is a valid, in-progress TransactionId, then we need to + * create a new MultiXactId that includes both the old locker or + * updater and our own TransactionId. + */ + MultiXactStatus new_status; + MultiXactStatus old_status; + LockTupleMode old_mode; + + if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask, old_infomask2)) { + if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) + old_status = MultiXactStatusForKeyShare; + else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) + old_status = MultiXactStatusForShare; + else { + if (old_infomask2 & HEAP_KEYS_UPDATED) { + old_status = MultiXactStatusForUpdate; + } else { + old_status = MultiXactStatusForNoKeyUpdate; + } + } + } else { + /* it's an update, but which kind? */ + if (old_infomask2 & HEAP_KEYS_UPDATED) { + old_status = MultiXactStatusUpdate; + } else { + old_status = MultiXactStatusNoKeyUpdate; + } + } + + old_mode = TUPLOCK_FROM_MXSTATUS(old_status); + + /* + * If the lock to be acquired is for the same TransactionId as the + * existing lock, there's an optimization possible: consider only the + * strongest of both locks as the only one present, and restart. + */ + if (xmax == add_to_xmax) { + /* + * Note that it's not possible for the original tuple to be updated: + * we wouldn't be here because the tuple would have been invisible and + * we wouldn't try to update it. As a subtlety, this code can also + * run when traversing an update chain to lock future versions of a + * tuple. But we wouldn't be here either, because the add_to_xmax + * would be different from the original updater. + */ + Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask, old_infomask2)); + + /* acquire the strongest of both */ + if (mode < old_mode) + mode = old_mode; + /* mustn't touch is_update */ + + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + /* otherwise, just fall back to creating a new multixact */ + new_status = GetMXactStatusForLock(mode, is_update); + new_xmax = MultiXactIdCreate(xmax, old_status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask, old_infomask2) && TransactionIdDidCommit(xmax)) { + /* + * It's a committed update, so we gotta preserve him as updater of the + * tuple. + */ + MultiXactStatus status; + MultiXactStatus new_status; + + if (old_infomask2 & HEAP_KEYS_UPDATED) { + status = MultiXactStatusUpdate; + } else { + status = MultiXactStatusNoKeyUpdate; + } + + new_status = GetMXactStatusForLock(mode, is_update); + /* + * since it's not running, it's obviously impossible for the old + * updater to be identical to the current one, so we need not check + * for that case as we do in the block above. + */ + new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); + GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); + } else { + /* + * Can get here iff the locking/updating transaction was running when + * the infomask was extracted from the tuple, but finished before + * TransactionIdIsInProgress got to run. Deal with it as if there was + * no locker at all in the first place. + */ + old_infomask |= HEAP_XMAX_INVALID; + goto l5; + } + + *result_infomask = new_infomask; + *result_infomask2 = new_infomask2; + *result_xmax = new_xmax; +} + +/* + * Subroutine for heap_lock_updated_tuple_rec. 
+ *
+ * Given a hypothetical multixact status held by the transaction identified
+ * with the given xid, does the current transaction need to wait, fail, or can
+ * it continue if it wanted to acquire a lock of the given mode? "needwait"
+ * is set to true if waiting is necessary; if it can continue, then TM_Ok is
+ * returned. If the lock is already held by the current transaction, return
+ * TM_SelfUpdated. In case of a conflict with another transaction, a
+ * different TM_Result code is returned.
+ *
+ * The held status is said to be hypothetical because it might correspond to a
+ * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
+ * way for simplicity of API.
+ */
+static TM_Result test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, LockTupleMode mode,
+    bool *needwait)
+{
+    MultiXactStatus wantedstatus;
+
+    *needwait = false;
+    wantedstatus = GetMXactStatusForLock(mode, false);
+
+    /*
+     * Note: we *must* check TransactionIdIsInProgress before
+     * TransactionIdDidAbort/Commit; see comment at top of
+     * heapam_visibility.cpp for an explanation.
+     */
+    if (TransactionIdIsCurrentTransactionId(xid)) {
+        /*
+         * The tuple has already been locked by our own transaction. This is
+         * very rare but can happen if multiple transactions are trying to
+         * lock an ancient version of the same tuple.
+         */
+        return TM_SelfUpdated;
+    } else if (TransactionIdIsInProgress(xid)) {
+        /*
+         * If the locking transaction is running, what we do depends on
+         * whether the lock modes conflict: if they do, then we must wait for
+         * it to finish; otherwise we can fall through to lock this tuple
+         * version without waiting.
+         */
+        if (DoLockModesConflict(LOCKMODE_FROM_MXSTATUS(status), LOCKMODE_FROM_MXSTATUS(wantedstatus))) {
+            *needwait = true;
+        }
+
+        /*
+         * If we set needwait above, then this value doesn't matter;
+         * otherwise, this value signals to caller that it's okay to proceed.
+         */
+        return TM_Ok;
+    } else if (TransactionIdDidAbort(xid))
+        return TM_Ok;
+    else if (TransactionIdDidCommit(xid)) {
+        /*
+         * If the updating transaction committed, what we do depends on
+         * whether the lock modes conflict: if they do, then we must report
+         * error to caller. But if they don't, we can fall through to lock it.
+         */
+        if (DoLockModesConflict(LOCKMODE_FROM_MXSTATUS(status), LOCKMODE_FROM_MXSTATUS(wantedstatus)))
+            /* bummer */
+            return TM_Updated;
+
+        return TM_Ok;
+    }
+
+    /* Not in progress, not aborted, not committed -- must have crashed */
+    return TM_Ok;
+}
+
+/*
+ * Recursive part of heap_lock_updated_tuple
+ *
+ * Fetch the tuple pointed to by tid in rel, and mark it as locked by the
+ * given xid with the given mode; if this tuple is updated, recurse to lock
+ * the new version as well.
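+ *
+ * (Sketch, for orientation only: given versions v1 -> v2 -> v3 chained
+ * through t_ctid, with v1 already locked by our caller, the loop below
+ * locks v2, follows its t_ctid to v3, locks v3, and stops once a
+ * version's t_ctid points to itself. Despite the name, the "recursion"
+ * is implemented iteratively.)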
+ */
+static TM_Result heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, LockTupleMode mode)
+{
+    ItemPointerData tupid;
+    HeapTupleData mytup;
+    Buffer buf;
+    uint16 new_infomask;
+    uint16 new_infomask2;
+    uint16 old_infomask;
+    uint16 old_infomask2;
+    TransactionId xmax;
+    TransactionId new_xmax;
+    Buffer vmbuffer = InvalidBuffer;
+    BlockNumber block;
+    TM_Result result;
+
+    ItemPointerCopy(tid, &tupid);
+
+    for (;;) {
+        new_infomask = 0;
+        new_xmax = InvalidTransactionId;
+        block = ItemPointerGetBlockNumber(&tupid);
+        ItemPointerCopy(&tupid, &(mytup.t_self));
+
+        if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL)) {
+            /*
+             * if we fail to find the updated version of the tuple, it's
+             * because it was vacuumed/pruned away after its creator
+             * transaction aborted. So behave as if we got to the end of the
+             * chain, and there's no further tuple to lock: return success to
+             * caller.
+             */
+            result = TM_Ok;
+            goto out_unlocked;
+        }
+
+l4:
+        CHECK_FOR_INTERRUPTS();
+
+        /*
+         * Before locking the buffer, pin the visibility map page if it
+         * appears to be necessary. Since we haven't got the lock yet,
+         * someone else might be in the middle of changing this, so we'll
+         * need to recheck after we have the lock.
+         */
+        if (PageIsAllVisible(BufferGetPage(buf))) {
+            visibilitymap_pin(rel, block, &vmbuffer);
+        }
+
+        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+        old_infomask = mytup.t_data->t_infomask;
+        old_infomask2 = mytup.t_data->t_infomask2;
+        xmax = HeapTupleGetRawXmax(&mytup);
+
+        /*
+         * If this tuple version has been updated or locked by some concurrent
+         * transaction(s), what we do depends on whether our lock mode
+         * conflicts with what those other transactions hold, and also on the
+         * status of them.
+         */
+        if (!(old_infomask & HEAP_XMAX_INVALID)) {
+            TransactionId rawxmax;
+            bool needwait;
+
+            rawxmax = HeapTupleGetRawXmax(&mytup);
+            if (old_infomask & HEAP_XMAX_IS_MULTI) {
+                int nmembers;
+                int i;
+                MultiXactMember *members;
+
+                nmembers = GetMultiXactIdMembers(rawxmax, &members);
+                for (i = 0; i < nmembers; i++) {
+                    result = test_lockmode_for_conflict(members[i].status, members[i].xid, mode, &needwait);
+
+                    /*
+                     * If the tuple was already locked by ourselves in a
+                     * previous iteration of this (say heap_lock_tuple was
+                     * forced to restart the locking loop because of a change
+                     * in xmax), then we hold the lock already on this tuple
+                     * version and we don't need to do anything; and this is
+                     * not an error condition either. We just need to skip
+                     * this tuple and continue locking the next version in the
+                     * update chain.
+                     */
+                    if (result == TM_SelfUpdated) {
+                        pfree(members);
+                        goto next;
+                    }
+
+                    if (needwait) {
+                        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+                        XactLockTableWait(members[i].xid);
+                        pfree_ext(members);
+                        goto l4;
+                    }
+
+                    if (result != TM_Ok) {
+                        pfree_ext(members);
+                        goto out_locked;
+                    }
+                }
+
+                pfree_ext(members);
+            } else {
+                MultiXactStatus status = MultiXactStatusForShare;
+
+                /*
+                 * For a non-multi Xmax, we first need to compute the
+                 * corresponding MultiXactStatus by using the infomask bits.
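+                 *
+                 * (Assumed summary of the cascade below:
+                 *   KEYSHR bit            -> MultiXactStatusForKeyShare
+                 *   SHR bit               -> MultiXactStatusForShare
+                 *   EXCL + KEYS_UPDATED   -> MultiXactStatusForUpdate
+                 *   EXCL, no KEYS_UPDATED -> MultiXactStatusForNoKeyUpdate
+                 *   not locked-only       -> MultiXactStatus[NoKey]Update.)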
+                 */
+                if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask, old_infomask2)) {
+                    if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) {
+                        status = MultiXactStatusForKeyShare;
+                    } else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) {
+                        status = MultiXactStatusForShare;
+                    } else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) {
+                        if (old_infomask2 & HEAP_KEYS_UPDATED) {
+                            status = MultiXactStatusForUpdate;
+                        } else {
+                            status = MultiXactStatusForNoKeyUpdate;
+                        }
+                    } else {
+                        /*
+                         * LOCK_ONLY present alone (a pg_upgraded tuple
+                         * marked as share-locked in the old cluster) shouldn't
+                         * be seen in the middle of an update chain.
+                         */
+                        ereport(ERROR, (errmsg("invalid lock status in tuple")));
+                    }
+                } else {
+                    /* it's an update, but which kind? */
+                    if (old_infomask2 & HEAP_KEYS_UPDATED) {
+                        status = MultiXactStatusUpdate;
+                    } else {
+                        status = MultiXactStatusNoKeyUpdate;
+                    }
+                }
+
+                result = test_lockmode_for_conflict(status, rawxmax, mode, &needwait);
+
+                /*
+                 * If the tuple was already locked by ourselves in a previous
+                 * iteration of this (say heap_lock_tuple was forced to
+                 * restart the locking loop because of a change in xmax), then
+                 * we hold the lock already on this tuple version and we don't
+                 * need to do anything; and this is not an error condition
+                 * either. We just need to skip this tuple and continue
+                 * locking the next version in the update chain.
+                 */
+                if (result == TM_SelfUpdated)
+                    goto next;
+
+                if (needwait) {
+                    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+                    XactLockTableWait(rawxmax);
+                    goto l4;
+                }
+
+                if (result != TM_Ok) {
+                    goto out_locked;
+                }
+            }
+        }
+
+        /* compute the new Xmax and infomask values for the tuple ... */
+        ComputeNewXmaxInfomask(xmax, old_infomask, mytup.t_data->t_infomask2, xid, mode,
+            false, &new_xmax, &new_infomask, &new_infomask2);
+
+        START_CRIT_SECTION();
+
+        /* ... and set them */
+        mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
+        mytup.t_data->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED);
+        mytup.t_data->t_infomask |= new_infomask;
+        mytup.t_data->t_infomask2 |= new_infomask2;
+        HeapTupleSetXmax(&mytup, new_xmax);
+
+        MarkBufferDirty(buf);
+
+        /* XLOG stuff */
+        if (RelationNeedsWAL(rel)) {
+            xl_heap_lock xlrec;
+            XLogRecPtr recptr;
+            Page page = BufferGetPage(buf);
+
+            XLogBeginInsert();
+            XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+
+            xlrec.locking_xid = new_xmax;
+            xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
+            xlrec.xid_is_mxact = ((new_infomask & HEAP_XMAX_IS_MULTI) != 0);
+            xlrec.infobits_set = ComputeInfobits(new_infomask, new_infomask2);
+            xlrec.lock_updated = true;
+            /*
+             * We don't record shared_lock; this field is reserved for
+             * compatibility. This xlog record can only be written by the new
+             * version.
+             */
+            XLogRegisterData((char *)&xlrec, SizeOfHeapLock);
+
+            recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK | XLOG_TUPLE_LOCK_UPGRADE_FLAG);
+
+            PageSetLSN(page, recptr);
+        }
+
+        END_CRIT_SECTION();
+
+next:
+        /* if we find the end of update chain, we're done. */
+        if ((mytup.t_data->t_infomask & HEAP_XMAX_INVALID) || ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
+            HeapTupleIsOnlyLocked(&mytup)) {
+            result = TM_Ok;
+            goto out_locked;
+        }
+
+        /* tail recursion */
+        ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
+        UnlockReleaseBuffer(buf);
+        if (vmbuffer != InvalidBuffer)
+            ReleaseBuffer(vmbuffer);
+    }
+
+    result = TM_Ok;
+
+out_locked:
+    UnlockReleaseBuffer(buf);
+
+out_unlocked:
+    if (vmbuffer != InvalidBuffer) {
+        ReleaseBuffer(vmbuffer);
+    }
+
+    return result;
+}
+
+/*
+ * heap_lock_updated_tuple
+ *     Follow update chain when locking an updated tuple, acquiring locks (row
+ *     marks) on the updated versions.
+ *
+ * The initial tuple is assumed to be already locked.
+ *
+ * This function doesn't check visibility, it just unconditionally marks the
+ * tuple(s) as locked. If any tuple in the updated chain is being deleted
+ * concurrently (or updated with the key being modified), sleep until the
+ * transaction doing it is finished.
+ *
+ * Note that we don't acquire heavyweight tuple locks on the tuples we walk
+ * when we have to wait for other transactions to release them, as opposed to
+ * what heap_lock_tuple does. The reason is that having more than one
+ * transaction walking the chain is probably uncommon enough that risk of
+ * starvation is not likely: one of the preconditions for being here is that
+ * the snapshot in use predates the update that created this tuple (because we
+ * started at an earlier version of the tuple), but at the same time such a
+ * transaction cannot be using repeatable read or serializable isolation
+ * levels, because that would lead to a serializability failure.
+ */
+static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, TransactionId xid,
+    LockTupleMode mode)
+{
+    if (t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM) {
+        ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+            errmsg("New tuple lock isn't supported in this version. Please upgrade to version: %d",
+                ENHANCED_TUPLE_LOCK_VERSION_NUM)));
+    }
+
+    if (!ItemPointerEquals(&tuple->t_self, ctid)) {
+        /*
+         * If this is the first possibly-multixact-able operation in the
+         * current transaction, set my per-backend OldestMemberMXactId
+         * setting. We can be certain that the transaction will never become
+         * a member of any older MultiXactIds than that. (We have to do this
+         * even if we end up just using our own TransactionId below, since
+         * some other backend could incorporate our XID into a MultiXact
+         * immediately afterwards.)
+         */
+        MultiXactIdSetOldestMember();
+
+        return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
+    }
+
+    /* nothing to lock */
+    return TM_Ok;
+}
+
 /*
  * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
  *
@@ -6213,7 +7363,7 @@ void heap_inplace_update(Relation relation, HeapTuple tuple)
  * tuple status. Also, getting exclusive lock makes it safe to adjust the
  * infomask bits.
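 *
 * (Hypothetical caller sketch, only to illustrate the extended signature;
 * "frozen", "nfrozen" and "offnum" are illustrative names, not from this
 * patch:
 *
 *     bool changedMxid = false;
 *     if (heap_freeze_tuple(tuple, cutoff_xid, cutoff_multi, &changedMxid))
 *         frozen[nfrozen++] = offnum;   // remember for log_heap_freeze
 * )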
 */
-bool heap_freeze_tuple(HeapTuple tuple, TransactionId cutoff_xid)
+bool heap_freeze_tuple(HeapTuple tuple, TransactionId cutoff_xid, MultiXactId cutoff_multi, bool *changedMultiXid)
 {
     bool changed = false;
     TransactionId xid;
@@ -6236,10 +7386,12 @@ bool heap_freeze_tuple(HeapTuple tuple, TransactionId cutoff_xid)
         changed = true;
     }
 
+    xid = HeapTupleGetRawXmax(tuple);
     if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)) {
-        xid = HeapTupleGetRawXmax(tuple);
         if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, cutoff_xid)) {
-            if (!RecoveryInProgress() && !(tuple->t_data->t_infomask & HEAP_IS_LOCKED) && TransactionIdDidCommit(xid)) {
+            if (!RecoveryInProgress() &&
+                !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask, tuple->t_data->t_infomask2) &&
+                TransactionIdDidCommit(xid)) {
                 ereport(ERROR,
                     (errcode(ERRCODE_DATA_CORRUPTED), errmsg_internal("cannot freeze commited xmax %lu", xid)));
             }
@@ -6257,22 +7409,34 @@
             changed = true;
         }
     } else {
-        /* ----------
-         * XXX perhaps someday we should zero out very old MultiXactIds here?
-         *
-         * The only way a stale MultiXactId could pose a problem is if a
-         * tuple, having once been multiply-share-locked, is not touched by
-         * any vacuum or attempted lock or deletion for just over 4G MultiXact
-         * creations, and then in the probably-narrow window where its xmax
-         * is again a live MultiXactId, someone tries to lock or delete it.
-         * Even then, another share-lock attempt would work fine. An
-         * exclusive-lock or delete attempt would face unexpected delay, or
-         * in the very worst case get a deadlock error. This seems an
-         * extremely low-probability scenario with minimal downside even if
-         * it does happen, so for now we don't do the extra bookkeeping that
-         * would be needed to clean out MultiXactIds.
-         * ----------
-         */
+#ifndef ENABLE_MULTIPLE_NODES
+        if (MultiXactIdIsValid(xid) && MultiXactIdPrecedes(xid, cutoff_multi) &&
+            (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask, tuple->t_data->t_infomask2) ||
+            TransactionIdPrecedes(HeapTupleMultiXactGetUpdateXid(tuple), cutoff_xid))) {
+            if (!RecoveryInProgress() && !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+                !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask, tuple->t_data->t_infomask2) &&
+                TransactionIdDidCommit(HeapTupleMultiXactGetUpdateXid(tuple))) {
+                ereport(ERROR,
+                    (errcode(ERRCODE_DATA_CORRUPTED), errmsg_internal("cannot freeze committed xmax %lu", xid)));
+            }
+            HeapTupleSetXmax(tuple, InvalidTransactionId);
+
+            /*
+             * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED
+             * + LOCKED. Normalize to INVALID just to be sure no one gets
+             * confused. Also get rid of the HEAP_KEYS_UPDATED bit.
+             */
+            tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
+            tuple->t_data->t_infomask |= HEAP_XMAX_INVALID;
+            tuple->t_data->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED);
+            HeapTupleHeaderClearHotUpdated(tuple->t_data);
+
+            changed = true;
+            if (changedMultiXid != NULL) {
+                *changedMultiXid = true;
+            }
+        }
+#endif
     }
 
     return changed;
@@ -6287,9 +7451,10 @@
  * It doesn't matter whether the tuple is alive or dead, we are checking
  * to see if a tuple needs to be removed or frozen.
*/ -bool heap_tuple_needs_freeze(HeapTuple htup, TransactionId cutoff_xid, Buffer buf) +bool heap_tuple_needs_freeze(HeapTuple htup, TransactionId cutoff_xid, MultiXactId cutoff_multi, Buffer buf) { TransactionId xid; + MultiXactId multi; HeapTupleHeader tuple = htup->t_data; xid = HeapTupleGetRawXmin(htup); @@ -6297,16 +7462,207 @@ bool heap_tuple_needs_freeze(HeapTuple htup, TransactionId cutoff_xid, Buffer bu return true; } - if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)) { - xid = HeapTupleGetRawXmax(htup); - if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, cutoff_xid)) { - return true; + if (!(tuple->t_infomask & HEAP_XMAX_INVALID)) { + if (!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)) { + xid = HeapTupleGetRawXmax(htup); + if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, cutoff_xid)) { + return true; + } + } else { + multi = HeapTupleGetRawXmax(htup); + if (MultiXactIdIsValid(multi) && MultiXactIdPrecedes(multi, cutoff_multi)) { + return true; + } } } return false; } +/* + * For a given MultiXactId, return the hint bits that should be set in the + * tuple's infomask. + * + * Normally this should be called for a multixact that was just created, and + * so is on our local cache, so the GetMembers call is fast. + */ +static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2) +{ + int nmembers; + MultiXactMember *members = NULL; + uint16 bits = HEAP_XMAX_IS_MULTI; + uint16 bits2 = 0; + bool hasUpdate = false; + LockTupleMode strongest = LockTupleKeyShare; + + nmembers = GetMultiXactIdMembers(multi, &members); + + for (int i = 0; i < nmembers; i++) { + LockTupleMode mode; + + /* + * Remember the strongest lock mode held by any member of the + * multixact. + */ + mode = TUPLOCK_FROM_MXSTATUS(members[i].status); + if (mode > strongest) + strongest = mode; + + switch (members[i].status) { + case MultiXactStatusForKeyShare: + case MultiXactStatusForShare: + case MultiXactStatusForNoKeyUpdate: + break; + case MultiXactStatusForUpdate: + bits2 |= HEAP_KEYS_UPDATED; + break; + case MultiXactStatusNoKeyUpdate: + hasUpdate = true; + break; + case MultiXactStatusUpdate: + bits2 |= HEAP_KEYS_UPDATED; + hasUpdate = true; + break; + } + } + + if (strongest == LockTupleExclusive || strongest == LockTupleNoKeyExclusive) + bits |= HEAP_XMAX_EXCL_LOCK; + else if (strongest == LockTupleShared) + bits |= HEAP_XMAX_SHARED_LOCK; + else if (strongest == LockTupleKeyShare) + bits |= HEAP_XMAX_KEYSHR_LOCK; + + if (!hasUpdate) { + bits2 |= HEAP_XMAX_LOCK_ONLY; + } + + if (nmembers > 0) { + pfree(members); + } + + *new_infomask = bits; + *new_infomask2 = bits2; +} + +/* + * MultiXactIdGetUpdateXid + * + * Given a multixact Xmax and corresponding infomask, which does not have the + * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating + * transaction. + * + * Caller is expected to check the status of the updating transaction, if + * necessary. 
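+ *
+ * (Hedged usage sketch; the status checks are the caller's job:
+ *
+ *     TransactionId ux = HeapTupleMultiXactGetUpdateXid(tuple);
+ *     if (TransactionIdIsValid(ux) && TransactionIdDidCommit(ux))
+ *         ;   // the update is committed; treat this version as updated
+ * )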
+ */
+static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask, uint16 t_infomask2)
+{
+    TransactionId updateXact = InvalidTransactionId;
+    MultiXactMember *members = NULL;
+    int nmembers;
+
+    Assert(!(t_infomask2 & HEAP_XMAX_LOCK_ONLY));
+    Assert(t_infomask & HEAP_XMAX_IS_MULTI);
+
+    nmembers = GetMultiXactIdMembers(xmax, &members);
+
+    if (nmembers > 0) {
+        for (int i = 0; i < nmembers; i++) {
+            /* Ignore lockers */
+            if (!ISUPDATE_from_mxstatus(members[i].status)) {
+                continue;
+            }
+
+            /* there can be at most one updater */
+            Assert(updateXact == InvalidTransactionId);
+            Assert(members[i].status == MultiXactStatusNoKeyUpdate || members[i].status == MultiXactStatusUpdate);
+            updateXact = members[i].xid;
+#ifndef USE_ASSERT_CHECKING
+            /*
+             * in an assert-enabled build, walk the whole array to ensure
+             * there's no other updater.
+             */
+            break;
+#endif
+        }
+
+        pfree(members);
+    }
+
+    return updateXact;
+}
+
+/*
+ * HeapTupleMultiXactGetUpdateXid
+ *
+ * See also HeapTupleGetUpdateXid, which can be used without previously
+ * checking the hint bits.
+ */
+TransactionId HeapTupleMultiXactGetUpdateXid(HeapTuple tuple)
+{
+    return MultiXactIdGetUpdateXid(HeapTupleGetRawXmax(tuple),
+        tuple->t_data->t_infomask, tuple->t_data->t_infomask2);
+}
+
+TransactionId HeapTupleHeaderMultiXactGetUpdateXid(Page page, HeapTupleHeader tuple)
+{
+    return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(page, tuple),
+        tuple->t_infomask, tuple->t_infomask2);
+}
+
+/*
+ * Does the given multixact conflict with the current transaction grabbing a
+ * tuple lock of the given strength?
+ */
+static bool DoesMultiXactIdConflict(MultiXactId multi, LockTupleMode lockmode)
+{
+    int nmembers;
+    MultiXactMember *members;
+    bool result = false;
+    LOCKMODE wanted = TupleLockExtraInfo[lockmode].hwlock;
+
+    nmembers = GetMultiXactIdMembers(multi, &members);
+    if (nmembers >= 0) {
+        int i;
+
+        for (i = 0; i < nmembers; i++) {
+            TransactionId memxid;
+            LOCKMODE memlockmode;
+
+            memlockmode = LOCKMODE_FROM_MXSTATUS(members[i].status);
+            /* ignore members that don't conflict with the lock we want */
+            if (!DoLockModesConflict(memlockmode, wanted))
+                continue;
+
+            /* ignore members from current xact */
+            memxid = members[i].xid;
+            if (TransactionIdIsCurrentTransactionId(memxid))
+                continue;
+
+            if (ISUPDATE_from_mxstatus(members[i].status)) {
+                /* ignore aborted updaters */
+                if (TransactionIdDidAbort(memxid))
+                    continue;
+            } else {
+                /* ignore lockers-only that are no longer in progress */
+                if (!TransactionIdIsInProgress(memxid))
+                    continue;
+            }
+
+            /*
+             * Whatever remains are either live lockers that conflict with our
+             * wanted lock, or updaters that are not aborted. Those conflict
+             * with what we want, so return true.
+             */
+            result = true;
+            break;
+        }
+        pfree(members);
+    }
+
+    return result;
+}
+
 /* ----------------
  * heap_restrpos - restore position to marked location
  * ----------------
@@ -6359,7 +7715,7 @@ void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTuple tuple, TransactionId* late
 {
     HeapTupleHeader htup = tuple->t_data;
     TransactionId xmin = HeapTupleGetRawXmin(tuple);
-    TransactionId xmax = HeapTupleGetRawXmax(tuple);
+    TransactionId xmax = HeapTupleGetUpdateXid(tuple);
 
     /*
      * Ignore tuples inserted by an aborted transaction or if the tuple was
@@ -6489,10 +7845,16 @@ XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber* redirected
  * Perform XLogInsert for a heap-freeze operation. Caller must already
  * have modified the buffer and marked it dirty.
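 *
 * (Compatibility note, restating the logic below: while any node may still
 * be running a version older than ENHANCED_TUPLE_LOCK_VERSION_NUM, or when
 * cutoff_multi is invalid, the record is emitted in the old, shorter
 * format that lacks the cutoff_multi field.)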
*/ -XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, OffsetNumber* offsets, int offcnt) +XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid, MultiXactId cutoff_multi, + OffsetNumber* offsets, int offcnt) { xl_heap_freeze xlrec; XLogRecPtr recptr; + bool useOldXlog = t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM || + !MultiXactIdIsValid(cutoff_multi); +#ifdef ENABLE_MULTIPLE_NODES + useOldXlog = true; +#endif /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); @@ -6500,9 +7862,10 @@ XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xi Assert(offcnt > 0); xlrec.cutoff_xid = cutoff_xid; + xlrec.cutoff_multi = cutoff_multi; XLogBeginInsert(); - XLogRegisterData((char*)&xlrec, SizeOfHeapFreeze); + XLogRegisterData((char*)&xlrec, useOldXlog ? SizeOfOldHeapFreeze : SizeOfHeapFreeze); /* * The tuple-offsets array is not actually in the buffer, but pretend that @@ -6512,7 +7875,7 @@ XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xi XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); XLogRegisterBufData(0, (char*)offsets, offcnt * sizeof(OffsetNumber)); - recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE); + recptr = XLogInsert(RM_HEAP2_ID, useOldXlog ? XLOG_HEAP2_FREEZE : XLOG_HEAP2_FREEZE | XLOG_TUPLE_LOCK_UPGRADE_FLAG); return recptr; } @@ -6613,8 +7976,8 @@ XLogRecPtr log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer heap_bu * Perform XLogInsert for a heap-update operation. Caller must already * have modified the buffer(s) and marked them dirty. */ -static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, const ItemPointer from, Buffer newbuf, - HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared) +static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, HeapTuple oldtup, Buffer newbuf, HeapTuple newtup, + HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared) { xl_heap_update xlrec; xl_heap_header xlhdr; @@ -6626,6 +7989,7 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, const ItemPointe int bufflags; TdeInfo tdeinfo = {0}; OffsetNumber maxoff; + bool useOldXlog; /* Caller should not call me on a non-WAL-logged relation */ Assert(RelationNeedsWAL(reln)); @@ -6652,9 +8016,18 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, const ItemPointe bufflags = REGBUF_STANDARD; /* Prepare WAL data */ - xlrec.old_offnum = ItemPointerGetOffsetNumber(from); + xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self); xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); xlrec.flags = 0; + xlrec.old_xmax = HeapTupleGetRawXmax(oldtup); + xlrec.new_xmax = HeapTupleGetRawXmax(newtup); + xlrec.old_infobits_set = ComputeInfobits(oldtup->t_data->t_infomask, oldtup->t_data->t_infomask2); + useOldXlog = t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM || + !(xlrec.old_infobits_set & XLHL_XMAX_IS_MULTI); +#ifdef ENABLE_MULTIPLE_NODES + useOldXlog = true; +#endif + if (all_visible_cleared) { xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED; } @@ -6706,7 +8079,7 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, const ItemPointe XLogRegisterData((char*)&((HeapPageHeader)(page))->pd_xid_base, sizeof(TransactionId)); } - XLogRegisterData((char*)&xlrec, SizeOfHeapUpdate); + XLogRegisterData((char *)&xlrec, useOldXlog ? 
SizeOfOldHeapUpdate : SizeOfHeapUpdate); /* We need to log a tuple identity */ if (need_tuple_data && old_key_tuple) { @@ -6725,6 +8098,8 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, const ItemPointe /* filtering by origin on a row level is much more efficient */ XLogIncludeOrigin(); + info |= useOldXlog ? 0 : XLOG_TUPLE_LOCK_UPGRADE_FLAG; + recptr = XLogInsert(RM_HEAP_ID, info); ereport(DEBUG4, (errmodule(MOD_REDO), @@ -6732,8 +8107,8 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, const ItemPointe errmsg("[REDO_LOG_TRACE]log_heap_update: fromBlkNum:%u,fromOffsetNum:%hu," "newBlkNum:%u,newOffsetNum:%hu," "t_infomask2:%hu,t_infomask:%hu,t_hoff:%hhu,flags:%hhu,bufflags:%d,newLen:%u", - ItemPointerGetBlockNumber(from), - ItemPointerGetOffsetNumber(from), + ItemPointerGetBlockNumber(&oldtup->t_self), + ItemPointerGetOffsetNumber(&oldtup->t_self), ItemPointerGetBlockNumber(&newtup->t_self), ItemPointerGetOffsetNumber(&newtup->t_self), xlhdr.t_infomask2, @@ -7143,9 +8518,10 @@ static void heap_xlog_freeze(XLogReaderState* record) char* maindata = XLogRecGetData(record); Size blkdatalen; char* blkdata = NULL; + bool isTupleLockUpgrade = (XLogRecGetInfo(record) & XLOG_TUPLE_LOCK_UPGRADE_FLAG) != 0; blkdata = XLogRecGetBlockData(record, HEAP_FREEZE_ORIG_BLOCK_NUM, &blkdatalen); - HeapXlogFreezeOperatorPage(&buffer, (void*)maindata, (void*)blkdata, blkdatalen); + HeapXlogFreezeOperatorPage(&buffer, (void*)maindata, (void*)blkdata, blkdatalen, isTupleLockUpgrade); MarkBufferDirty(buffer.buf); } @@ -7372,8 +8748,9 @@ static void heap_xlog_delete(XLogReaderState* record) if (XLogReadBufferForRedo(record, HEAP_DELETE_ORIG_BLOCK_NUM, &buffer) == BLK_NEEDS_REDO) { char* maindata = XLogRecGetData(record); TransactionId recordxid = XLogRecGetXid(record); + bool isTupleLockUpgrade = (XLogRecGetInfo(record) & XLOG_TUPLE_LOCK_UPGRADE_FLAG) != 0; - HeapXlogDeleteOperatorPage(&buffer, (void *)maindata, recordxid); + HeapXlogDeleteOperatorPage(&buffer, (void *)maindata, recordxid, isTupleLockUpgrade); MarkBufferDirty(buffer.buf); } if (BufferIsValid(buffer.buf)) { @@ -7577,8 +8954,10 @@ static void heap_xlog_update(XLogReaderState* record, bool hot_update) if (oldaction == BLK_NEEDS_REDO) { char* maindata = XLogRecGetData(record); TransactionId recordxid = XLogRecGetXid(record); + bool isTupleLockUpgrade = (XLogRecGetInfo(record) & XLOG_TUPLE_LOCK_UPGRADE_FLAG) != 0; - HeapXlogUpdateOperatorOldpage(&obuffer, (void *)maindata, hot_update, isinit, newblk, recordxid); + HeapXlogUpdateOperatorOldpage(&obuffer, (void *)maindata, hot_update, isinit, newblk, recordxid, + isTupleLockUpgrade); MarkBufferDirty(obuffer.buf); } @@ -7610,10 +8989,11 @@ static void heap_xlog_update(XLogReaderState* record, bool hot_update) Size blkdatalen; char* blkdata = NULL; TransactionId recordxid = XLogRecGetXid(record); + bool isTupleLockUpgrade = (XLogRecGetInfo(record) & XLOG_TUPLE_LOCK_UPGRADE_FLAG) != 0; blkdata = XLogRecGetBlockData(record, HEAP_UPDATE_NEW_BLOCK_NUM, &blkdatalen); HeapXlogUpdateOperatorNewpage(&nbuffer, (void *)maindata, isinit, (void *)blkdata, blkdatalen, recordxid, - &freespace, tde); + &freespace, isTupleLockUpgrade, tde); MarkBufferDirty(nbuffer.buf); @@ -7664,8 +9044,9 @@ static void heap_xlog_lock(XLogReaderState* record) if (XLogReadBufferForRedo(record, HEAP_LOCK_ORIG_BLOCK_NUM, &buffer) == BLK_NEEDS_REDO) { char* maindata = XLogRecGetData(record); + bool isTupleLockUpgrade = (XLogRecGetInfo(record) & XLOG_TUPLE_LOCK_UPGRADE_FLAG) != 0; - 
HeapXlogLockOperatorPage(&buffer, (void*)maindata); + HeapXlogLockOperatorPage(&buffer, (void*)maindata, isTupleLockUpgrade); MarkBufferDirty(buffer.buf); } if (BufferIsValid(buffer.buf)) { diff --git a/src/gausskernel/storage/access/heap/heapam_visibility.cpp b/src/gausskernel/storage/access/heap/heapam_visibility.cpp index aeb7d23c8..eb700d618 100644 --- a/src/gausskernel/storage/access/heap/heapam_visibility.cpp +++ b/src/gausskernel/storage/access/heap/heapam_visibility.cpp @@ -211,10 +211,20 @@ static bool DealCurrentTansactionNotCommited(HeapTupleHeader tuple, Page page, B if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { + TransactionId xmax = HeapTupleHeaderMultiXactGetUpdateXid(page, tuple); + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) { + return true; + } else { + return false; + } + } if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(page, tuple))) { /* deleting subtransaction must have aborted */ @@ -297,19 +307,31 @@ bool HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) return true; if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) return true; return false; /* updated by other */ } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) + return true; + + TransactionId xmax = HeapTupleHeaderMultiXactGetUpdateXid(page, tuple); + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) { + return true; + } + if (TransactionIdDidCommit(xmax)) + return false; + /* it must have aborted or crashed */ return true; } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(page, tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) return true; return false; } @@ -329,7 +351,7 @@ bool HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); return true; } @@ -408,10 +430,22 @@ restart: if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { + TransactionId xmax = HeapTupleHeaderMultiXactGetUpdateXid(page, tuple); + if (!TransactionIdIsValid(xmax)) { + return true; + } + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) { + return true; + } else { + return false; + } + } if 
(!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(page, tuple))) { /* deleting subtransaction must have aborted */ @@ -451,19 +485,37 @@ restart: return true; if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) return true; return false; } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) + return true; + + TransactionId xmax = HeapTupleHeaderMultiXactGetUpdateXid(page, tuple); + if (!TransactionIdIsValid(xmax)) { + return true; + } + if (TransactionIdIsCurrentTransactionId(xmax)) { + if (HeapTupleHeaderGetCmax(tuple, page) >= GetCurrentCommandId(false)) { + return true; /* deleted after scan started */ + } else { + return false; /* deleted before scan started */ + } + } + if (TransactionIdIsInProgress(xmax)) { + return true; + } + if (TransactionIdDidCommit(xmax)) { + return false; + } return true; } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(page, tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) return true; if (HeapTupleHeaderGetCmax(tuple, page) >= GetCurrentCommandId(false)) return true; /* deleted after scan started */ @@ -494,7 +546,7 @@ restart: /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); return true; } @@ -612,10 +664,51 @@ restart: if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return TM_Ok; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ - return TM_Ok; + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { + TransactionId xmax = HeapTupleHeaderGetRawXmax(page, tuple); - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + /* + * Careful here: even though this tuple was created by our own + * transaction, it might be locked by other transactions, if + * the original version was key-share locked when we updated + * it. + */ + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { + if (MultiXactIdIsRunning(xmax)) + return TM_BeingModified; + else + return TM_Ok; + } + + /* + * If the locker is gone, then there is nothing of interest + * left in this Xmax; otherwise, report the tuple as + * locked/updated. 
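+ *
+ * (Illustrative only: a plain-xid locker that took FOR SHARE and has
+ * since committed leaves its lock bits on disk, but the row is no longer
+ * "being modified" for our purposes, so TM_Ok is returned rather than
+ * TM_BeingModified.)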
+ */ + if (!TransactionIdIsInProgress(xmax)) + return TM_Ok; + + return TM_BeingModified; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { + TransactionId xmax= HeapTupleHeaderMultiXactGetUpdateXid(page, tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* deleting subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) { + if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(page, tuple))) + return TM_BeingModified; + return TM_Ok; + } else { + if (HeapTupleHeaderGetCmax(tuple, page) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + } if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(page, tuple))) { /* deleting subtransaction must have aborted */ @@ -654,7 +747,7 @@ restart: return TM_Ok; } if (tuple->t_infomask & HEAP_XMAX_COMMITTED) { - if (tuple->t_infomask & HEAP_IS_LOCKED) { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { return TM_Ok; } if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) { @@ -665,19 +758,51 @@ restart: } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { + /* + * If it's only locked but HEAP_LOCK_MASK is not set, + * it cannot possibly be running. Otherwise need to check. + */ + if ((tuple->t_infomask & HEAP_LOCK_MASK) && + MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(page, tuple))) + return TM_BeingModified; - if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(page, tuple))) { - return TM_BeingModified; + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + return TM_Ok; } - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + + TransactionId xmax = HeapTupleHeaderMultiXactGetUpdateXid(page, tuple); + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) { + if (HeapTupleHeaderGetCmax(tuple, page) >= curcid) + return TM_SelfModified; /* updated after scan started */ + else + return TM_Invisible; /* updated before scan started */ + } + + if (TransactionIdIsInProgress(xmax)) + return TM_BeingModified; + + if (TransactionIdDidCommit(xmax)) { + if (!ItemPointerEquals(&htup->t_self, &tuple->t_ctid)) { + return TM_Updated; + } else { + return TM_Deleted; + } + } + + /* no member, even just a locker, alive anymore */ + if (!MultiXactIdIsRunning(HeapTupleHeaderGetXmax(page, tuple))) + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + /* it must have aborted or crashed */ return TM_Ok; } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(page, tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) { - return TM_Ok; + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { + return TM_BeingModified; } if (HeapTupleHeaderGetCmax(tuple, page) >= curcid) { return TM_SelfModified; /* updated after scan started */ @@ -702,7 +827,7 @@ restart: /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); return TM_Ok; } @@ -787,19 +912,33 @@ static bool HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, Buffer bu return true; if (tuple->t_infomask & HEAP_XMAX_COMMITTED) 
{ - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) return true; return false; /* updated by other */ } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { + return true; + } + TransactionId xmax = HeapTupleHeaderMultiXactGetUpdateXid(page, tuple); + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + if (TransactionIdIsCurrentTransactionId(xmax)) + return false; + if (TransactionIdIsInProgress(xmax)) { + snapshot->xmax = xmax; + return true; + } + if (TransactionIdDidCommit(xmax)) { + return false; + } + /* it must have aborted or crashed */ return true; } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(page, tuple))) { - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) return true; return false; } @@ -821,7 +960,7 @@ static bool HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, Buffer bu /* xmax transaction committed */ - if (tuple->t_infomask & HEAP_IS_LOCKED) { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); return true; } @@ -896,10 +1035,22 @@ static bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, Buffer buf if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) /* not deleter */ + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) /* not deleter */ return true; - Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { + TransactionId xmax = HeapTupleHeaderMultiXactGetUpdateXid(page, tuple); + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + return true; + else if (HeapTupleHeaderGetCmax(tuple, page) >= snapshot->curcid) + return true; /* updated after scan started */ + else + return false; /* updated before scan started */ + } if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(page, tuple))) { /* deleting subtransaction must have aborted */ @@ -952,12 +1103,28 @@ recheck_xmax: if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ return true; - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) return true; if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax = HeapTupleHeaderMultiXactGetUpdateXid(page, tuple); + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + if (TransactionIdIsCurrentTransactionId(xmax)) { + if (HeapTupleHeaderGetCmax(tuple, page) >= snapshot->curcid) + return true; /* deleted after scan started */ + else + return false; /* deleted before scan started */ + } + if (TransactionIdIsInProgress(xmax)) + return true; + if (TransactionIdDidCommit(xmax)) { + /* updating transaction committed, but when? 
*/ + if (!CommittedXidVisibleInSnapshot(xmax, snapshot, buffer)) + return true; /* treat as still in progress */ + return false; + } + /* it must have aborted or crashed */ return true; } @@ -1104,7 +1271,7 @@ HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, B if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmin(htup))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return HEAPTUPLE_INSERT_IN_PROGRESS; - if (tuple->t_infomask & HEAP_IS_LOCKED) + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) return HEAPTUPLE_INSERT_IN_PROGRESS; /* inserted and then deleted by same xact */ if (TransactionIdIsCurrentTransactionId(HeapTupleGetRawXmax(htup))) { @@ -1158,7 +1325,7 @@ HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, B if (tuple->t_infomask & HEAP_XMAX_INVALID) return HEAPTUPLE_LIVE; - if (tuple->t_infomask & HEAP_IS_LOCKED) { + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)) { /* * "Deleting" xact really only locked it, so the tuple is live in any * case. However, we should make sure that either XMAX_COMMITTED or @@ -1174,7 +1341,7 @@ HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, B xidstatus = TransactionIdGetStatus(HeapTupleGetRawXmax(htup)); if (xidstatus == XID_INPROGRESS && TransactionIdIsInProgress(HeapTupleGetRawXmax(htup))) { return HEAPTUPLE_LIVE; - } + } } /* @@ -1192,8 +1359,36 @@ HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, B } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) { - /* MultiXacts are currently only allowed to lock tuples */ - Assert(tuple->t_infomask & HEAP_IS_LOCKED); + TransactionId xmax = HeapTupleHeaderGetUpdateXid(page, tuple); + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask, tuple->t_infomask2)); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsInProgress(xmax)) { + return HEAPTUPLE_DELETE_IN_PROGRESS; + } else if (TransactionIdDidCommit(xmax)) { + /* + * The multixact might still be running due to lockers. If the + * updater is below the xid horizon, we have to return DEAD + * regardless -- otherwise we could end up with a tuple where the + * updater has to be removed due to the horizon, but is not pruned + * away. It's not a problem to prune that tuple, because any + * remaining lockers will also be present in newer tuple versions. + */ + if (!TransactionIdPrecedes(xmax, OldestXmin)) + return HEAPTUPLE_RECENTLY_DEAD; + return HEAPTUPLE_DEAD; + } else if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(page, tuple))) { + /* + * Not in Progress, Not Committed, so either Aborted or crashed. + * Mark the Xmax as invalid. + */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); + } + /* it must have aborted or crashed */ return HEAPTUPLE_LIVE; } @@ -1263,10 +1458,22 @@ bool HeapTupleIsSurelyDead(HeapTuple tuple, TransactionId OldestXmin) /* * If the inserting transaction committed, but any deleting transaction - * aborted, the tuple is still alive. Likewise, if XMAX is a lock rather - * than a delete, the tuple is still alive. + * aborted, the tuple is still alive. */ - if (tup->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED | HEAP_XMAX_IS_MULTI)) + if (tup->t_infomask & HEAP_XMAX_INVALID) + return false; + + /* + * If the XMAX is just a lock, the tuple is still alive. 
+     */
+    if (HEAP_XMAX_IS_LOCKED_ONLY(tup->t_infomask, tup->t_infomask2))
+        return false;
+
+    /*
+     * If the Xmax is a MultiXact, it might be dead or alive, but we cannot
+     * know without checking pg_multixact.
+     */
+    if (tup->t_infomask & HEAP_XMAX_IS_MULTI)
         return false;
 
     /* If deleter isn't known to have committed, assume it's still running. */
@@ -1277,6 +1484,64 @@ bool HeapTupleIsSurelyDead(HeapTuple tuple, TransactionId OldestXmin)
     return TransactionIdPrecedes(HeapTupleGetRawXmax(tuple), OldestXmin);
 }
 
+/*
+ * Is the tuple really only locked?  That is, is it not updated?
+ *
+ * It's easy to check just infomask bits if the locker is not a multi; but
+ * otherwise we need to verify that the updating transaction has not aborted.
+ *
+ * This function is here because it follows the same time qualification rules
+ * laid out at the top of this file.
+ */
+bool HeapTupleIsOnlyLocked(HeapTuple tuple)
+{
+    TransactionId xmax;
+
+    /* if there's no valid Xmax, then there's obviously no update either */
+    if (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) {
+        return true;
+    }
+
+    if (tuple->t_data->t_infomask2 & HEAP_XMAX_LOCK_ONLY) {
+        return true;
+    }
+
+    /* invalid xmax means no update */
+    if (!TransactionIdIsValid(HeapTupleGetRawXmax(tuple))) {
+        return true;
+    }
+
+    /*
+     * if HEAP_XMAX_LOCK_ONLY is not set and not a multi, then this
+     * must necessarily have been updated
+     */
+    if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)) {
+        return false;
+    }
+
+    /* ... but if it's a multi, then perhaps the updating Xid aborted. */
+    xmax = HeapTupleMultiXactGetUpdateXid(tuple);
+    if (!TransactionIdIsValid(xmax)) { /* shouldn't happen .. */
+        return true;
+    }
+
+    if (TransactionIdIsCurrentTransactionId(xmax)) {
+        return false;
+    }
+    if (TransactionIdIsInProgress(xmax)) {
+        return false;
+    }
+    if (TransactionIdDidCommit(xmax)) {
+        return false;
+    }
+
+    /*
+     * not current, not in progress, not committed -- must have aborted or
+     * crashed
+     */
+    return true;
+}
+
 /*
- * check whether the transaciont id 'xid' in in the pre-sorted array 'xip'.
+ * check whether the transaction id 'xid' is in the pre-sorted array 'xip'.
  */
diff --git a/src/gausskernel/storage/access/heap/pruneheap.cpp b/src/gausskernel/storage/access/heap/pruneheap.cpp
index cc551de67..caff361fb 100644
--- a/src/gausskernel/storage/access/heap/pruneheap.cpp
+++ b/src/gausskernel/storage/access/heap/pruneheap.cpp
@@ -497,7 +497,7 @@ static int heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rooto
              * This tuple may soon become DEAD.  Update the hint field so
              * that the page is reconsidered for pruning in future.
              */
-            heap_prune_record_prunable(prstate, HeapTupleGetRawXmax(&tup));
+            heap_prune_record_prunable(prstate, HeapTupleGetUpdateXid(&tup));
             break;
 
         case HEAPTUPLE_DELETE_IN_PROGRESS:
@@ -506,7 +506,7 @@ static int heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rooto
             /*
              * This tuple may soon become DEAD.  Update the hint field so
              * that the page is reconsidered for pruning in future.
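The pruneheap.cpp hunks switch HOT-chain bookkeeping from the raw xmax to the update xid: once tuple locks can live in a multixact xmax, the raw value may be a MultiXactId rather than the xid that stamped the next tuple version, and chain following compares against the successor's xmin. A minimal sketch of the intended semantics, assuming behavior analogous to PostgreSQL's MultiXactIdGetUpdateXid; the helper below and its body are illustrative, not the patch's actual implementation:

    /* Sketch only: what HeapTupleGetUpdateXid() is expected to yield. */
    static TransactionId sketch_get_update_xid(HeapTuple tup)
    {
        HeapTupleHeader hdr = tup->t_data;
        if (hdr->t_infomask & HEAP_XMAX_INVALID)
            return InvalidTransactionId;         /* nothing deleted or updated it */
        if (!(hdr->t_infomask & HEAP_XMAX_IS_MULTI))
            return HeapTupleGetRawXmax(tup);     /* plain xid: raw xmax == updater */
        /* multi: dig the (single) updating member out of pg_multixact */
        return HeapTupleMultiXactGetUpdateXid(tup);
    }

so prior_xmax keeps matching the xmin stamped on the next version of the row even when lockers are piggybacked on the xmax.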
*/ - heap_prune_record_prunable(prstate, HeapTupleGetRawXmax(&tup)); + heap_prune_record_prunable(prstate, HeapTupleGetUpdateXid(&tup)); break; case HEAPTUPLE_LIVE: @@ -554,7 +554,7 @@ static int heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rooto */ Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - prior_xmax = HeapTupleGetRawXmax(&tup); + prior_xmax = HeapTupleGetUpdateXid(&tup); } /* @@ -714,7 +714,7 @@ void heap_get_root_tuples(Page page, OffsetNumber *root_offsets) /* Set up to scan the HOT-chain */ nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - prior_xmax = HeapTupleGetRawXmax(&tup); + prior_xmax = HeapTupleGetUpdateXid(&tup); } else { /* Must be a redirect item. We do not set its root_offsets entry */ Assert(ItemIdIsRedirected(lp)); @@ -751,7 +751,7 @@ void heap_get_root_tuples(Page page, OffsetNumber *root_offsets) break; nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); - prior_xmax = HeapTupleGetRawXmax(&tup); + prior_xmax = HeapTupleGetUpdateXid(&tup); } } } diff --git a/src/gausskernel/storage/access/heap/rewriteheap.cpp b/src/gausskernel/storage/access/heap/rewriteheap.cpp index eecba5c29..7ee272e71 100644 --- a/src/gausskernel/storage/access/heap/rewriteheap.cpp +++ b/src/gausskernel/storage/access/heap/rewriteheap.cpp @@ -160,6 +160,8 @@ typedef struct RewriteStateData { bool rs_use_wal; /* must we WAL-log inserts? */ TransactionId rs_oldest_xmin; /* oldest xmin used by caller to determine tuple visibility */ TransactionId rs_freeze_xid; /* Xid that will be used as freeze cutoff point */ + MultiXactId rs_freeze_multi; /* MultiXactId that will be used as freeze + * cutoff point for multixacts */ MemoryContext rs_cxt; /* for hash tables and entries and tuples in them */ HTAB *rs_unresolved_tups; /* unmatched A tuples */ @@ -212,7 +214,7 @@ typedef OldToNewMappingData *OldToNewMapping; static void raw_heap_insert(RewriteState state, HeapTuple tup); static void RawUHeapInsert(RewriteState state, UHeapTuple tup); static void RawHeapCmprAndMultiInsert(RewriteState state, bool is_last); -static void copyHeapTupleInfo(HeapTuple dest_tup, HeapTuple src_tup, TransactionId freeze_xid); +static void copyHeapTupleInfo(HeapTuple dest_tup, HeapTuple src_tup, TransactionId freeze_xid, MultiXactId freeze_mxid); static void rewrite_page_list_write(RewriteState state); static void rewrite_flush_page(RewriteState state, Page page); static void rewrite_end_flush_page(RewriteState state); @@ -457,11 +459,11 @@ static bool CanWriteUpdatedTuple(RewriteState state, HeapTuple old_tuple, HeapTu /* * If the tuple has been updated, check the old-to-new mapping hash table. 
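With rs_freeze_multi added to RewriteStateData, heap_freeze_tuple() now takes a MultiXactId cutoff alongside the xid cutoff; callers that have no multixact cutoff at hand pass InvalidMultiXactId (as the redo path later in this patch does when the upgrade flag is absent). A hedged usage sketch reusing names from the hunks above, illustrative only:

    /* Cutoffs come from the rewrite state set up by the CLUSTER/VACUUM FULL
     * caller, exactly as in copyHeapTupleInfo() in this patch. */
    (void) heap_freeze_tuple(dest_tup, state->rs_freeze_xid, state->rs_freeze_multi);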
*/ - if (!(old_tuple->t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) && + if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || HeapTupleIsOnlyLocked(old_tuple)) && !(ItemPointerEquals(&(old_tuple->t_self), &(old_tuple->t_data->t_ctid)))) { OldToNewMapping mapping = NULL; - hashkey.xmin = HeapTupleGetRawXmax(old_tuple); + hashkey.xmin = HeapTupleGetUpdateXid(old_tuple); hashkey.tid = old_tuple->t_data->t_ctid; mapping = (OldToNewMapping)hash_search(state->rs_old_new_tid_map, &hashkey, HASH_FIND, NULL); @@ -520,7 +522,7 @@ void rewrite_heap_tuple(RewriteState state, HeapTuple old_tuple, HeapTuple new_t old_cxt = MemoryContextSwitchTo(state->rs_cxt); - copyHeapTupleInfo(new_tuple, old_tuple, state->rs_freeze_xid); + copyHeapTupleInfo(new_tuple, old_tuple, state->rs_freeze_xid, state->rs_freeze_multi); if (!CanWriteUpdatedTuple(state, old_tuple, new_tuple)) { /* @@ -660,7 +662,7 @@ MemoryContext get_heap_rewrite_memcxt(RewriteState state) return state->rs_cxt; } -static void copyHeapTupleInfo(HeapTuple dest_tup, HeapTuple src_tup, TransactionId freeze_xid) +static void copyHeapTupleInfo(HeapTuple dest_tup, HeapTuple src_tup, TransactionId freeze_xid, MultiXactId freeze_mxid) { /* * Copy the original tuple's visibility information into new_tuple. @@ -683,7 +685,7 @@ static void copyHeapTupleInfo(HeapTuple dest_tup, HeapTuple src_tup, Transaction * While we have our hands on the tuple, we may as well freeze any * very-old xmin or xmax, so that future VACUUM effort can be saved. */ - (void)heap_freeze_tuple(dest_tup, freeze_xid); + (void)heap_freeze_tuple(dest_tup, freeze_xid, freeze_mxid); /* * Invalid ctid means that ctid should point to the tuple itself. We'll @@ -857,7 +859,7 @@ void RewriteAndCompressTup(RewriteState state, HeapTuple old_tuple, HeapTuple ne errno_t rc = EOK; Assert(CurrentMemoryContext == state->rs_cxt); - copyHeapTupleInfo(new_tuple, old_tuple, state->rs_freeze_xid); + copyHeapTupleInfo(new_tuple, old_tuple, state->rs_freeze_xid, state->rs_freeze_multi); /* * Step 1: deal with updated tuples chain. diff --git a/src/gausskernel/storage/access/redo/redo_heapam.cpp b/src/gausskernel/storage/access/redo/redo_heapam.cpp index 4d24e2fd6..d68406631 100755 --- a/src/gausskernel/storage/access/redo/redo_heapam.cpp +++ b/src/gausskernel/storage/access/redo/redo_heapam.cpp @@ -120,7 +120,8 @@ void HeapXlogCleanOperatorPage(RedoBufferInfo *buffer, void *recorddata, void *b PageSetLSN(page, buffer->lsn); } -void HeapXlogFreezeOperatorPage(RedoBufferInfo *buffer, void *recorddata, void *blkdata, Size datalen) +void HeapXlogFreezeOperatorPage(RedoBufferInfo *buffer, void *recorddata, void *blkdata, Size datalen, + bool isTupleLockUpgrade) { xl_heap_freeze *xlrec = (xl_heap_freeze *)recorddata; Page page = buffer->pageinfo.page; @@ -141,7 +142,7 @@ void HeapXlogFreezeOperatorPage(RedoBufferInfo *buffer, void *recorddata, void * HeapTupleCopyBaseFromPage(&tuple, page); ItemPointerSet(&(tuple.t_self), buffer->blockinfo.blkno, *offsets); - (void)heap_freeze_tuple(&tuple, cutoff_xid); + (void)heap_freeze_tuple(&tuple, cutoff_xid, isTupleLockUpgrade ? 
+            xlrec->cutoff_multi : InvalidMultiXactId);
         offsets++;
     }
@@ -237,7 +238,8 @@ inline static void HeapXlogVisibleOperatorVmbuffer(RedoBufferInfo *vmbuffer, voi
     }
 }
 
-void HeapXlogDeleteOperatorPage(RedoBufferInfo *buffer, void *recorddata, TransactionId recordxid)
+void HeapXlogDeleteOperatorPage(RedoBufferInfo *buffer, void *recorddata, TransactionId recordxid,
+    bool isTupleLockUpgrade)
 {
     xl_heap_delete *xlrec = (xl_heap_delete *)recorddata;
     Page page = buffer->pageinfo.page;
@@ -257,10 +259,24 @@ void HeapXlogDeleteOperatorPage(RedoBufferInfo *buffer, void *recorddata, Transa
 
     htup = (HeapTupleHeader)PageGetItem(page, lp);
 
-    htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED);
+    htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+    htup->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED);
     HeapTupleHeaderClearHotUpdated(htup);
+
+    if (isTupleLockUpgrade) {
+        FixInfomaskFromInfobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2);
+    } else {
+        htup->t_infomask2 |= HEAP_KEYS_UPDATED;
+    }
+
     if (!(xlrec->flags & XLH_DELETE_IS_SUPER)) {
-        HeapTupleHeaderSetXmax(page, htup, recordxid);
+        if (isTupleLockUpgrade) {
+            HeapTupleHeaderSetXmax(page, htup, xlrec->xmax);
+        } else {
+            HeapTupleHeaderSetXmax(page, htup, recordxid);
+        }
     } else {
         HeapTupleHeaderSetXmin(page, htup, FrozenTransactionId);
         HeapTupleHeaderSetXmax(page, htup, FrozenTransactionId);
@@ -473,7 +489,7 @@ void HeapXlogMultiInsertOperatorPage(RedoBufferInfo *buffer, const void *recored
 }
 
 void HeapXlogUpdateOperatorOldpage(RedoBufferInfo *buffer, void *recoreddata, bool hot_update, bool isnewinit,
-    BlockNumber newblk, TransactionId recordxid)
+    BlockNumber newblk, TransactionId recordxid, bool isTupleLockUpgrade)
 {
     Page page = buffer->pageinfo.page;
     Pointer rec_data = (Pointer)recoreddata;
@@ -499,12 +515,19 @@ void HeapXlogUpdateOperatorOldpage(RedoBufferInfo *buffer, void *recoreddata, bo
 
     htup = (HeapTupleHeader)PageGetItem(page, lp);
 
-    htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED);
+    htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+    htup->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED);
     if (hot_update)
         HeapTupleHeaderSetHotUpdated(htup);
     else
         HeapTupleHeaderClearHotUpdated(htup);
-    HeapTupleHeaderSetXmax(page, htup, recordxid);
+    if (isTupleLockUpgrade) {
+        FixInfomaskFromInfobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2);
+        HeapTupleHeaderSetXmax(page, htup, xlrec->old_xmax);
+    } else {
+        htup->t_infomask2 |= HEAP_KEYS_UPDATED;
+        HeapTupleHeaderSetXmax(page, htup, recordxid);
+    }
     HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
     /* Set forward chain link in t_ctid */
     htup->t_ctid = newtid;
@@ -530,7 +553,7 @@ void HeapXlogUpdateOperatorOldpage(RedoBufferInfo *buffer, void *recoreddata, bo
 }
 
 void HeapXlogUpdateOperatorNewpage(RedoBufferInfo *buffer, void *recorddata, bool isinit, void *blkdata, Size datalen,
-    TransactionId recordxid, Size *freespace, bool tde)
+    TransactionId recordxid, Size *freespace, bool isTupleLockUpgrade, bool tde)
 {
     Page page = buffer->pageinfo.page;
     Pointer rec_data = (Pointer)recorddata;
@@ -618,6 +641,9 @@ void HeapXlogUpdateOperatorNewpage(RedoBufferInfo *buffer, void *recorddata, boo
     HeapTupleHeaderSetXmin(page, htup, recordxid);
     HeapTupleHeaderSetCmin(htup, FirstCommandId);
+    if (isTupleLockUpgrade) {
+        HeapTupleHeaderSetXmax(page, htup, xlrec->new_xmax);
+    }
 
     /* Make sure there is no forward chain link in t_ctid */
     htup->t_ctid = newtid;
@@ -640,7 +666,7 @@ void HeapXlogPageUpgradeOperatorPage(RedoBufferInfo *buffer)
 
     PageSetLSN(page, buffer->lsn);
 }
 
-void HeapXlogLockOperatorPage(RedoBufferInfo *buffer, void *recorddata)
+void HeapXlogLockOperatorPage(RedoBufferInfo *buffer, void *recorddata, bool isTupleLockUpgrade)
 {
     xl_heap_lock *xlrec = (xl_heap_lock *)recorddata;
     Page page = buffer->pageinfo.page;
@@ -657,13 +683,26 @@ void HeapXlogLockOperatorPage(RedoBufferInfo *buffer, void *recorddata)
 
     htup = (HeapTupleHeader)PageGetItem(page, lp);
 
-    htup->t_infomask &= ~(HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_IS_LOCKED | HEAP_MOVED);
-    if (xlrec->xid_is_mxact)
-        htup->t_infomask |= HEAP_XMAX_IS_MULTI;
-    if (xlrec->shared_lock)
-        htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
-    else
-        htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
+    htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
+    htup->t_infomask2 &= ~(HEAP_XMAX_LOCK_ONLY | HEAP_KEYS_UPDATED);
+
+    if (isTupleLockUpgrade) {
+        FixInfomaskFromInfobits(xlrec->infobits_set, &htup->t_infomask, &htup->t_infomask2);
+        if (xlrec->lock_updated) {
+            HeapTupleHeaderSetXmax(page, htup, xlrec->locking_xid);
+            PageSetLSN(page, buffer->lsn);
+            return;
+        }
+    } else {
+        if (xlrec->xid_is_mxact)
+            htup->t_infomask |= HEAP_XMAX_IS_MULTI;
+        if (xlrec->shared_lock)
+            htup->t_infomask |= HEAP_XMAX_SHARED_LOCK;
+        else {
+            htup->t_infomask |= HEAP_XMAX_EXCL_LOCK;
+            htup->t_infomask2 |= HEAP_KEYS_UPDATED;
+        }
+    }
     HeapTupleHeaderClearHotUpdated(htup);
     HeapTupleHeaderSetXmax(page, htup, xlrec->locking_xid);
     HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
@@ -1333,8 +1372,9 @@ static void HeapXlogDeleteBlock(XLogBlockHead *blockhead, XLogBlockDataParse *bl
     action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo);
     if (action == BLK_NEEDS_REDO) {
         char *maindata = XLogBlockDataGetMainData(datadecode, NULL);
+        bool isTupleLockUpgrade = (XLogBlockHeadGetInfo(blockhead) & XLOG_TUPLE_LOCK_UPGRADE_FLAG) != 0;
 
-        HeapXlogDeleteOperatorPage(bufferinfo, (void *)maindata, recordxid);
+        HeapXlogDeleteOperatorPage(bufferinfo, (void *)maindata, recordxid, isTupleLockUpgrade);
         MakeRedoBufferDirty(bufferinfo);
     }
 }
@@ -1346,6 +1386,7 @@ static void HeapXlogUpdateBlock(XLogBlockHead *blockhead, XLogBlockDataParse *bl
     bool tde = ((blockdatarec->blockhead.cur_block_id) & BKID_HAS_TDE_PAGE) != 0;
     TransactionId recordxid = XLogBlockHeadGetXid(blockhead);
     XLogBlockDataParse *datadecode = blockdatarec;
+    bool isTupleLockUpgrade = (XLogBlockHeadGetInfo(blockhead) & XLOG_TUPLE_LOCK_UPGRADE_FLAG) != 0;
 
     XLogRedoAction action;
 
@@ -1362,7 +1403,7 @@ static void HeapXlogUpdateBlock(XLogBlockHead *blockhead, XLogBlockDataParse *bl
         if (oldblk == bufferinfo->blockinfo.blkno) {
             HeapXlogUpdateOperatorOldpage(bufferinfo, (void *)maindata, hot_update, isinit, oldblk,
-                recordxid); /* old tuple */
+                recordxid, isTupleLockUpgrade); /* old tuple */
         }
 
         blkdata = XLogBlockDataGetBlockData(datadecode, &blkdatalen);
@@ -1370,12 +1411,13 @@ static void HeapXlogUpdateBlock(XLogBlockHead *blockhead, XLogBlockDataParse *bl
             /* new block */
             HeapXlogUpdateOperatorNewpage(bufferinfo, (void *)maindata, isinit, (void *)blkdata, blkdatalen, recordxid,
-                NULL, tde);
+                NULL, isTupleLockUpgrade, tde);
         } else {
             BlockNumber newblk = XLogBlockDataGetAuxiBlock1(datadecode);
 
             /* old block */
-            HeapXlogUpdateOperatorOldpage(bufferinfo, (void *)maindata, hot_update, isinit, newblk, recordxid);
+            HeapXlogUpdateOperatorOldpage(bufferinfo, (void *)maindata, hot_update, isinit, newblk, recordxid,
+                isTupleLockUpgrade);
         }
         MakeRedoBufferDirty(bufferinfo);
     }
 }
@@ -1414,8 +1456,9 @@ static void HeapXlogLockBlock(XLogBlockHead *blockhead, XLogBlockDataParse *bloc
     action = XLogCheckBlockDataRedoAction(datadecode, bufferinfo);
     if (action == BLK_NEEDS_REDO) {
         char *maindata = XLogBlockDataGetMainData(datadecode, NULL);
+        bool isTupleLockUpgrade = (XLogBlockHeadGetInfo(blockhead) & XLOG_TUPLE_LOCK_UPGRADE_FLAG) != 0;
 
-        HeapXlogLockOperatorPage(bufferinfo, (void *)maindata);
+        HeapXlogLockOperatorPage(bufferinfo, (void *)maindata, isTupleLockUpgrade);
         MakeRedoBufferDirty(bufferinfo);
     }
 }
@@ -1500,7 +1543,7 @@ static void HeapXlogFreezeBlock(XLogBlockHead *blockhead, XLogBlockDataParse *bl
         blkdata = XLogBlockDataGetBlockData(datadecode, &blkdatalen);
         Assert(blkdata != NULL);
 
-        HeapXlogFreezeOperatorPage(bufferinfo, (void *)maindata, (void *)blkdata, blkdatalen);
+        HeapXlogFreezeOperatorPage(bufferinfo, (void *)maindata, (void *)blkdata, blkdatalen, false);
         MakeRedoBufferDirty(bufferinfo);
     }
 }
diff --git a/src/gausskernel/storage/access/rmgrdesc/heapdesc.cpp b/src/gausskernel/storage/access/rmgrdesc/heapdesc.cpp
index 0152eafe4..018446e94 100644
--- a/src/gausskernel/storage/access/rmgrdesc/heapdesc.cpp
+++ b/src/gausskernel/storage/access/rmgrdesc/heapdesc.cpp
@@ -37,6 +37,25 @@ void heap_add_lock_info(StringInfo buf, xl_heap_lock *xlrec)
 }
 
+static void OutInfobits(StringInfo buf, uint8 infobits)
+{
+    if (infobits & XLHL_XMAX_IS_MULTI) {
+        appendStringInfo(buf, "IS_MULTI ");
+    }
+    if (infobits & XLHL_XMAX_LOCK_ONLY) {
+        appendStringInfo(buf, "LOCK_ONLY ");
+    }
+    if (infobits & XLHL_XMAX_EXCL_LOCK) {
+        appendStringInfo(buf, "EXCL_LOCK ");
+    }
+    if (infobits & XLHL_XMAX_KEYSHR_LOCK) {
+        appendStringInfo(buf, "KEYSHR_LOCK ");
+    }
+    if (infobits & XLHL_KEYS_UPDATED) {
+        appendStringInfo(buf, "KEYS_UPDATED ");
+    }
+}
+
 void heap3_new_cid(StringInfo buf, int bucket_id, xl_heap_new_cid *xlrec)
 {
     if (bucket_id == -1) {
@@ -56,6 +75,7 @@ void heap_desc(StringInfo buf, XLogReaderState *record)
 {
     char *rec = XLogRecGetData(record);
     uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+    bool isTupleLockUpgrade = (XLogRecGetInfo(record) & XLOG_TUPLE_LOCK_UPGRADE_FLAG) != 0;
 
     info &= XLOG_HEAP_OPMASK;
     if (info == XLOG_HEAP_INSERT) {
@@ -71,6 +91,10 @@ void heap_desc(StringInfo buf, XLogReaderState *record)
         appendStringInfo(buf, "delete: ");
         appendStringInfo(buf, "off %u", (uint32)xlrec->offnum);
+        if (isTupleLockUpgrade) {
+            appendStringInfoChar(buf, ' ');
+            OutInfobits(buf, xlrec->infobits_set);
+        }
     } else if (info == XLOG_HEAP_UPDATE) {
         xl_heap_update *xlrec = (xl_heap_update *)rec;
@@ -79,6 +103,10 @@ void heap_desc(StringInfo buf, XLogReaderState *record)
         else
             appendStringInfo(buf, "XLOG_HEAP_UPDATE update: ");
         appendStringInfo(buf, "off %u new off %u", (uint32)xlrec->old_offnum, (uint32)xlrec->new_offnum);
+        if (isTupleLockUpgrade) {
+            appendStringInfoChar(buf, ' ');
+            OutInfobits(buf, xlrec->old_infobits_set);
+        }
     } else if (info == XLOG_HEAP_HOT_UPDATE) {
         xl_heap_update *xlrec = (xl_heap_update *)rec;
@@ -87,6 +115,10 @@ void heap_desc(StringInfo buf, XLogReaderState *record)
         else
             appendStringInfo(buf, "XLOG_HEAP_HOT_UPDATE hot_update: ");
         appendStringInfo(buf, "off %u new off %u", (uint32)xlrec->old_offnum, (uint32)xlrec->new_offnum);
+        if (isTupleLockUpgrade) {
+            appendStringInfoChar(buf, ' ');
+            OutInfobits(buf, xlrec->old_infobits_set);
+        }
     } else if (info == XLOG_HEAP_NEWPAGE) {
         appendStringInfo(buf, "new page"); /* no further information */
@@ -94,6 +126,10 @@ void heap_desc(StringInfo buf, XLogReaderState *record)
         xl_heap_lock *xlrec = (xl_heap_lock *)rec;
 
         heap_add_lock_info(buf, xlrec);
+        if (isTupleLockUpgrade) {
+            appendStringInfoChar(buf, ' ');
+            OutInfobits(buf, xlrec->infobits_set);
+        }
     } else if (info == XLOG_HEAP_INPLACE) {
         xl_heap_inplace *xlrec = (xl_heap_inplace *)rec;
diff --git a/src/gausskernel/storage/access/rmgrdesc/mxactdesc.cpp b/src/gausskernel/storage/access/rmgrdesc/mxactdesc.cpp
index a53066e66..d7f3013b0 100644
--- a/src/gausskernel/storage/access/rmgrdesc/mxactdesc.cpp
+++ b/src/gausskernel/storage/access/rmgrdesc/mxactdesc.cpp
@@ -20,6 +20,34 @@
 #include "common/fe_memutils.h"
 #endif
 
+static void OutMember(StringInfo buf, TransactionId xidWithStatus)
+{
+    appendStringInfo(buf, "" XID_FMT " ", GET_MEMBER_XID_FROM_SLRU_XID(xidWithStatus));
+    switch (GET_MEMBER_STATUS_FROM_SLRU_XID(xidWithStatus)) {
+        case MultiXactStatusForKeyShare:
+            appendStringInfoString(buf, "(keysh) ");
+            break;
+        case MultiXactStatusForShare:
+            appendStringInfoString(buf, "(sh) ");
+            break;
+        case MultiXactStatusForNoKeyUpdate:
+            appendStringInfoString(buf, "(fornokeyupd) ");
+            break;
+        case MultiXactStatusForUpdate:
+            appendStringInfoString(buf, "(forupd) ");
+            break;
+        case MultiXactStatusNoKeyUpdate:
+            appendStringInfoString(buf, "(nokeyupd) ");
+            break;
+        case MultiXactStatusUpdate:
+            appendStringInfoString(buf, "(upd) ");
+            break;
+        default:
+            appendStringInfoString(buf, "(unk) ");
+            break;
+    }
+}
+
 void multixact_desc(StringInfo buf, XLogReaderState *record)
 {
     char *rec = XLogRecGetData(record);
@@ -43,9 +71,9 @@ void multixact_desc(StringInfo buf, XLogReaderState *record)
         xl_multixact_create *xlrec = (xl_multixact_create *)rec;
         int i = 0;
 
-        appendStringInfo(buf, "create multixact " XID_FMT " offset %lu:", xlrec->mid, xlrec->moff);
+        appendStringInfo(buf, "create multixact " XID_FMT " offset %lu: ", xlrec->mid, xlrec->moff);
         for (i = 0; i < xlrec->nxids; i++)
-            appendStringInfo(buf, " " XID_FMT "", xlrec->xids[i]);
+            OutMember(buf, xlrec->xids[i]);
     } else
         appendStringInfo(buf, "UNKNOWN");
 }
diff --git a/src/gausskernel/storage/access/table/tableam.cpp b/src/gausskernel/storage/access/table/tableam.cpp
index df92b2829..30a51f8ba 100644
--- a/src/gausskernel/storage/access/table/tableam.cpp
+++ b/src/gausskernel/storage/access/table/tableam.cpp
@@ -388,10 +388,10 @@ TM_Result tableam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid
 TM_Result tableam_tuple_update(Relation relation, Relation parentRelation, ItemPointer otid, Tuple newtup,
     CommandId cid, Snapshot crosscheck, Snapshot snapshot, bool wait, TupleTableSlot **oldslot, TM_FailureData *tmfd,
     bool *update_indexes, Bitmapset **modifiedIdxAttrs, bool allow_update_self,
-    bool allow_inplace_update)
+    bool allow_inplace_update, LockTupleMode *lockmode)
 {
     return g_tableam_routines[relation->rd_tam_type]->tuple_update(relation, parentRelation, otid, newtup, cid,
-        crosscheck, snapshot, wait, oldslot, tmfd, update_indexes, modifiedIdxAttrs, allow_update_self,
+        crosscheck, snapshot, wait, oldslot, tmfd, lockmode, update_indexes, modifiedIdxAttrs, allow_update_self,
         allow_inplace_update);
 }
 
@@ -845,10 +845,11 @@ TM_Result HeapamTupleDelete(Relation relation, ItemPointer tid,
 /* -------------------------------------------------------------------------- */
 TM_Result HeapamTupleUpdate(Relation relation, Relation parentRelation, ItemPointer otid, Tuple newtup,
     CommandId cid, Snapshot crosscheck, Snapshot snapshot, bool wait, TupleTableSlot **oldslot, TM_FailureData *tmfd,
-    bool *update_indexes, Bitmapset **modifiedIdxAttrs, bool allow_update_self, bool allow_inplace_update)
+    LockTupleMode *lockmode, bool *update_indexes, Bitmapset **modifiedIdxAttrs, bool allow_update_self,
+    bool allow_inplace_update)
 {
     TM_Result result = heap_update(relation, parentRelation, otid, (HeapTuple)newtup,
-        cid, crosscheck, wait, tmfd, allow_update_self);
+        cid, crosscheck, wait, tmfd, lockmode, allow_update_self);
 
     /* make update_indexes optional */
     if (update_indexes) {
@@ -863,7 +864,8 @@ TM_Result HeapamTupleLock(Relation relation, Tuple tuple, Buffer *buffer,
     bool allow_lock_self, bool follow_updates, bool eval, Snapshot snapshot, ItemPointer tid, bool isSelectForUpdate,
     bool isUpsert, TransactionId conflictXid)
 {
-    return heap_lock_tuple(relation, (HeapTuple)tuple, buffer, cid, mode, nowait, tmfd, allow_lock_self);
+    return heap_lock_tuple(relation, (HeapTuple)tuple, buffer, cid, mode, nowait, follow_updates, tmfd,
+        allow_lock_self);
 }
 
 Tuple HeapamTupleLockUpdated(CommandId cid, Relation relation, int lockmode, ItemPointer tid,
@@ -1598,7 +1600,8 @@ TM_Result UHeapamTupleDelete(Relation relation, ItemPointer tid, CommandId cid,
 TM_Result UHeapamTupleUpdate(Relation relation, Relation parentRelation, ItemPointer otid, Tuple newtup,
     CommandId cid, Snapshot crosscheck, Snapshot snapshot, bool wait, TupleTableSlot **oldslot, TM_FailureData *tmfd,
-    bool *update_indexes, Bitmapset **modifiedIdxAttrs, bool allow_update_self, bool allow_inplace_update)
+    LockTupleMode *mode, bool *update_indexes, Bitmapset **modifiedIdxAttrs, bool allow_update_self,
+    bool allow_inplace_update)
 {
     TM_Result result = UHeapUpdate(relation, parentRelation, otid, (UHeapTuple)newtup, cid, crosscheck, snapshot,
         wait, oldslot, tmfd, update_indexes, modifiedIdxAttrs, allow_inplace_update);
diff --git a/src/gausskernel/storage/access/transam/multixact.cpp b/src/gausskernel/storage/access/transam/multixact.cpp
index 9b49e1c22..e004bba99 100644
--- a/src/gausskernel/storage/access/transam/multixact.cpp
+++ b/src/gausskernel/storage/access/transam/multixact.cpp
@@ -3,12 +3,16 @@
  * multixact.cpp
  *      PostgreSQL multi-transaction-log manager
  *
- * The pg_multixact manager is a pg_clog-like manager that stores an array
- * of TransactionIds for each MultiXactId. It is a fundamental part of the
- * shared-row-lock implementation. A share-locked tuple stores a
- * MultiXactId in its Xmax, and a transaction that needs to wait for the
- * tuple to be unlocked can sleep on the potentially-several TransactionIds
- * that compose the MultiXactId.
+ * The pg_multixact manager is a pg_clog-like manager that stores an array of
+ * MultiXactMember for each MultiXactId. It is a fundamental part of the
+ * shared-row-lock implementation. Each MultiXactMember comprises a
+ * TransactionId and a set of flag bits (the high 3 bits record the status,
+ * the low 60 bits record the TransactionId).
+ *
+ * The meaning of the flag bits is opaque to this module, but they are mostly
+ * used in heapam.cpp to identify the lock mode that each member transaction
+ * is holding on any given tuple. This module just contains support to store
+ * and retrieve the arrays.
  *
  * We use two SLRU areas, one for storing the offsets at which the data
  * starts for each MultiXactId in the other one.
 This trick allows us to store variable length arrays of TransactionIds.
@@ -100,6 +104,14 @@ typedef struct MultiXactStateData {
     /* the Offset SLRU area was last truncated at this MultiXactId */
     MultiXactId lastTruncationPoint;
 
+    /*
+     * oldest multixact that is still on disk. Anything older than this should
+     * not be consulted.
+     */
+    MultiXactId oldestMultiXactId;
+    Oid oldestMultiXactDB;
+    MultiXactId multiVacLimit;
+
     /*
      * Per-backend data starts here. We have two arrays stored in the area
      * immediately following the MultiXactStateData struct. Each is indexed by
@@ -166,8 +178,8 @@ typedef struct MultiXactStateData {
 typedef struct mXactCacheEnt {
     struct mXactCacheEnt *next;
     MultiXactId multi;
-    int nxids;
-    TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */
+    int nmembers;
+    MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];
 } mXactCacheEnt;
 
 #ifdef MULTIXACT_DEBUG
@@ -182,25 +194,24 @@ typedef struct mXactCacheEnt {
 /* internal MultiXactId management */
 static void MultiXactIdSetOldestVisible(void);
-static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids);
-static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nxids, TransactionId *xids);
+static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members);
+static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nmembers, TransactionId *xidsWithStatus);
 static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
 
 /* MultiXact cache management */
-static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids);
-static int mXactCacheGetById(MultiXactId multi, TransactionId **xids);
-static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids);
+static int MXactMemberComparator(const void *arg1, const void *arg2);
+static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
+static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
+static void mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members);
 
-#ifdef MULTIXACT_DEBUG
-static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids);
-#endif
+static char *mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members);
+static const char *MXStatusToString(MultiXactStatus status);
 
 /* management of SLRU infrastructure */
 static int ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog);
 static int ZeroMultiXactMemberPage(int64 pageno, bool writeXlog);
 static void ExtendMultiXactOffset(MultiXactId multi);
 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
-static void TruncateMultiXact(void);
 static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
 static void get_multixact_pageno(uint8 info, int64 *pageno, XLogReaderState *record);
 
@@ -209,32 +220,36 @@ static void get_multixact_pageno(uint8 info, int64 *pageno, XLogReaderState *rec
 /*
  * MultiXactIdCreate
  *      Construct a MultiXactId representing two TransactionIds.
  *
- * The two XIDs must be different.
+ * The two XIDs must be different, or be requesting different statuses.
  *
  * NB - we don't worry about our local MultiXactId cache here, because that
  * is handled by the lower-level routines.
  */
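Since MultiXactIdCreate() now records a lock/update status per member, the typical caller pairs an existing locker with the acting transaction. A hypothetical example (the variable names are invented; the statuses are the ones this patch introduces):

    /* Transaction A holds FOR SHARE; we are about to delete the row, so the
     * new xmax must carry both members, ours with update status. */
    MultiXactId newXmax = MultiXactIdCreate(lockerXid, MultiXactStatusForShare,
                                            GetCurrentTransactionId(), MultiXactStatusUpdate);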
-MultiXactId MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
+MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
+    TransactionId xid2, MultiXactStatus status2)
 {
     MultiXactId newMulti;
-    TransactionId xids[2];
+    MultiXactMember members[2];
+    int nmembers = 2;
 
     AssertArg(TransactionIdIsValid(xid1));
     AssertArg(TransactionIdIsValid(xid2));
 
-    Assert(!TransactionIdEquals(xid1, xid2));
+    Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
 
     /*
      * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
      * are still running. In typical usage, xid2 will be our own XID and the
      * caller just did a check on xid1, so it'd be wasted effort.
      */
-    xids[0] = xid1;
-    xids[1] = xid2;
+    members[0].xid = xid1;
+    members[0].status = status1;
+    members[1].xid = xid2;
+    members[1].status = status2;
 
-    newMulti = CreateMultiXactId(2, xids);
+    newMulti = CreateMultiXactId(2, members);
 
-    ereport(DEBUG2, (errmsg("Create: returning " XID_FMT " for " XID_FMT ", " XID_FMT, newMulti, xid1, xid2)));
+    ereport(DEBUG2, (errmsg("Create: %s", mxid_to_string(newMulti, nmembers, members))));
 
     return newMulti;
 }
 
@@ -243,21 +258,22 @@
 /*
  * MultiXactIdExpand
  *      Add a TransactionId to a pre-existing MultiXactId.
  *
- * If the TransactionId is already a member of the passed MultiXactId,
- * just return it as-is.
+ * If the TransactionId is already a member of the passed MultiXactId with
+ * the same status, just return it as-is.
  *
  * Note that we do NOT actually modify the membership of a pre-existing
  * MultiXactId; instead we create a new one. This is necessary to avoid
- * a race condition against MultiXactIdWait (see notes there).
+ * a race condition against code trying to wait for one MultiXactId to finish;
+ * see notes in heapam.cpp.
  *
 * NB - we don't worry about our local MultiXactId cache here, because that
  * is handled by the lower-level routines.
  */
-MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid)
+MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 {
     MultiXactId newMulti;
-    TransactionId *members = NULL;
-    TransactionId *newMembers = NULL;
+    MultiXactMember *members = NULL;
+    MultiXactMember *newMembers = NULL;
     int nmembers;
     int i;
     int j;
@@ -265,7 +281,8 @@ MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid)
     AssertArg(MultiXactIdIsValid(multi));
     AssertArg(TransactionIdIsValid(xid));
 
-    ereport(DEBUG2, (errmsg("Expand: received multi " XID_FMT ", xid " XID_FMT, multi, xid)));
+    ereport(DEBUG2, (errmsg("Expand: received multi " XID_FMT ", xid " XID_FMT ", status %s", multi, xid,
+        MXStatusToString(status))));
 
     nmembers = GetMultiXactIdMembers(multi, &members);
     if (nmembers < 0) {
@@ -276,18 +293,21 @@ MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid)
          * caller, but it would complicate the API and it's unlikely to happen
          * too often, so just deal with it by creating a singleton MultiXact.
          */
-        newMulti = CreateMultiXactId(1, &xid);
+        MultiXactMember member;
+        member.xid = xid;
+        member.status = status;
+        newMulti = CreateMultiXactId(1, &member);
 
         ereport(DEBUG2, (errmsg("Expand: " XID_FMT " has no members, create singleton " XID_FMT, multi, newMulti)));
         return newMulti;
     }
 
     /*
-     * If the TransactionId is already a member of the MultiXactId, just
-     * return the existing MultiXactId.
+     * If the TransactionId is already a member of the MultiXactId with the
+     * same status, just return the existing MultiXactId.
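One consequence of status-qualified membership: the same xid may legitimately appear in a multi twice under different statuses, which is how an in-transaction lock upgrade is represented. A hypothetical call under that assumption:

    /* We already appear in `multi` as a FOR KEY SHARE locker; requesting a
     * stronger mode allocates a fresh multi that also carries the new status. */
    MultiXactId upgraded = MultiXactIdExpand(multi, GetCurrentTransactionId(),
                                             MultiXactStatusForUpdate);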
*/ for (i = 0; i < nmembers; i++) { - if (TransactionIdEquals(members[i], xid)) { + if (TransactionIdEquals(members[i].xid, xid) && members[i].status == status) { ereport(DEBUG2, (errmsg("Expand: " XID_FMT " is already a member of " XID_FMT, xid, multi))); pfree(members); members = NULL; @@ -296,19 +316,27 @@ MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid) } /* - * Determine which of the members of the MultiXactId are still running, - * and use them to create a new one. (Removing dead members is just an - * optimization, but a useful one. Note we have the same race condition - * here as above: j could be 0 at the end of the loop.) + * Determine which of the members of the MultiXactId are still of interest. + * This is any running transaction, and also any transaction that grabbed + * something stronger than just a lock and was committed. (An update that + * aborted is of no interest here.) + * + * (Removing dead members is just an optimization, but a useful one. + * Note we have the same race condition here as above: j could be 0 at the + * end of the loop.) */ - newMembers = (TransactionId *)palloc(sizeof(TransactionId) * (unsigned)(nmembers + 1)); + newMembers = (MultiXactMember *)palloc(sizeof(MultiXactMember) * (unsigned)(nmembers + 1)); for (i = 0, j = 0; i < nmembers; i++) { - if (TransactionIdIsInProgress(members[i])) - newMembers[j++] = members[i]; + if (TransactionIdIsInProgress(members[i].xid) || + (ISUPDATE_from_mxstatus(members[i].status) && TransactionIdDidCommit(members[i].xid))) { + newMembers[j].xid = members[i].xid; + newMembers[j++].status = members[i].status; + } } - newMembers[j++] = xid; + newMembers[j].xid = xid; + newMembers[j++].status = status; newMulti = CreateMultiXactId(j, newMembers); pfree(members); @@ -331,25 +359,27 @@ MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid) */ bool MultiXactIdIsRunning(MultiXactId multi) { - TransactionId *members = NULL; + MultiXactMember *members = NULL; int nmembers; int i; ereport(DEBUG2, (errmsg("IsRunning " XID_FMT "?", multi))); nmembers = GetMultiXactIdMembers(multi, &members); - if (nmembers < 0) { + if (nmembers <= 0) { ereport(DEBUG2, (errmsg("IsRunning: no members"))); return false; } /* - * Checking for myself is cheap compared to looking in shared memory, so - * first do the equivalent of MultiXactIdIsCurrent(). This is not needed - * for correctness, it's just a fast path. + * Checking for myself is cheap compared to looking in shared memory; + * return true if any live subtransaction of the current top-level + * transaction is a member. + * + * This is not needed for correctness, it's just a fast path. */ for (i = 0; i < nmembers; i++) { - if (TransactionIdIsCurrentTransactionId(members[i])) { + if (TransactionIdIsCurrentTransactionId(members[i].xid)) { ereport(DEBUG2, (errmsg("IsRunning: I (%d) am running!", i))); pfree(members); members = NULL; @@ -363,8 +393,8 @@ bool MultiXactIdIsRunning(MultiXactId multi) * cases nmembers should be small enough that it doesn't much matter. 
*/ for (i = 0; i < nmembers; i++) { - if (TransactionIdIsInProgress(members[i])) { - ereport(DEBUG2, (errmsg("IsRunning: member %d (" XID_FMT ") is running", i, members[i]))); + if (TransactionIdIsInProgress(members[i].xid)) { + ereport(DEBUG2, (errmsg("IsRunning: member %d (" XID_FMT ") is running", i, members[i].xid))); pfree(members); members = NULL; return true; @@ -392,7 +422,7 @@ bool MultiXactIdIsRunning(MultiXactId multi) bool MultiXactIdIsCurrent(MultiXactId multi) { bool result = false; - TransactionId *members = NULL; + MultiXactMember *members = NULL; int nmembers; int i; @@ -402,7 +432,7 @@ bool MultiXactIdIsCurrent(MultiXactId multi) } for (i = 0; i < nmembers; i++) { - if (TransactionIdIsCurrentTransactionId(members[i])) { + if (TransactionIdIsCurrentTransactionId(members[i].xid)) { result = true; break; } @@ -415,15 +445,16 @@ bool MultiXactIdIsCurrent(MultiXactId multi) /* * MultiXactIdSetOldestMember - * Save the oldest MultiXactId this transaction could be a member of. + * Save the oldest MultiXactId this transaction could be a member of. * - * We set the OldestMemberMXactId for a given transaction the first time - * it's going to acquire a shared lock. We need to do this even if we end - * up using a TransactionId instead of a MultiXactId, because there is a - * chance that another transaction would add our XID to a MultiXactId. + * We set the OldestMemberMXactId for a given transaction the first time it's + * going to do some operation that might require a MultiXactId (tuple lock, + * update or delete). We need to do this even if we end up using a + * TransactionId instead of a MultiXactId, because there is a chance that + * another transaction would add our XID to a MultiXactId. * - * The value to set is the next-to-be-assigned MultiXactId, so this is meant - * to be called just before acquiring a shared lock. + * The value to set is the next-to-be-assigned MultiXactId, so this is meant to + * be called just before doing any such possibly-MultiXactId-able operation. */ void MultiXactIdSetOldestMember(void) { @@ -438,8 +469,13 @@ void MultiXactIdSetOldestMember(void) * another someone else could compute an OldestVisibleMXactId that * would be after the value we are going to store when we get control * back. Which would be wrong. + * + * Note that a shared lock is sufficient, because it's enough to stop + * someone from advancing nextMXact; and nobody else could be trying to + * write to our OldestMember entry, only reading (and we assume storing + * it is atomic.) */ - (void)LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + (void)LWLockAcquire(MultiXactGenLock, LW_SHARED); /* * We have to beware of the possibility that nextMXact is in the @@ -527,73 +563,101 @@ MultiXactId ReadNextMultiXactId(void) } /* - * MultiXactIdWait - * Sleep on a MultiXactId. + * DoMultiXactIdWait + * Actual implementation for the two functions below. * * We do this by sleeping on each member using XactLockTableWait. Any * members that belong to the current backend are *not* waited for, however; * this would not merely be useless but would lead to Assert failure inside * XactLockTableWait. By the time this returns, it is certain that all * transactions *of other backends* that were members of the MultiXactId - * are dead (and no new ones can have been added, since it is not legal - * to add members to an existing MultiXactId). + * that conflict with the requested status are dead (and no new ones can have + * been added, since it is not legal to add members to an existing + * MultiXactId). 
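The waiting logic below only sleeps on members whose lock mode actually conflicts with the requested status. For reference, upstream PostgreSQL maps the statuses to heavyweight lock modes as sketched here, and this patch appears to follow the same scheme (an assumption; the authoritative table backs LOCKMODE_FROM_MXSTATUS):

    /* status                            -> mode used for conflict tests
     * MultiXactStatusForKeyShare        -> AccessShareLock
     * MultiXactStatusForShare           -> RowShareLock
     * MultiXactStatus(For)NoKeyUpdate   -> ExclusiveLock
     * MultiXactStatus(For)Update        -> AccessExclusiveLock
     *
     * e.g. a FOR KEY SHARE member does not block a ForNoKeyUpdate waiter
     * (AccessShareLock and ExclusiveLock do not conflict), but it does block
     * a ForUpdate waiter (AccessShareLock conflicts with AccessExclusiveLock),
     * which is exactly the filter applied per member: */
    if (!DoLockModesConflict(LOCKMODE_FROM_MXSTATUS(memstatus), LOCKMODE_FROM_MXSTATUS(status)))
        continue; /* non-conflicting locker: skip it, optionally counting it */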
* * But by the time we finish sleeping, someone else may have changed the Xmax * of the containing tuple, so the caller needs to iterate on us somehow. + * + * Note that in case we return false, the number of remaining members is + * not to be trusted. */ -void MultiXactIdWait(MultiXactId multi, bool allow_con_update) +bool DoMultiXactIdWait(MultiXactId multi, MultiXactStatus status, int *remaining, bool nowait) { - TransactionId *members = NULL; - int nmembers = 0; + bool result = true; + MultiXactMember *members = NULL; + int nmembers; + int remain = 0; nmembers = GetMultiXactIdMembers(multi, &members); - if (nmembers >= 0) { - int i; - for (i = 0; i < nmembers; i++) { - TransactionId member = members[i]; + for (int i = 0; i < nmembers; i++) { + TransactionId memxid = members[i].xid; + MultiXactStatus memstatus = members[i].status; - ereport(DEBUG2, (errmsg("MultiXactIdWait: waiting for %d (%lu)", i, member))); - if (!TransactionIdIsCurrentTransactionId(member)) - XactLockTableWait(member, allow_con_update); + if (TransactionIdIsCurrentTransactionId(memxid)) { + remain++; + continue; } - pfree(members); - members = NULL; + if (!DoLockModesConflict(LOCKMODE_FROM_MXSTATUS(memstatus), LOCKMODE_FROM_MXSTATUS(status))) { + if (remaining && TransactionIdIsInProgress(memxid)) + remain++; + continue; + } + + /* + * This member conflicts with our multi, so we have to sleep (or + * return failure, if asked to avoid waiting.) + */ + if (nowait) { + result = ConditionalXactLockTableWait(memxid); + if (!result) { + break; + } + } else { + XactLockTableWait(memxid, true); + } } + + pfree_ext(members); + + if (remaining) + *remaining = remain; + + return result; +} + +/* + * MultiXactIdWait + * Sleep on a MultiXactId. + * + * By the time we finish sleeping, someone else may have changed the Xmax + * of the containing tuple, so the caller needs to iterate on us somehow. + * + * We return (in *remaining, if not NULL) the number of members that are still + * running, including any (non-aborted) subtransactions of our own transaction. + */ +void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, int *remaining) +{ + DoMultiXactIdWait(multi, status, remaining, false); } /* * ConditionalMultiXactIdWait - * As above, but only lock if we can get the lock without blocking. + * As above, but only lock if we can get the lock without blocking. + * + * By the time we finish sleeping, someone else may have changed the Xmax + * of the containing tuple, so the caller needs to iterate on us somehow. + * + * If the multixact is now all gone, return true. Returns false if some + * transactions might still be running. + * + * We return (in *remaining, if not NULL) the number of members that are still + * running, including any (non-aborted) subtransactions of our own transaction. 
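A sketch of the caller pattern the two wrappers support; the surrounding variable names are assumptions (cf. the tuple-locking paths elsewhere in this patch set):

    int remain = 0;
    if (nowait) {
        if (!ConditionalMultiXactIdWait((MultiXactId)xwait,
                                        MultiXactStatusUpdate, &remain))
            ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
                            errmsg("could not obtain lock on row in relation \"%s\"",
                                   RelationGetRelationName(relation))));
    } else {
        MultiXactIdWait((MultiXactId)xwait, MultiXactStatusUpdate, &remain);
    }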
  */
-bool ConditionalMultiXactIdWait(MultiXactId multi)
+bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, int *remaining)
 {
-    bool result = true;
-    TransactionId *members = NULL;
-    int nmembers;
-
-    nmembers = GetMultiXactIdMembers(multi, &members);
-    if (nmembers >= 0) {
-        int i;
-
-        for (i = 0; i < nmembers; i++) {
-            TransactionId member = members[i];
-
-            ereport(DEBUG2, (errmsg("ConditionalMultiXactIdWait: trying %d (%lu)", i, member)));
-            if (!TransactionIdIsCurrentTransactionId(member)) {
-                result = ConditionalXactLockTableWait(member);
-                if (!result) {
-                    break;
-                }
-            }
-        }
-
-        pfree(members);
-        members = NULL;
-    }
-
-    return result;
+    return DoMultiXactIdWait(multi, status, remaining, true);
 }
 
 /*
@@ -603,18 +667,30 @@
  * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
  * given TransactionIds as members.  Returns the newly created MultiXactId.
  *
- * NB: the passed xids[] array will be sorted in-place.
+ * NB: the passed members[] array will be sorted in-place.
  */
-static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids)
+static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members)
 {
     MultiXactId multi;
     MultiXactOffset offset;
+    TransactionId *xidsWithStatus;
     xl_multixact_create xlrec;
 
-    debug_elog3(DEBUG2, "Create: %s", mxid_to_string(InvalidMultiXactId, nxids, xids));
+    if (t_thrd.proc->workingVersionNum < ENHANCED_TUPLE_LOCK_VERSION_NUM) {
+        for (int i = 0; i < nmembers; ++i) {
+            if (members[i].status != MultiXactStatusForShare) {
+                ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+                    errmsg("New MultiXact feature is not supported in this version. Please upgrade to version: %d, "
+                           "i: %d, members[i].status: %d",
+                           ENHANCED_TUPLE_LOCK_VERSION_NUM, i, members[i].status)));
+            }
+        }
+    }
+
+    debug_elog3(DEBUG2, "Create: %s", mxid_to_string(InvalidMultiXactId, nmembers, members));
 
     /*
-     * See if the same set of XIDs already exists in our cache; if so, just
+     * See if the same set of members already exists in our cache; if so, just
      * re-use that MultiXactId. (Note: it might seem that looking in our
      * cache is insufficient, and we ought to search disk to see if a
      * duplicate definition already exists. But since we only ever create
@@ -623,17 +699,38 @@ static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids)
      * corner cases where someone else added us to a MultiXact without our
      * knowledge, but it's not worth checking for.)
      */
-    multi = mXactCacheGetBySet(nxids, xids);
+    multi = mXactCacheGetBySet(nmembers, members);
     if (MultiXactIdIsValid(multi)) {
         ereport(DEBUG2, (errmsg("Create: in cache!")));
         return multi;
     }
 
+    /* Verify that there is a single update Xid among the given members. */
+    {
+        int i;
+        bool has_update = false;
+
+        for (i = 0; i < nmembers; i++) {
+            if (ISUPDATE_from_mxstatus(members[i].status)) {
+                if (has_update)
+                    ereport(ERROR, (errmsg("new multixact has more than one updating member")));
+                has_update = true;
+            }
+        }
+    }
+
+    xidsWithStatus = (TransactionId *)palloc((unsigned)nmembers * sizeof(TransactionId));
+
+    /* High 3 bits record the status, low 60 bits record the xid */
+    for (int i = 0; i < nmembers; ++i) {
+        xidsWithStatus[i] = GET_SLRU_XID_FROM_MULTIXACT_MEMBER(members + i);
+    }
+
     /*
      * Assign the MXID and offsets range to use, and make sure there is space
      * in the OFFSETs and MEMBERs files.  NB: this routine does START_CRIT_SECTION().
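CreateMultiXactId() packs each member's status into the SLRU word together with its xid. The comment "high 3 bits record the status, low 60 bits record the xid" suggests definitions along these lines; the exact shift and mask are an assumption, and the real macros live in the multixact headers:

    /* Assumed layout sketch -- not the patch's actual macro definitions. */
    #define MXACT_MEMBER_XID_MASK   ((((uint64)1) << 60) - 1)
    #define GET_MEMBER_XID_FROM_SLRU_XID(x) \
        ((TransactionId)((x) & MXACT_MEMBER_XID_MASK))
    #define GET_MEMBER_STATUS_FROM_SLRU_XID(x) \
        ((MultiXactStatus)((uint64)(x) >> 61))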
*/ - multi = GetNewMultiXactId(nxids, &offset); + multi = GetNewMultiXactId(nmembers, &offset); /* * Make an XLOG entry describing the new MXID. @@ -650,22 +747,24 @@ static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids) */ xlrec.mid = multi; xlrec.moff = offset; - xlrec.nxids = nxids; + xlrec.nxids = nmembers; XLogBeginInsert(); XLogRegisterData((char *)(&xlrec), MinSizeOfMultiXactCreate); - XLogRegisterData((char *)xids, (unsigned)nxids * sizeof(TransactionId)); + XLogRegisterData((char *)xidsWithStatus, (unsigned)nmembers * sizeof(TransactionId)); (void)XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID); /* Now enter the information into the OFFSETs and MEMBERs logs */ - RecordNewMultiXact(multi, offset, nxids, xids); + RecordNewMultiXact(multi, offset, nmembers, xidsWithStatus); /* Done with critical section */ END_CRIT_SECTION(); /* Store the new MultiXactId in the local cache, too */ - mXactCachePut(multi, nxids, xids); + mXactCachePut(multi, nmembers, members); + + pfree(xidsWithStatus); ereport(DEBUG2, (errmsg("Create: all done"))); @@ -678,7 +777,7 @@ static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids) * * This is broken out of CreateMultiXactId so that xlog replay can use it. */ -static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nxids, TransactionId *xids) +static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nmembers, TransactionId *xidsWithStatus) { int64 pageno; int64 prev_pageno; @@ -714,7 +813,7 @@ static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nx prev_pageno = -1; - for (i = 0; i < nxids; i++, offset++) { + for (i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr = NULL; pageno = (int64)MXOffsetToMemberPage(offset); @@ -728,7 +827,8 @@ static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nx memberptr = (TransactionId *)t_thrd.shemem_ptr_cxt.MultiXactMemberCtl->shared->page_buffer[slotno]; memberptr += entryno; - *memberptr = xids[i]; + /* High 3 bits record the status, low 60 bits record the xid */ + *memberptr = xidsWithStatus[i]; t_thrd.shemem_ptr_cxt.MultiXactMemberCtl->shared->page_dirty[slotno] = true; } @@ -751,12 +851,12 @@ static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int nx * We start a critical section before advancing the shared counters. The * caller must end the critical section after writing SLRU data. 
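The allocation/WAL/SLRU sequence has a strict shape: GetNewMultiXactId() returns with a critical section already open, and the caller must write the WAL record and the SLRU entries before closing it, exactly as CreateMultiXactId() above does. Recap sketch using the same calls as that function, not new API:

    multi = GetNewMultiXactId(nmembers, &offset);   /* START_CRIT_SECTION() inside */
    /* ... XLogBeginInsert()/XLogRegisterData() ... */
    (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID);
    RecordNewMultiXact(multi, offset, nmembers, xidsWithStatus);
    END_CRIT_SECTION();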
*/ -static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset) +static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset) { MultiXactId result; MultiXactOffset nextOffset; - ereport(DEBUG2, (errmsg("GetNew: for %d xids", nxids))); + ereport(DEBUG2, (errmsg("GetNew: for %d xids", nmembers))); /* MultiXactIdSetOldestMember() must have been called already */ Assert(MultiXactIdIsValid(t_thrd.shemem_ptr_cxt.OldestMemberMXactId[t_thrd.proc_cxt.MyBackendId])); @@ -782,11 +882,11 @@ static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset) nextOffset = t_thrd.shemem_ptr_cxt.MultiXactState->nextOffset; if (nextOffset == 0) { *offset = 1; - nxids++; /* allocate member slot 0 too */ + nmembers++; /* allocate member slot 0 too */ } else *offset = nextOffset; - ExtendMultiXactMember(nextOffset, nxids); + ExtendMultiXactMember(nextOffset, nmembers); /* * Critical section from here until caller has written the data into the @@ -808,7 +908,7 @@ static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset) */ (t_thrd.shemem_ptr_cxt.MultiXactState->nextMXact)++; - t_thrd.shemem_ptr_cxt.MultiXactState->nextOffset += (unsigned)nxids; + t_thrd.shemem_ptr_cxt.MultiXactState->nextOffset += (unsigned)nmembers; LWLockRelease(MultiXactGenLock); @@ -818,13 +918,17 @@ static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset) /* * GetMultiXactIdMembers - * Returns the set of TransactionIds that make up a MultiXactId + * Returns the set of MultiXactMembers that make up a MultiXactId * - * We return -1 if the MultiXactId is too old to possibly have any members - * still running; in that case we have not actually looked them up, and - * *xids is not set. + * If the given MultiXactId is older than the value we know to be oldest, we + * return -1. + * + * Other border conditions, such as trying to read a value that's larger than + * the value currently known as the next to assign, raise an error. Previously + * these also returned -1, but since this can lead to the wrong visibility + * results, it is dangerous to do that. */ -int GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) +int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members) { int64 pageno; int64 prev_pageno; @@ -838,16 +942,17 @@ int GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) MultiXactId nextMXact; MultiXactId tmpMXact; MultiXactOffset nextOffset; - TransactionId *ptr = NULL; + MultiXactMember *ptr = NULL; + MultiXactId oldestMXact; ereport(DEBUG2, (errmsg("GetMembers: asked for " XID_FMT, multi))); Assert(MultiXactIdIsValid(multi)); /* See if the MultiXactId is in the local cache */ - length = mXactCacheGetById(multi, xids); + length = mXactCacheGetById(multi, members); if (length >= 0) { - debug_elog3(DEBUG2, "GetMembers: found %s in the cache", mxid_to_string(multi, length, *xids)); + debug_elog3(DEBUG2, "GetMembers: found %s in the cache", mxid_to_string(multi, length, *members)); return length; } @@ -857,36 +962,29 @@ int GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) /* * We check known limits on MultiXact before resorting to the SLRU area. * - * An ID older than our OldestVisibleMXactId[] entry can't possibly still - * be running, and we'd run the risk of trying to read already-truncated - * SLRU data if we did try to examine it. 
- * - * Conversely, an ID >= nextMXact shouldn't ever be seen here; + * An ID >= nextMXact shouldn't ever be seen here; * * Shared lock is enough here since we aren't modifying any global state. - * Also, we can examine our own OldestVisibleMXactId without the lock, - * since no one else is allowed to change it. - */ - if (MultiXactIdPrecedes(multi, t_thrd.shemem_ptr_cxt.OldestVisibleMXactId[t_thrd.proc_cxt.MyBackendId])) { - ereport(DEBUG2, (errmsg("GetMembers: it's too old"))); - *xids = NULL; - return -1; - } - - /* - * Acquire the shared lock just long enough to grab the current counter - * values. We may need both nextMXact and nextOffset; see below. + * Acquire it just long enough to grab the current counter values. We may + * need both nextMXact and nextOffset; see below. */ (void)LWLockAcquire(MultiXactGenLock, LW_SHARED); + oldestMXact = t_thrd.shemem_ptr_cxt.MultiXactState->oldestMultiXactId; nextMXact = t_thrd.shemem_ptr_cxt.MultiXactState->nextMXact; nextOffset = t_thrd.shemem_ptr_cxt.MultiXactState->nextOffset; LWLockRelease(MultiXactGenLock); + if (MultiXactIdPrecedes(multi, oldestMXact)) { + ereport(DEBUG2, (errmsg("MultiXactId %lu does no longer exist -- apparent wraparound", multi))); + *members = NULL; + return -1; + } + if (!MultiXactIdPrecedes(multi, nextMXact)) { - ereport(DEBUG2, (errmsg("GetMembers: it's too new!"))); - *xids = NULL; + ereport(DEBUG2, (errmsg("MultiXactId %lu has not been created yet -- apparent wraparound", multi))); + *members = NULL; return -1; } @@ -910,8 +1008,9 @@ int GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids) * multixact, when we read zero as the next multixact's offset, we know we * have this case. We sleep for a bit and try again. * - * 3. Because GetNewMultiXactId increments offset zero to offset one - * If we see next multixact's offset is one, is that our multixact's actual + * 3. Because GetNewMultiXactId increments offset zero to offset one to + * handle case #2, there is an ambiguity near the point of offset + * wraparound. If we see next multixact's offset is one, is that our multixact's actual * endpoint, or did it end at zero with a subsequent increment? We * handle this using the knowledge that if the zero'th member slot wasn't * filled, it'll contain zero, and zero isn't a valid transaction ID so it can't @@ -968,8 +1067,8 @@ retry: LWLockRelease(MultiXactOffsetControlLock); - ptr = (TransactionId *)palloc((unsigned)length * sizeof(TransactionId)); - *xids = ptr; + ptr = (MultiXactMember *)palloc((unsigned)length * sizeof(MultiXactMember)); + *members = ptr; /* Now get the members themselves. 
 */
     (void)LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
@@ -978,6 +1077,7 @@ retry:
     prev_pageno = -1;
     for (i = 0; i < length; i++, offset++) {
         TransactionId *xactptr = NULL;
+        TransactionId memberXid;

         pageno = (int64)MXOffsetToMemberPage(offset);
         entryno = MXOffsetToMemberEntry(offset);
@@ -990,13 +1090,16 @@ retry:
         xactptr = (TransactionId *)t_thrd.shemem_ptr_cxt.MultiXactMemberCtl->shared->page_buffer[slotno];
         xactptr += entryno;

-        if (!TransactionIdIsValid(*xactptr)) {
+        memberXid = GET_MEMBER_XID_FROM_SLRU_XID(*xactptr);
+        if (!TransactionIdIsValid(memberXid)) {
             /* Corner case 3: we must be looking at unused slot zero */
             Assert(offset == 0);
             continue;
         }

-        ptr[truelength++] = *xactptr;
+        ptr[truelength].xid = memberXid;
+        ptr[truelength].status = GET_MEMBER_STATUS_FROM_SLRU_XID(*xactptr);
+        ++truelength;
     }

     LWLockRelease(MultiXactMemberControlLock);
@@ -1010,6 +1113,41 @@ retry:
     return truelength;
 }

+/*
+ * MXactMemberComparator
+ *      qsort comparison function for MultiXactMember
+ */
+static int MXactMemberComparator(const void *arg1, const void *arg2)
+{
+    MultiXactMember member1 = *(const MultiXactMember *)arg1;
+    MultiXactMember member2 = *(const MultiXactMember *)arg2;
+
+    if (member1.xid > member2.xid) {
+        return 1;
+    }
+    if (member1.xid < member2.xid) {
+        return -1;
+    }
+
+    /*
+     * For compatibility, we set MultiXactStatusForShare = 0x00. But in terms
+     * of lock strength, MultiXactStatusForKeyShare = 0x01 is weaker than
+     * MultiXactStatusForShare, so we swap these two values when comparing.
+     */
+    int status1 = (member1.status == MultiXactStatusForShare) ? 1 :
+        ((member1.status == MultiXactStatusForKeyShare) ? 0 : (int)(member1.status));
+    int status2 = (member2.status == MultiXactStatusForShare) ? 1 :
+        ((member2.status == MultiXactStatusForKeyShare) ? 0 : (int)(member2.status));
+    if (status1 > status2) {
+        return 1;
+    }
+    if (status1 < status2) {
+        return -1;
+    }
+    return 0;
+}
+
 /*
  * mXactCacheGetBySet
  *      returns a MultiXactId from the cache based on the set of
@@ -1021,23 +1159,23 @@ retry:
  * for the majority of tuples, thus keeping MultiXactId usage low (saving
  * both I/O).
  *
- * NB: the passed xids[] array will be sorted in-place.
+ * NB: the passed members[] array will be sorted in-place.
*/ -static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids) +static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members) { mXactCacheEnt *entry = NULL; - debug_elog3(DEBUG2, "CacheGet: looking for %s", mxid_to_string(InvalidMultiXactId, nxids, xids)); + debug_elog3(DEBUG2, "CacheGet: looking for %s", mxid_to_string(InvalidMultiXactId, nmembers, members)); /* sort the array so comparison is easy */ - qsort(xids, nxids, sizeof(TransactionId), xidComparator); + qsort(members, nmembers, sizeof(MultiXactMember), MXactMemberComparator); for (entry = t_thrd.xact_cxt.MXactCache; entry != NULL; entry = entry->next) { - if (entry->nxids != nxids) + if (entry->nmembers != nmembers) continue; /* We assume the cache entries are sorted */ - if (memcmp(xids, entry->xids, (unsigned)nxids * sizeof(TransactionId)) == 0) { + if (memcmp(members, entry->members, (unsigned)nmembers * sizeof(MultiXactMember)) == 0) { ereport(DEBUG2, (errmsg("CacheGet: found " XID_FMT, entry->multi))); return entry->multi; } @@ -1049,13 +1187,13 @@ static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids) /* * mXactCacheGetById - * returns the composing TransactionId set from the cache for a + * returns the composing MultiXactMember set from the cache for a * given MultiXactId, if present. * - * If successful, *xids is set to the address of a palloc'd copy of the - * TransactionId set. Return value is number of members, or -1 on failure. + * If successful, *members is set to the address of a palloc'd copy of the + * MultiXactMember set. Return value is number of members, or -1 on failure. */ -static int mXactCacheGetById(MultiXactId multi, TransactionId **xids) +static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members) { mXactCacheEnt *entry = NULL; errno_t rc = EOK; @@ -1064,18 +1202,18 @@ static int mXactCacheGetById(MultiXactId multi, TransactionId **xids) for (entry = t_thrd.xact_cxt.MXactCache; entry != NULL; entry = entry->next) { if (entry->multi == multi) { - TransactionId *ptr = NULL; + MultiXactMember *ptr = NULL; Size size; - size = sizeof(TransactionId) * (unsigned)entry->nxids; - ptr = (TransactionId *)palloc(size); - *xids = ptr; + size = sizeof(MultiXactMember) * (unsigned)entry->nmembers; + ptr = (MultiXactMember *)palloc(size); + *members = ptr; - rc = memcpy_s(ptr, size, entry->xids, size); + rc = memcpy_s(ptr, size, entry->members, size); securec_check(rc, "", ""); - debug_elog3(DEBUG2, "CacheGet: found %s", mxid_to_string(multi, entry->nxids, entry->xids)); - return entry->nxids; + debug_elog3(DEBUG2, "CacheGet: found %s", mxid_to_string(multi, entry->nmembers, entry->members)); + return entry->nmembers; } } @@ -1087,12 +1225,12 @@ static int mXactCacheGetById(MultiXactId multi, TransactionId **xids) * mXactCachePut * Add a new MultiXactId and its composing set into the local cache. 
 */
-static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
+static void mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
 {
     mXactCacheEnt *entry = NULL;
     errno_t rc = EOK;

-    debug_elog3(DEBUG2, "CachePut: storing %s", mxid_to_string(multi, nxids, xids));
+    debug_elog3(DEBUG2, "CachePut: storing %s", mxid_to_string(multi, nmembers, members));

     if (t_thrd.xact_cxt.MXactContext == NULL) {
         /* The cache only lives as long as the current transaction */
@@ -1102,51 +1240,66 @@ static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
             ALLOCSET_SMALL_MAXSIZE);
     }

-    entry =
-        (mXactCacheEnt *)MemoryContextAlloc(t_thrd.xact_cxt.MXactContext,
-            offsetof(mXactCacheEnt, xids) + (unsigned)nxids * sizeof(TransactionId));
+    entry = (mXactCacheEnt *)MemoryContextAlloc(t_thrd.xact_cxt.MXactContext,
+        offsetof(mXactCacheEnt, members) + (unsigned)nmembers * sizeof(MultiXactMember));

     entry->multi = multi;
-    entry->nxids = nxids;
-    rc = memcpy_s(entry->xids, (unsigned)nxids * sizeof(TransactionId), xids, (unsigned)nxids * sizeof(TransactionId));
+    entry->nmembers = nmembers;
+    rc = memcpy_s(entry->members, (unsigned)nmembers * sizeof(MultiXactMember), members,
+        (unsigned)nmembers * sizeof(MultiXactMember));
     securec_check(rc, "", "");

     /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
-    qsort(entry->xids, nxids, sizeof(TransactionId), xidComparator);
+    qsort(entry->members, nmembers, sizeof(MultiXactMember), MXactMemberComparator);

     entry->next = t_thrd.xact_cxt.MXactCache;
     t_thrd.xact_cxt.MXactCache = entry;
 }

-#ifdef MULTIXACT_DEBUG
-static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids)
+static const char *MXStatusToString(MultiXactStatus status)
 {
-#define XIDLEN 17
+    switch (status) {
+        case MultiXactStatusForKeyShare:
+            return "keysh";
+        case MultiXactStatusForShare:
+            return "sh";
+        case MultiXactStatusForNoKeyUpdate:
+            return "fornokeyupd";
+        case MultiXactStatusForUpdate:
+            return "forupd";
+        case MultiXactStatusNoKeyUpdate:
+            return "nokeyupd";
+        case MultiXactStatusUpdate:
+            return "upd";
+        default:
+            elog(ERROR, "unrecognized multixact status %d", (int)status);
+            return "";
+    }
+}

-    size_t total_len = 15 * (nxids + 1) + 4;
-    char *str = palloc0(total_len);
+static char *mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
+{
+    char *str = NULL;
+    StringInfoData buf;
     int i;
-    int len = 0;
-    errno_t errorno = EOK;

-    len = total_len;
-    errorno = snprintf_s(str, len, len - 1, XID_FMT " %d[" XID_FMT, multi, nxids, xids[0]);
-    securec_check_ss(errorno, "", "");
+    initStringInfo(&buf);

-    for (i = 1; i < nxids; i++) {
-        size_t used_len = strlen(str);
-        len = total_len - used_len;
-        errorno = snprintf_s(str + used_len, len, len - 1, ", " XID_FMT, xids[i]);
-        securec_check_ss(errorno, "", "");
+    appendStringInfo(&buf, XID_FMT " %d[" XID_FMT " (%s)", multi, nmembers,
+        members[0].xid, MXStatusToString(members[0].status));
+
+    for (i = 1; i < nmembers; i++) {
+        appendStringInfo(&buf, ", " XID_FMT " (%s)", members[i].xid, MXStatusToString(members[i].status));
     }

-    len = total_len - strlen(str);
-    errorno = strcat_s(str, len, "]");
-    securec_check(errorno, "", "");
-
+    appendStringInfoChar(&buf, ']');
+    str = MemoryContextStrdup(SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_EXECUTOR), buf.data);
+    pfree(buf.data);
     return str;
 }
-#endif

 /*
  * AtEOXact_MultiXact
@@ -1176,7 +1329,7 @@ void AtEOXact_MultiXact(void)

 /*
  * AtPrepare_MultiXact
- *      Save multixact state at 2PC
tranasction prepare + * Save multixact state at 2PC transaction prepare * * In this phase, we only store our OldestMemberMXactId value in the two-phase * state file. @@ -1270,7 +1423,7 @@ void multixact_twophase_postcommit(TransactionId xid, uint16 info, void *recdata /* * multixact_twophase_postabort - * This is actually just the same as the COMMIT case. + * This is actually just the same as the COMMIT case. */ void multixact_twophase_postabort(TransactionId xid, uint16 info, void *recdata, uint32 len) { @@ -1526,6 +1679,7 @@ void CheckPointMultiXact(void) flush_num = SimpleLruFlush(t_thrd.shemem_ptr_cxt.MultiXactMemberCtl, true); g_instance.ckpt_cxt_ctl->ckpt_multixact_flush_num += flush_num; +#ifdef ENABLE_MULTIPLE_NODES /* * Truncate the SLRU files. This could be done at any time, but * checkpoint seems a reasonable place for it. There is one exception: if @@ -1536,6 +1690,7 @@ void CheckPointMultiXact(void) */ if (!RecoveryInProgress()) TruncateMultiXact(); +#endif TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); } @@ -1557,9 +1712,51 @@ void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffse LWLockRelease(MultiXactGenLock); } +/* + * Determine the last safe MultiXactId to allocate given the currently oldest + * datminmxid (ie, the oldest MultiXactId that might exist in any database + * of our cluster), and the OID of the (or a) database with that value. + */ +void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) +{ + MultiXactId multiVacLimit; + MultiXactId curMulti; + + Assert(MultiXactIdIsValid(oldest_datminmxid)); + + /* + * We'll start trying to force autovacuums when oldest_datminmxid gets + * to be more than autovacuum_freeze_max_age mxids old. + * + * It's a bit ugly to just reuse limits for xids that way, but it doesn't + * seem worth adding separate GUCs for that purpose. + */ + multiVacLimit = oldest_datminmxid + g_instance.attr.attr_storage.autovacuum_freeze_max_age; + if (multiVacLimit < FirstMultiXactId) + multiVacLimit += FirstMultiXactId; + + /* Grab lock for just long enough to set the new limit values */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + t_thrd.shemem_ptr_cxt.MultiXactState->oldestMultiXactId = oldest_datminmxid; + t_thrd.shemem_ptr_cxt.MultiXactState->oldestMultiXactDB = oldest_datoid; + t_thrd.shemem_ptr_cxt.MultiXactState->multiVacLimit = multiVacLimit; + curMulti = t_thrd.shemem_ptr_cxt.MultiXactState->nextMXact; + LWLockRelease(MultiXactGenLock); + + /* + * If past the autovacuum force point, immediately signal an autovac + * request. The reason for this is that autovac only processes one + * database per invocation. Once it's finished cleaning up the oldest + * database, it'll call here, and we'll signal the postmaster to start + * another iteration immediately if there are still any old databases. + */ + if (MultiXactIdPrecedes(multiVacLimit, curMulti) && IsUnderPostmaster && !t_thrd.xlog_cxt.InRecovery) + SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); +} + /* * Ensure the next-to-be-assigned MultiXactId is at least minMulti, - * and similarly nextOffset is at least minMultiOffset + * and similarly nextOffset is at least minMultiOffset. * * This is used when we can determine minimum safe values from an XLog * record (either an on-line checkpoint or an mxact creation log entry). 
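SetMultiXactIdLimit above depends on MultiXactId comparisons staying meaningful across wraparound. The following is a minimal standalone sketch of that arithmetic, assuming MultiXactIdPrecedes() is the usual signed modular comparison on 64-bit mxids (as XID_FMT suggests in this codebase); SketchMultiXactIdPrecedes and all constants in main() are illustrative stand-ins, not part of the patch:

#include <cstdint>
#include <cstdio>

typedef uint64_t MultiXactId;
static const MultiXactId FirstMultiXactId = 1; /* mxids below this are invalid */

/* wraparound-aware "a was assigned before b", via signed modular compare */
static bool SketchMultiXactIdPrecedes(MultiXactId a, MultiXactId b)
{
    return (int64_t)(a - b) < 0;
}

int main()
{
    MultiXactId oldest_datminmxid = 100;  /* illustrative */
    uint64_t freeze_max_age = 400000000;  /* plays autovacuum_freeze_max_age */

    /* same skip-over-invalid-ids trick as SetMultiXactIdLimit */
    MultiXactId multiVacLimit = oldest_datminmxid + freeze_max_age;
    if (multiVacLimit < FirstMultiXactId)
        multiVacLimit += FirstMultiXactId;

    MultiXactId curMulti = 500000000;     /* plays nextMXact */
    if (SketchMultiXactIdPrecedes(multiVacLimit, curMulti))
        printf("past the limit: would signal the autovacuum launcher\n");
    return 0;
}

The signed cast is what makes the limit check keep working after the counter wraps: the difference is interpreted modulo 2^64, so "older" and "newer" remain well defined as long as live mxids span less than half the id space.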
@@ -1580,6 +1777,16 @@ void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOff
     LWLockRelease(MultiXactGenLock);
 }

+/*
+ * Update our oldestMultiXactId value, but only if it's more recent than
+ * what we had.
+ */
+void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
+{
+    if (MultiXactIdPrecedes(t_thrd.shemem_ptr_cxt.MultiXactState->oldestMultiXactId, oldestMulti))
+        SetMultiXactIdLimit(oldestMulti, oldestMultiDB);
+}
+
 /*
  * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
  *
@@ -1650,26 +1857,24 @@ static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
 }

 /*
- * Remove all MultiXactOffset and MultiXactMember segments before the oldest
- * ones still of interest.
+ * GetOldestMultiXactId
  *
- * This is called only during checkpoints. We assume no more than one
- * backend does this at a time.
+ * Return the oldest MultiXactId that's possibly still seen as live by any
+ * running transaction. Older ones might still exist on disk, but they no
+ * longer have any running member transaction.
  *
- * XXX do we have any issues with needing to checkpoint here?
+ * It's not safe to truncate MultiXact SLRU segments on the value returned by
+ * this function; however, it can be used by a full-table vacuum to set the
+ * point at which it will be possible to truncate SLRU for that table.
  */
-static void TruncateMultiXact(void)
+MultiXactId GetOldestMultiXactId(void)
 {
-    MultiXactId nextMXact;
-    MultiXactOffset nextOffset;
     MultiXactId oldestMXact;
-    MultiXactOffset oldestOffset;
-    int cutoffPage;
+    MultiXactId nextMXact;
     int i;

     /*
-     * First, compute where we can safely truncate. Per notes above, this is
-     * the oldest valid value among all the OldestMemberMXactId[] and
+     * This is the oldest valid value among all the OldestMemberMXactId[] and
      * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
      */
     (void)LWLockAcquire(MultiXactGenLock, LW_SHARED);
@@ -1695,6 +1900,93 @@ static void TruncateMultiXact(void)
             oldestMXact = thisoldest;
     }

+    LWLockRelease(MultiXactGenLock);
+
+    return oldestMXact;
+}
+
+#ifndef ENABLE_MULTIPLE_NODES
+typedef struct mxtruncinfo {
+    int earliestExistingPage;
+} mxtruncinfo;
+
+/*
+ * Decide whether a MultiXactOffset page number is "older" for truncation
+ * purposes. Analogous to CLOGPagePrecedes().
+ *
+ * Offsetting the values is optional, because MultiXactIdPrecedes() has
+ * translational symmetry.
+ */
+static bool MultiXactOffsetPagePrecedes(int page1, int page2)
+{
+    MultiXactId multi1;
+    MultiXactId multi2;
+
+    multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
+    multi1 += FirstMultiXactId + 1;
+    multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
+    multi2 += FirstMultiXactId + 1;
+
+    return (MultiXactIdPrecedes(multi1, multi2) &&
+        MultiXactIdPrecedes(multi1, multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
+}
+
+/*
+ * SlruScanDirectory callback
+ *      This callback determines the earliest existing page number.
+ */
+static bool SlruScanDirCbFindEarliest(SlruCtl ctl, const char* filename, int64 segpage, const void* data)
+{
+    mxtruncinfo *trunc = (mxtruncinfo *)data;
+
+    if (trunc->earliestExistingPage == -1 || MultiXactOffsetPagePrecedes(segpage, trunc->earliestExistingPage)) {
+        trunc->earliestExistingPage = segpage;
+    }
+
+    return false; /* keep going */
+}
+#endif
+
+/*
+ * Remove all MultiXactOffset and MultiXactMember segments before the oldest
+ * ones still of interest.
+ * + * This is called by vacuum after it has successfully advanced a database's + * datminmxid value; the cutoff value we're passed is the minimum of all + * databases' datminmxid values. + */ +void TruncateMultiXact(MultiXactId oldestMXact) +{ + MultiXactOffset oldestOffset; + +#ifndef ENABLE_MULTIPLE_NODES + mxtruncinfo trunc; + MultiXactId earliest; + /* + * Note we can't just plow ahead with the truncation; it's possible that + * there are no segments to truncate, which is a problem because we are + * going to attempt to read the offsets page to determine where to truncate + * the members SLRU. So we first scan the directory to determine the + * earliest offsets page number that we can read without error. + */ + trunc.earliestExistingPage = -1; + SlruScanDirectory(t_thrd.shemem_ptr_cxt.MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); + earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; + + /* nothing to do */ + if (MultiXactIdPrecedes(oldestMXact, earliest)) + return; +#else + MultiXactOffset nextOffset; + MultiXactId nextMXact; + oldestMXact = GetOldestMultiXactId(); + + (void)LWLockAcquire(MultiXactGenLock, LW_SHARED); + + nextMXact = t_thrd.shemem_ptr_cxt.MultiXactState->nextMXact; + if (nextMXact < FirstMultiXactId) + nextMXact = FirstMultiXactId; + /* Save the current nextOffset too */ nextOffset = t_thrd.shemem_ptr_cxt.MultiXactState->nextOffset; @@ -1716,7 +2008,14 @@ static void TruncateMultiXact(void) */ if (oldestMXact == nextMXact) oldestOffset = nextOffset; - else { + else +#endif + /* + * First, compute the safe truncation point for MultiXactMember. + * This is the starting offset of the multixact we were passed + * as MultiXactOffset cutoff. + */ + { int64 pageno; int slotno; int entryno; @@ -1734,20 +2033,13 @@ static void TruncateMultiXact(void) LWLockRelease(MultiXactOffsetControlLock); } - /* - * The cutoff point is the start of the segment containing oldestMXact. We - * pass the *page* containing oldestMXact to SimpleLruTruncate. - */ - cutoffPage = (int)MultiXactIdToOffsetPage(oldestMXact); + /* truncate MultiXactOffset */ + SimpleLruTruncate(t_thrd.shemem_ptr_cxt.MultiXactOffsetCtl, MultiXactIdToOffsetPage(oldestMXact), + NUM_SLRU_DEFAULT_PARTITION); - SimpleLruTruncate(t_thrd.shemem_ptr_cxt.MultiXactOffsetCtl, cutoffPage, NUM_SLRU_DEFAULT_PARTITION); - - /* - * Also truncate MultiXactMember at the previously determined offset. - */ - cutoffPage = (int)MXOffsetToMemberPage(oldestOffset); - - SimpleLruTruncate(t_thrd.shemem_ptr_cxt.MultiXactMemberCtl, cutoffPage, NUM_SLRU_DEFAULT_PARTITION); + /* truncate MultiXactMembers and we're done */ + SimpleLruTruncate(t_thrd.shemem_ptr_cxt.MultiXactMemberCtl, MXOffsetToMemberPage(oldestOffset), + NUM_SLRU_DEFAULT_PARTITION); /* * Set the last known truncation point. 
We don't need a lock for this @@ -1926,8 +2218,9 @@ XLogRecParseState *multixact_xlog_updateoid_parse_to_block(XLogReaderState *reco nextoffset = xlrec->moff + xlrec->nxids; max_xid = XLogRecGetXid(record); for (int32 i = 0; i < xlrec->nxids; i++) { - if (TransactionIdPrecedes(max_xid, xlrec->xids[i])) - max_xid = xlrec->xids[i]; + TransactionId memberXid = GET_MEMBER_XID_FROM_SLRU_XID(xlrec->xids[i]); + if (TransactionIdPrecedes(max_xid, memberXid)) + max_xid = memberXid; } XLogRecSetMultiXactUpdatOidState(&(blockstate->blockparse.extra_rec.blockmultiupdate), nextoffset, nextmulti, max_xid); @@ -2012,12 +2305,12 @@ void multixact_redo(XLogReaderState *record) LWLockRelease(MultiXactMemberControlLock); } else if (mask_info == XLOG_MULTIXACT_CREATE_ID) { xl_multixact_create *xlrec = (xl_multixact_create *)XLogRecGetData(record); - TransactionId *xids = xlrec->xids; + TransactionId *xidsWithStatus = xlrec->xids; TransactionId max_xid; int i; /* Store the data back into the SLRU files */ - RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nxids, xids); + RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nxids, xidsWithStatus); /* Make sure nextMXact/nextOffset are beyond what this record has */ MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nxids); @@ -2029,8 +2322,9 @@ void multixact_redo(XLogReaderState *record) */ max_xid = XLogRecGetXid(record); for (i = 0; i < xlrec->nxids; i++) { - if (TransactionIdPrecedes(max_xid, xids[i])) - max_xid = xids[i]; + TransactionId memberXid = GET_MEMBER_XID_FROM_SLRU_XID(xidsWithStatus[i]); + if (TransactionIdPrecedes(max_xid, memberXid)) + max_xid = memberXid; } /* diff --git a/src/gausskernel/storage/access/transam/xlog.cpp b/src/gausskernel/storage/access/transam/xlog.cpp index ba70a0f83..5cfa31410 100755 --- a/src/gausskernel/storage/access/transam/xlog.cpp +++ b/src/gausskernel/storage/access/transam/xlog.cpp @@ -7175,6 +7175,7 @@ void BootStrapXLOG(void) t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(FirstMultiXactId, TemplateDbOid); /* Set up the XLOG page header */ page->xlp_magic = XLOG_PAGE_MAGIC; @@ -9688,6 +9689,7 @@ void StartupXLOG(void) t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); + SetMultiXactIdLimit(FirstMultiXactId, TemplateDbOid); t_thrd.shemem_ptr_cxt.XLogCtl->ckptXid = checkPoint.oldestXid; t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false; diff --git a/src/gausskernel/storage/access/ustore/knl_uheap.cpp b/src/gausskernel/storage/access/ustore/knl_uheap.cpp index 70a0c2a90..600d1672a 100644 --- a/src/gausskernel/storage/access/ustore/knl_uheap.cpp +++ b/src/gausskernel/storage/access/ustore/knl_uheap.cpp @@ -1501,12 +1501,12 @@ static bool UHeapWait(Relation relation, Buffer buffer, UHeapTuple utuple, LockT // wait multixid if (nowait) { - if (!ConditionalMultiXactIdWait((MultiXactId)xwait)) { + if (!ConditionalMultiXactIdWait((MultiXactId)xwait, GetMXactStatusForLock(mode, false), NULL)) { ereport(ERROR, (errcode(ERRCODE_LOCK_NOT_AVAILABLE), errmsg("could not obtain lock on row in relation \"%s\"", RelationGetRelationName(relation)))); } } else { - MultiXactIdWait(xwait, true); + MultiXactIdWait(xwait, GetMXactStatusForLock(mode, false), NULL); } // 
reacquire lock @@ -1667,7 +1667,7 @@ static void UHeapExecuteLockTuple(Relation relation, Buffer buffer, UHeapTuple u if (SINGLE_LOCKER_XID_IS_SHR_LOCKED(oldinfomask) && TransactionIdIsInProgress(xidOnTup)) { // create a multixid MultiXactIdSetOldestMember(); - xid = MultiXactIdCreate(xidOnTup, curxid); + xid = MultiXactIdCreate(xidOnTup, MultiXactStatusForShare, curxid, MultiXactStatusForShare); multi = true; utuple->disk_tuple->flag |= UHEAP_MULTI_LOCKERS; elog(DEBUG5, "locker %ld + locker %ld = multi %ld", curxid, xidOnTup, xid); @@ -1677,7 +1677,7 @@ static void UHeapExecuteLockTuple(Relation relation, Buffer buffer, UHeapTuple u * expand multixid to contain the current transaction id. */ MultiXactIdSetOldestMember(); - xid = MultiXactIdExpand((MultiXactId)xidOnTup, curxid); + xid = MultiXactIdExpand((MultiXactId)xidOnTup, curxid, MultiXactStatusForShare); multi = true; utuple->disk_tuple->flag |= UHEAP_MULTI_LOCKERS; elog(DEBUG5, "locker %ld + multi %ld = multi %ld", curxid, xidOnTup, xid); diff --git a/src/gausskernel/storage/access/ustore/knl_uvacuumlazy.cpp b/src/gausskernel/storage/access/ustore/knl_uvacuumlazy.cpp index f48441260..6a0c1e833 100644 --- a/src/gausskernel/storage/access/ustore/knl_uvacuumlazy.cpp +++ b/src/gausskernel/storage/access/ustore/knl_uvacuumlazy.cpp @@ -640,7 +640,8 @@ void LazyVacuumUHeapRel(Relation onerel, VacuumStmt *vacstmt, BufferAccessStrate if (RelationIsPartition(onerel)) { Assert(vacstmt->onepart != NULL); - vac_update_partstats(vacstmt->onepart, newRelPages, newRelTuples, newRelAllvisible, InvalidTransactionId); + vac_update_partstats(vacstmt->onepart, newRelPages, newRelTuples, newRelAllvisible, InvalidTransactionId, + InvalidMultiXactId); /* * when vacuum partition, do not change the relhasindex field in pg_class * for partitioned table, as some partition may be altered as "all local @@ -649,11 +650,11 @@ void LazyVacuumUHeapRel(Relation onerel, VacuumStmt *vacstmt, BufferAccessStrate * misdguge as hot update even if update indexes columns. */ vac_update_pgclass_partitioned_table(vacstmt->onepartrel, vacstmt->onepartrel->rd_rel->relhasindex, - InvalidTransactionId); + InvalidTransactionId, InvalidMultiXactId); } else { Relation classRel = heap_open(RelationRelationId, RowExclusiveLock); vac_update_relstats(onerel, classRel, newRelPages, newRelTuples, newRelAllvisible, nindexes > 0, - InvalidTransactionId); + InvalidTransactionId, InvalidMultiXactId); heap_close(classRel, RowExclusiveLock); } diff --git a/src/gausskernel/storage/cstore/cstore_rewrite.cpp b/src/gausskernel/storage/cstore/cstore_rewrite.cpp index b9d3975ce..c0269651b 100644 --- a/src/gausskernel/storage/cstore/cstore_rewrite.cpp +++ b/src/gausskernel/storage/cstore/cstore_rewrite.cpp @@ -33,6 +33,7 @@ #include "catalog/objectaccess.h" #include "access/cstore_insert.h" #include "access/tableam.h" +#include "access/multixact.h" #include "catalog/dependency.h" #include "utils/lsyscache.h" #include "catalog/index.h" @@ -593,8 +594,8 @@ void CStoreRewriter::RewriteColsData() index_close(oldCudescIndex, NoLock); // finish rewriting cudesc relation. 
- finish_heap_swap( - m_OldHeapRel->rd_rel->relcudescrelid, m_NewCuDescHeap, false, swapToastByContent, false, m_NewCudescFrozenXid); + finish_heap_swap(m_OldHeapRel->rd_rel->relcudescrelid, m_NewCuDescHeap, false, + swapToastByContent, false, m_NewCudescFrozenXid, FirstMultiXactId); } void CStoreRewriter::EndRewriteCols() @@ -894,7 +895,7 @@ void CStoreRewriter::FetchCudescFrozenXid(Relation oldCudescHeap) // Assert(oldCudescHeap->rd_rel->relisshared == false); TransactionId OldestXmin = InvalidTransactionId; - vacuum_set_xid_limits(oldCudescHeap, -1, -1, &OldestXmin, &m_NewCudescFrozenXid, NULL); + vacuum_set_xid_limits(oldCudescHeap, -1, -1, &OldestXmin, &m_NewCudescFrozenXid, NULL, NULL); // FreezeXid will become the table's new relfrozenxid, and that mustn't go // backwards, so take the max. @@ -2117,7 +2118,7 @@ void ATExecCStoreMergePartition(Relation partTableRel, AlterTableCmd* cmd) getPartitionName(destPartOid, false)))); } - finishPartitionHeapSwap(destPartOid, tempTableOid, true, u_sess->utils_cxt.RecentXmin); + finishPartitionHeapSwap(destPartOid, tempTableOid, true, u_sess->utils_cxt.RecentXmin, InvalidMultiXactId); partitionClose(partTableRel, destPart, NoLock); #ifndef ENABLE_MULTIPLE_NODES diff --git a/src/gausskernel/storage/lmgr/predicate.cpp b/src/gausskernel/storage/lmgr/predicate.cpp index 35643dd81..c3e9b6a93 100644 --- a/src/gausskernel/storage/lmgr/predicate.cpp +++ b/src/gausskernel/storage/lmgr/predicate.cpp @@ -3456,10 +3456,10 @@ void CheckForSerializableConflictOut(bool visible, Relation relation, void* stup case HEAPTUPLE_RECENTLY_DEAD: if (!visible) return; - xid = HeapTupleGetRawXmax(tuple); + xid = HeapTupleGetUpdateXid(tuple); break; case HEAPTUPLE_DELETE_IN_PROGRESS: - xid = HeapTupleGetRawXmax(tuple); + xid = HeapTupleGetUpdateXid(tuple); break; case HEAPTUPLE_INSERT_IN_PROGRESS: xid = HeapTupleGetRawXmin(tuple); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index fd4d46d05..b42f6b4fd 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -18,6 +18,7 @@ #include "access/sdir.h" #include "access/skey.h" #include "access/xlogrecord.h" +#include "access/multixact.h" #include "executor/tuptable.h" #include "nodes/primnodes.h" #include "storage/lock/lock.h" @@ -139,6 +140,62 @@ typedef enum { #define MaxLockTupleMode LockTupleExclusive +static const struct { + LOCKMODE hwlock; + MultiXactStatus lockstatus; + MultiXactStatus updstatus; +} TupleLockExtraInfo[MaxLockTupleMode + 1] = { + { + /* LockTupleKeyShare */ + AccessShareLock, + MultiXactStatusForKeyShare, + (MultiXactStatus)-1 /* KeyShare does not allow updating tuples */ + }, + { + RowShareLock, /* LockTupleShared */ + MultiXactStatusForShare, + (MultiXactStatus)-1 + }, + { + ExclusiveLock, /* LockTupleNoKeyExclusive */ + MultiXactStatusForNoKeyUpdate, + MultiXactStatusNoKeyUpdate + }, + { + AccessExclusiveLock, /* LockTupleExclusive */ + MultiXactStatusForUpdate, + MultiXactStatusUpdate + } +}; + +/* + * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. + * This is more readable than having every caller translate it to lock.h's + * LOCKMODE. 
+ */ +#define LOCK_TUPLE_TUP_LOCK(rel, tup, mode) LockTuple((rel), (tup), TupleLockExtraInfo[mode].hwlock, true) +#define UNLOCK_TUPLE_TUP_LOCK(rel, tup, mode) UnlockTuple((rel), (tup), TupleLockExtraInfo[mode].hwlock) +#define ConditionalLockTupleTuplock(_rel, _tup, _mode) \ + ConditionalLockTuple((_rel), (_tup), TupleLockExtraInfo[_mode].hwlock) + +/* + * This table maps tuple lock strength values for each particular + * MultiXactStatus value. + */ +static const LockTupleMode MULTIXACT_STATUS_LOCK[MultiXactStatusUpdate + 1] = { + LockTupleShared, /* ForShare */ + LockTupleKeyShare, /* ForKeyShare */ + LockTupleNoKeyExclusive, /* ForNoKeyUpdate */ + LockTupleExclusive, /* ForUpdate */ + LockTupleNoKeyExclusive, /* NoKeyUpdate */ + LockTupleExclusive /* Update */ +}; + +/* Get the LockTupleMode for a given MultiXactStatus */ +#define TUPLOCK_FROM_MXSTATUS(status) (MULTIXACT_STATUS_LOCK[(status)]) +/* Get the LOCKMODE for a given MultiXactStatus */ +#define LOCKMODE_FROM_MXSTATUS(status) (TupleLockExtraInfo[TUPLOCK_FROM_MXSTATUS((status))].hwlock) + /* the last arguments info for heap_multi_insert() */ typedef struct { /* compression info: dictionary buffer and its size */ @@ -266,17 +323,22 @@ extern int heap_multi_insert(Relation relation, Relation parent, HeapTuple* tupl extern TM_Result heap_delete(Relation relation, ItemPointer tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool allow_delete_self = false); extern TM_Result heap_update(Relation relation, Relation parentRelation, ItemPointer otid, HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool allow_delete_self = false); + CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, + bool allow_delete_self = false); extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer* buffer, - CommandId cid, LockTupleMode mode, bool nowait, TM_FailureData *tmfd, bool allow_lock_self = false); + CommandId cid, LockTupleMode mode, bool nowait, bool follow_updates, TM_FailureData *tmfd, + bool allow_lock_self = false); +void FixInfomaskFromInfobits(uint8 infobits, uint16 *infomask, uint16 *infomask2); extern void heap_inplace_update(Relation relation, HeapTuple tuple); -extern bool heap_freeze_tuple(HeapTuple tuple, TransactionId cutoff_xid); -extern bool heap_tuple_needs_freeze(HeapTuple tuple, TransactionId cutoff_xid, Buffer buf); +extern bool heap_freeze_tuple(HeapTuple tuple, TransactionId cutoff_xid, TransactionId cutoff_multi, + bool *changedMultiXid = NULL); +extern bool heap_tuple_needs_freeze(HeapTuple tuple, TransactionId cutoff_xid, MultiXactId cutoff_multi, Buffer buf); extern Oid simple_heap_insert(Relation relation, HeapTuple tup); extern void simple_heap_delete(Relation relation, ItemPointer tid, int options = 0, bool allow_update_self = false); extern void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup); +extern MultiXactStatus GetMXactStatusForLock(LockTupleMode mode, bool isUpdate); extern void heap_markpos(TableScanDesc scan); extern void heap_restrpos(TableScanDesc scan); @@ -301,7 +363,7 @@ extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, OffsetNumber* red OffsetNumber* nowdead, int ndead, OffsetNumber* nowunused, int nunused, TransactionId latestRemovedXid, bool repair_fragmentation); extern XLogRecPtr log_heap_freeze( - Relation reln, Buffer buffer, TransactionId cutoff_xid, OffsetNumber* offsets, int offcnt); + Relation reln, Buffer buffer, TransactionId cutoff_xid, 
MultiXactId cutoff_multi, OffsetNumber* offsets, int offcnt);
 extern XLogRecPtr log_heap_visible(RelFileNode rnode, BlockNumber block, Buffer heap_buffer, Buffer vm_buffer,
     TransactionId cutoff_xid, bool free_dict);
 extern XLogRecPtr log_cu_bcm(const RelFileNode* rnode, int col, uint64 block, int status, int count);
@@ -361,6 +423,7 @@ extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, Buff
 extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer,
     bool isAnalyzing = false);
 extern bool HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin);
 extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid);
+extern bool HeapTupleIsOnlyLocked(HeapTuple tuple);

 /*
  * To avoid leaking to much knowledge about reorderbuffer implementation
diff --git a/src/include/access/htup.h b/src/include/access/htup.h
index f9c76ea6a..6927a5174 100644
--- a/src/include/access/htup.h
+++ b/src/include/access/htup.h
@@ -194,8 +194,9 @@ typedef HeapTupleHeaderData* HeapTupleHeader;
 #define HEAP_COMBOCID 0x0020 /* t_cid is a combo cid */
 #define HEAP_XMAX_EXCL_LOCK 0x0040 /* xmax is exclusive locker */
 #define HEAP_XMAX_SHARED_LOCK 0x0080 /* xmax is shared locker */
-/* if either LOCK bit is set, xmax hasn't deleted the tuple, only locked it */
-#define HEAP_IS_LOCKED (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_SHARED_LOCK)
+/* xmax is a key-shared locker */
+#define HEAP_XMAX_KEYSHR_LOCK (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_SHARED_LOCK)
+#define HEAP_LOCK_MASK (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_SHARED_LOCK | HEAP_XMAX_KEYSHR_LOCK)
 #define HEAP_XMIN_COMMITTED 0x0100 /* t_xmin committed */
 #define HEAP_XMIN_INVALID 0x0200   /* t_xmin invalid/aborted */
 #define HEAP_XMIN_FROZEN (HEAP_XMIN_INVALID | HEAP_XMIN_COMMITTED)
@@ -219,12 +220,13 @@ typedef HeapTupleHeaderData* HeapTupleHeader;
  * information stored in t_infomask2:
  */
 #define HEAP_NATTS_MASK 0x07FF /* 11 bits for number of attributes */
-/* bits 0x1800 are available */
+#define HEAP_XMAX_LOCK_ONLY 0x0800 /* xmax, if valid, is only a locker */
+#define HEAP_KEYS_UPDATED 0x1000   /* tuple was updated and key cols modified, or tuple deleted */
 #define HEAP_HAS_REDIS_COLUMNS 0x2000 /* tuple has hidden columns added by redis */
 #define HEAP_HOT_UPDATED 0x4000       /* tuple was HOT-updated */
 #define HEAP_ONLY_TUPLE 0x8000        /* this is heap-only tuple */

-#define HEAP2_XACT_MASK 0xC000 /* visibility-related bits */
+#define HEAP2_XACT_MASK 0xD800 /* visibility-related bits */

 /*
  * HEAP_TUPLE_HAS_MATCH is a temporary flag used during hash joins. It is
@@ -234,6 +236,28 @@ typedef HeapTupleHeaderData* HeapTupleHeader;
  */
 #define HEAP_TUPLE_HAS_MATCH HEAP_ONLY_TUPLE /* tuple has a join match */

+/*
+ * A tuple is only locked (i.e. not updated by its Xmax) if the
+ * HEAP_XMAX_LOCK_ONLY bit is set.
+ *
+ * See also HeapTupleIsOnlyLocked, which also checks for a possible
+ * aborted updater transaction.
+ */
+#define HEAP_XMAX_IS_LOCKED_ONLY(infomask, infomask2) \
+    (((infomask2) & HEAP_XMAX_LOCK_ONLY) || \
+     ((infomask) & HEAP_XMAX_SHARED_LOCK) || \
+     (((infomask) & (HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK)) == HEAP_XMAX_EXCL_LOCK))
+
+/*
+ * Use these to test whether a particular lock is applied to a tuple
+ */
+#define HEAP_XMAX_IS_SHR_LOCKED(infomask) (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_SHARED_LOCK)
+#define HEAP_XMAX_IS_EXCL_LOCKED(infomask) (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_EXCL_LOCK)
+#define HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) (((infomask) & HEAP_LOCK_MASK) == HEAP_XMAX_KEYSHR_LOCK)
+
+/* turn these all off when Xmax is to change */
+#define HEAP_XMAX_BITS (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID | HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK)
+
 /*
  * HeapTupleHeader accessor macros
  *
@@ -301,6 +325,27 @@ typedef HeapTupleHeaderData* HeapTupleHeader;
     ((tup)->t_data->t_choice.t_heap.t_xmax) \
 ))

+/*
+ * HeapTupleGetRawXmax gets you the raw Xmax field. To find out the Xid
+ * that updated a tuple, you might need to resolve the MultiXactId if certain
+ * bits are set. HeapTupleGetUpdateXid checks those bits and takes care
+ * to resolve the MultiXactId if necessary. This might involve multixact I/O,
+ * so it should only be used if absolutely necessary.
+ */
+#define HeapTupleGetUpdateXid(tup) \
+    ((!((tup)->t_data->t_infomask & HEAP_XMAX_INVALID) && \
+      ((tup)->t_data->t_infomask & HEAP_XMAX_IS_MULTI) && \
+      !((tup)->t_data->t_infomask2 & HEAP_XMAX_LOCK_ONLY)) ? \
+        HeapTupleMultiXactGetUpdateXid(tup) : \
+        HeapTupleGetRawXmax(tup))
+
+#define HeapTupleHeaderGetUpdateXid(page, tup) \
+    ((!((tup)->t_infomask & HEAP_XMAX_INVALID) && \
+      ((tup)->t_infomask & HEAP_XMAX_IS_MULTI) && \
+      !((tup)->t_infomask2 & HEAP_XMAX_LOCK_ONLY)) ? \
+        HeapTupleHeaderMultiXactGetUpdateXid(page, tup) : \
+        HeapTupleHeaderGetRawXmax(page, tup))
+
 #define HeapTupleHeaderGetXmax(page, tup) \
     (ShortTransactionIdToNormal(((tup)->t_infomask & HEAP_XMAX_IS_MULTI) \
         ? (PageIs8BXidHeapVersion(page) ? ((HeapPageHeader)(page))->pd_multi_base : 0) \
@@ -319,10 +364,10 @@ typedef HeapTupleHeaderData* HeapTupleHeader;

 #define HeapTupleSetXmax(tup, xid) \
     ((tup)->t_data->t_choice.t_heap.t_xmax = NormalTransactionIdToShort( \
-        ((tup)->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ? tup->t_multi_base : tup->t_xid_base, (xid)))
+        ((tup)->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ? (tup)->t_multi_base : (tup)->t_xid_base, (xid)))

 #define HeapTupleSetXmin(tup, xid) \
-    ((tup)->t_data->t_choice.t_heap.t_xmin = NormalTransactionIdToShort(tup->t_xid_base, (xid)))
+    ((tup)->t_data->t_choice.t_heap.t_xmin = NormalTransactionIdToShort((tup)->t_xid_base, (xid)))

 /*
  * HeapTupleHeaderGetRawCommandId will give you what's in the header whether
@@ -671,6 +716,10 @@ inline HeapTuple heaptup_alloc(Size size)
  * or MULTI_INSERT, we can (and we do) restore entire page in redo
  */
 #define XLOG_HEAP_INIT_PAGE 0x80
+
+/* Upgrade support for enhanced tuple lock mode */
+#define XLOG_TUPLE_LOCK_UPGRADE_FLAG 0x01
+
 /*
  * We ran out of opcodes, so heapam.c now has a second RmgrId.
These opcodes * are associated with RM_HEAP2_ID, but are not logically different from @@ -744,9 +793,12 @@ inline HeapTuple heaptup_alloc(Size size) typedef struct xl_heap_delete { OffsetNumber offnum; /* deleted tuple's offset */ uint8 flags; + TransactionId xmax; /* xmax of the deleted tuple */ + uint8 infobits_set; /* infomask bits */ } xl_heap_delete; -#define SizeOfHeapDelete (offsetof(xl_heap_delete, flags) + sizeof(bool)) +#define SizeOfOldHeapDelete (offsetof(xl_heap_delete, flags) + sizeof(uint8)) +#define SizeOfHeapDelete (offsetof(xl_heap_delete, infobits_set) + sizeof(uint8)) /* * We don't store the whole fixed part (HeapTupleHeaderData) of an inserted @@ -829,9 +881,13 @@ typedef struct xl_heap_update { OffsetNumber old_offnum; /* old tuple's offset */ OffsetNumber new_offnum; /* new tuple's offset */ uint8 flags; /* NEW TUPLE xl_heap_header AND TUPLE DATA FOLLOWS AT END OF STRUCT */ + TransactionId old_xmax; /* xmax of the old tuple */ + TransactionId new_xmax; /* xmax of the new tuple */ + uint8 old_infobits_set; /* infomask bits to set on old tuple */ } xl_heap_update; -#define SizeOfHeapUpdate (offsetof(xl_heap_update, flags) + sizeof(uint8)) +#define SizeOfOldHeapUpdate (offsetof(xl_heap_update, flags) + sizeof(uint8)) +#define SizeOfHeapUpdate (offsetof(xl_heap_update, old_infobits_set) + sizeof(uint8)) /* * This is what we need to know about vacuum page cleanup/redirect * @@ -879,15 +935,25 @@ typedef struct xl_heap_logical_newpage { #define SizeOfHeapLogicalNewPage (offsetof(xl_heap_logical_newpage, blockSize) + sizeof(int32)) +/* flags for infobits_set */ +#define XLHL_XMAX_IS_MULTI 0x01 +#define XLHL_XMAX_LOCK_ONLY 0x02 +#define XLHL_XMAX_EXCL_LOCK 0x04 +#define XLHL_XMAX_KEYSHR_LOCK 0x08 +#define XLHL_KEYS_UPDATED 0x10 + /* This is what we need to know about lock */ typedef struct xl_heap_lock { TransactionId locking_xid; /* might be a MultiXactId not xid */ OffsetNumber offnum; /* locked tuple's offset on page */ bool xid_is_mxact; /* is it? */ bool shared_lock; /* shared or exclusive row lock? 
 */
+    uint8 infobits_set; /* infomask and infomask2 bits to set */
+    bool lock_updated;  /* lock an updated version of a row */
 } xl_heap_lock;

-#define SizeOfHeapLock (offsetof(xl_heap_lock, shared_lock) + sizeof(bool))
+#define SizeOfOldHeapLock (offsetof(xl_heap_lock, shared_lock) + sizeof(bool))
+#define SizeOfHeapLock (offsetof(xl_heap_lock, lock_updated) + sizeof(bool))

 /* This is what we need to know about in-place update */
 typedef struct xl_heap_inplace {
@@ -905,10 +971,12 @@ typedef struct xl_heap_inplace {
  */
 typedef struct xl_heap_freeze {
     TransactionId cutoff_xid;
+    MultiXactId cutoff_multi;
     /* TUPLE OFFSET NUMBERS FOLLOW AT THE END */
 } xl_heap_freeze;

-#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
+#define SizeOfOldHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
+#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_multi) + sizeof(MultiXactId))

 typedef struct xl_heap_freeze_tuple {
     TransactionId xmax;
@@ -988,6 +1056,8 @@ extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup, Page page);
 extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup, Page page);
 extern bool CheckStreamCombocid(HeapTupleHeader tup, CommandId current_cid, Page page);
 extern void HeapTupleHeaderAdjustCmax(HeapTupleHeader tup, CommandId* cmax, bool* iscombo, Buffer buffer);
+extern TransactionId HeapTupleMultiXactGetUpdateXid(HeapTuple tuple);
+extern TransactionId HeapTupleHeaderMultiXactGetUpdateXid(Page page, HeapTupleHeader tuple);

 /* ----------------
  * fastgetattr && fastgetattr_with_dict
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 2fb3c3e8a..80cb009b5 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -33,8 +33,8 @@ * next two are used for update and delete modes.
  */
 typedef enum {
-    MultiXactStatusForKeyShare = 0x00,
-    MultiXactStatusForShare = 0x01,
+    MultiXactStatusForShare = 0x00, /* set FOR_SHARE = 0 here for compatibility */
+    MultiXactStatusForKeyShare = 0x01,
     MultiXactStatusForNoKeyUpdate = 0x02,
     MultiXactStatusForUpdate = 0x03,

     /* an update that doesn't touch "key" columns */
@@ -54,6 +54,17 @@ typedef struct MultiXactMember {
     MultiXactStatus status;
 } MultiXactMember;

+/*
+ * In htup.h, MaxTransactionId is defined so that the top four bits of a
+ * TransactionId are never used. We store the member xid in the low 60 bits
+ * of the on-disk member slot and the member status in the top three bits;
+ * bit 60 is left unused.
+ */
+#define MULTIXACT_MEMBER_XID_MASK UINT64CONST((UINT64CONST(1) << 60) - 1)
+#define GET_MEMBER_XID_FROM_SLRU_XID(xid) ((xid) & MULTIXACT_MEMBER_XID_MASK)
+#define GET_MEMBER_STATUS_FROM_SLRU_XID(xid) (MultiXactStatus((xid) >> 61))
+#define GET_SLRU_XID_FROM_MULTIXACT_MEMBER(member) \
+    (((TransactionId)((member)->status) << 61) | (((member)->xid) & MULTIXACT_MEMBER_XID_MASK))
+
+
 /* ----------------
  * multixact-related XLOG entries
  * ----------------
@@ -71,20 +82,22 @@ typedef struct xl_multixact_create {
     MultiXactId mid;      /* new MultiXact's ID */
     MultiXactOffset moff; /* its starting offset in members file */
     int32 nxids;          /* number of member XIDs */
-    TransactionId xids[FLEXIBLE_ARRAY_MEMBER]; /* VARIABLE LENGTH ARRAY */
+    TransactionId xids[FLEXIBLE_ARRAY_MEMBER]; /* low 60 bits record member xid, high 3 bits record member status */
 } xl_multixact_create;

 #define MinSizeOfMultiXactCreate offsetof(xl_multixact_create, xids)

-extern MultiXactId MultiXactIdCreate(TransactionId xid1, TransactionId xid2);
-extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid);
+extern MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
+    TransactionId xid2, MultiXactStatus status2);
+extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status);
 extern bool MultiXactIdIsRunning(MultiXactId multi);
 extern bool MultiXactIdIsCurrent(MultiXactId multi);
 extern MultiXactId ReadNextMultiXactId(void);
-extern void MultiXactIdWait(MultiXactId multi, bool allow_con_update = false);
-extern bool ConditionalMultiXactIdWait(MultiXactId multi);
+extern bool DoMultiXactIdWait(MultiXactId multi, MultiXactStatus status, int *remaining, bool nowait);
+extern void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, int *remaining);
+extern bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, int *remaining);
 extern void MultiXactIdSetOldestMember(void);
-extern int GetMultiXactIdMembers(MultiXactId multi, TransactionId** xids);
+extern int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember** members);
 extern void AtEOXact_MultiXact(void);
 extern void AtPrepare_MultiXact(void);
@@ -95,10 +108,14 @@ extern void MultiXactShmemInit(void);
 extern void BootStrapMultiXact(void);
 extern void StartupMultiXact(void);
 extern void ShutdownMultiXact(void);
+extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid);
 extern void MultiXactGetCheckptMulti(bool is_shutdown, MultiXactId* nextMulti, MultiXactOffset* nextMultiOffset);
 extern void CheckPointMultiXact(void);
+extern MultiXactId GetOldestMultiXactId(void);
+extern void TruncateMultiXact(MultiXactId cutoff_multi = InvalidMultiXactId);
 extern void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset);
 extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset);
+extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB);

 extern void multixact_twophase_recover(TransactionId xid, uint16 info, void* recdata, uint32 len);
 extern void multixact_twophase_postcommit(TransactionId xid, uint16 info, void* recdata, uint32 len);
diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h
index bedd8ea72..c1b31e00a 100644
--- a/src/include/access/reloptions.h
+++ b/src/include/access/reloptions.h
@@ -260,6 +260,7 @@ extern int8 heaprel_get_compression_from_modes(int16 modes);
 extern bool get_crossbucket_option(List **options_ptr, bool stmtoptgpi = false, char
*accessmethod = NULL, int *crossbucketopt = NULL); extern bool is_contain_crossbucket(List *defList); +extern bool is_cstore_option(char relkind, Datum reloptions); extern void CheckGetServerIpAndPort(const char* Address, List** AddrList, bool IsCheck, int real_addr_max); extern void CheckFoldernameOrFilenamesOrCfgPtah(const char* OptStr, char* OptType); diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 4dcc134cb..2c709ee74 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -452,7 +452,8 @@ typedef struct TableAmRoutine { TM_Result (*tuple_update)(Relation relation, Relation parentRelation, ItemPointer otid, Tuple newtup, CommandId cid, Snapshot crosscheck, Snapshot snapshot, bool wait, TupleTableSlot **oldslot, TM_FailureData *tmfd, - bool *update_indexes, Bitmapset **modifiedIdxAttrs, bool allow_update_self, bool allow_inplace_update); + LockTupleMode *mode, bool *update_indexes, Bitmapset **modifiedIdxAttrs, bool allow_update_self, + bool allow_inplace_update); TM_Result (*tuple_lock)(Relation relation, Tuple tuple, Buffer *buffer, CommandId cid, LockTupleMode mode, bool nowait, TM_FailureData *tmfd, bool allow_lock_self, bool follow_updates, bool eval, Snapshot snapshot, @@ -583,7 +584,7 @@ extern TM_Result tableam_tuple_delete(Relation relation, ItemPointer tid, Comman extern TM_Result tableam_tuple_update(Relation relation, Relation parentRelation, ItemPointer otid, Tuple newtup, CommandId cid, Snapshot crosscheck, Snapshot snapshot, bool wait, TupleTableSlot **oldslot, TM_FailureData *tmfd, bool *update_indexes, Bitmapset **modifiedIdxAttrs, bool allow_update_self = false, - bool allow_inplace_update = true); + bool allow_inplace_update = true, LockTupleMode *lockmode = NULL); extern TM_Result tableam_tuple_lock(Relation relation, Tuple tuple, Buffer *buffer, CommandId cid, LockTupleMode mode, bool nowait, TM_FailureData *tmfd, bool allow_lock_self, bool follow_updates, bool eval, Snapshot snapshot, ItemPointer tid, bool isSelectForUpdate, bool isUpsert = false, diff --git a/src/include/access/ustore/knl_uheap.h b/src/include/access/ustore/knl_uheap.h index 886265511..ea131e7d1 100644 --- a/src/include/access/ustore/knl_uheap.h +++ b/src/include/access/ustore/knl_uheap.h @@ -36,9 +36,6 @@ #define NUM_BLOCKS_FOR_NON_INPLACE_UPDATES 200 #define MIN_SAVING_LEN 3 -#define ConditionalLockTupleTuplock(_rel, _tup, _mode) \ - ConditionalLockTuple((_rel), (_tup), TupleLockExtraInfo[_mode].hwlock) - typedef struct UHeapWALInfo { Oid relOid; Oid partitionOid; diff --git a/src/include/access/ustore/knl_umultilocker.h b/src/include/access/ustore/knl_umultilocker.h index 428468755..ae4b8de07 100644 --- a/src/include/access/ustore/knl_umultilocker.h +++ b/src/include/access/ustore/knl_umultilocker.h @@ -22,30 +22,6 @@ #include "access/multixact.h" #include "access/ustore/knl_utuple.h" -const struct LockExtraInfo TupleLockExtraInfo[MaxLockTupleMode + 1] = { - { - /* LockTupleKeyShare */ - AccessShareLock, - MultiXactStatusForKeyShare, - -1 /* KeyShare does not allow updating tuples */ - }, - { - ShareLock, /* LockTupleShared */ - MultiXactStatusForShare, - -1 - }, - { - ExclusiveLock, /* LockTupleNoKeyExclusive */ - MultiXactStatusForNoKeyUpdate, - MultiXactStatusNoKeyUpdate - }, - { - ExclusiveLock, /* LockTupleExclusive */ - MultiXactStatusForUpdate, - MultiXactStatusUpdate - } -}; - /* * Get the heavy-weight lock mode from lock tuple mode. 
*/ diff --git a/src/include/access/xlogproc.h b/src/include/access/xlogproc.h index a7875e4c9..9e7156852 100755 --- a/src/include/access/xlogproc.h +++ b/src/include/access/xlogproc.h @@ -901,20 +901,22 @@ extern AbnormalProcFunc g_AbFunList[ABNORMAL_NUM]; void HeapXlogCleanOperatorPage( RedoBufferInfo* buffer, void* recorddata, void* blkdata, Size datalen, Size* freespace, bool repairFragmentation); -void HeapXlogFreezeOperatorPage(RedoBufferInfo* buffer, void* recorddata, void* blkdata, Size datalen); +void HeapXlogFreezeOperatorPage(RedoBufferInfo* buffer, void* recorddata, void* blkdata, Size datalen, + bool isTupleLockUpgrade); void HeapXlogVisibleOperatorPage(RedoBufferInfo* buffer, void* recorddata); void HeapXlogVisibleOperatorVmpage(RedoBufferInfo* vmbuffer, void* recorddata); -void HeapXlogDeleteOperatorPage(RedoBufferInfo* buffer, void* recorddata, TransactionId recordxid); +void HeapXlogDeleteOperatorPage(RedoBufferInfo* buffer, void* recorddata, TransactionId recordxid, + bool isTupleLockUpgrade); void HeapXlogInsertOperatorPage(RedoBufferInfo* buffer, void* recorddata, bool isinit, void* blkdata, Size datalen, TransactionId recxid, Size* freespace, bool tde = false); void HeapXlogMultiInsertOperatorPage(RedoBufferInfo* buffer, const void* recoreddata, bool isinit, const void* blkdata, Size len, TransactionId recordxid, Size* freespace, bool tde = false); void HeapXlogUpdateOperatorOldpage(RedoBufferInfo* buffer, void* recoreddata, bool hot_update, bool isnewinit, - BlockNumber newblk, TransactionId recordxid); + BlockNumber newblk, TransactionId recordxid, bool isTupleLockUpgrade); void HeapXlogUpdateOperatorNewpage(RedoBufferInfo* buffer, void* recorddata, bool isinit, void* blkdata, - Size datalen, TransactionId recordxid, Size* freespace, bool tde = false); + Size datalen, TransactionId recordxid, Size* freespace, bool isTupleLockUpgrade, bool tde = false); void HeapXlogPageUpgradeOperatorPage(RedoBufferInfo* buffer); -void HeapXlogLockOperatorPage(RedoBufferInfo* buffer, void* recorddata); +void HeapXlogLockOperatorPage(RedoBufferInfo* buffer, void* recorddata, bool isTupleLockUpgrade); void HeapXlogInplaceOperatorPage(RedoBufferInfo* buffer, void* recorddata, void* blkdata, Size newlen); void HeapXlogBaseShiftOperatorPage(RedoBufferInfo* buffer, void* recorddata); diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index 6f0e76cd1..fe8c55e4b 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -84,6 +84,10 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO TransactionId relfrozenxid64; /* all Xids < this are frozen in this rel */ Oid relbucket; /* bucket info in pg_hashbucket */ int2vector relbucketkey; /* Column number of hash partition */ +#ifdef CATALOG_VARLEN + TransactionId relminmxid; /* all multixacts in this rel are >= this. 
+ * this is really a MultiXactId */ +#endif } FormData_pg_class; @@ -102,7 +106,7 @@ typedef FormData_pg_class* Form_pg_class; * ---------------- */ -#define Natts_pg_class 39 +#define Natts_pg_class 40 #define Anum_pg_class_relname 1 #define Anum_pg_class_relnamespace 2 #define Anum_pg_class_reltype 3 @@ -142,6 +146,7 @@ typedef FormData_pg_class* Form_pg_class; #define Anum_pg_class_relfrozenxid64 37 #define Anum_pg_class_relbucket 38 #define Anum_pg_class_relbucketkey 39 +#define Anum_pg_class_relminmxid 40 /* ---------------- * initial contents of pg_class @@ -152,16 +157,19 @@ typedef FormData_pg_class* Form_pg_class; * ---------------- */ -/* Note: "3" in the relfrozenxid and the relfrozenxid64 column stands for FirstNormalTransactionId */ -DATA(insert OID = 1247 ( pg_type PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 30 0 t f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_)); +/* + * Note: "3" in the relfrozenxid and the relfrozenxid64 column stands for FirstNormalTransactionId; + * similarly, "1" in relminmxid stands for FirstMultiXactId. + */ +DATA(insert OID = 1247 ( pg_type PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 30 0 t f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_ 1)); DESCR(""); -DATA(insert OID = 1249 ( pg_attribute PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 24 0 f f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_)); +DATA(insert OID = 1249 ( pg_attribute PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 24 0 f f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_ 1)); DESCR(""); -DATA(insert OID = 1255 ( pg_proc PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 37 0 t f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_)); +DATA(insert OID = 1255 ( pg_proc PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 37 0 t f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_ 1)); DESCR(""); -DATA(insert OID = 7815 ( gs_package PGNSP 9745 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 7 0 t f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_)); +DATA(insert OID = 7815 ( gs_package PGNSP 9745 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 7 0 t f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_ 1)); DESCR(""); -DATA(insert OID = 1259 ( pg_class PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 39 0 t f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_)); +DATA(insert OID = 1259 ( pg_class PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 0 0 0 0 f f p r 40 0 t f f f f 0 f f n 3 _null_ _null_ n 3 _null_ _null_ 1)); DESCR(""); #define RELKIND_RELATION 'r' /* ordinary table */ diff --git a/src/include/catalog/pg_database.h b/src/include/catalog/pg_database.h index 49ef8f752..447cc729e 100644 --- a/src/include/catalog/pg_database.h +++ b/src/include/catalog/pg_database.h @@ -48,6 +48,7 @@ CATALOG(pg_database,1262) BKI_SHARED_RELATION BKI_ROWTYPE_OID(1248) BKI_SCHEMA_M aclitem datacl[1]; /* access permissions */ #endif TransactionId datfrozenxid64; /* all Xids < this are frozen in this DB */ + TransactionId datminmxid; /* all multixacts in the DB are >= this */ } FormData_pg_database; /* Size of fixed part of pg_database tuples, not counting var-length fields */ @@ -65,7 +66,7 @@ typedef FormData_pg_database *Form_pg_database; * compiler constants for pg_database * ---------------- */ -#define Natts_pg_database 14 +#define Natts_pg_database 15 #define Anum_pg_database_datname 1 #define Anum_pg_database_datdba 2 #define Anum_pg_database_encoding 3 @@ -80,8 +81,9 @@ typedef FormData_pg_database *Form_pg_database; #define Anum_pg_database_compatibility 12 #define Anum_pg_database_datacl 13 #define 
Anum_pg_database_datfrozenxid64 14 +#define Anum_pg_database_datminmxid 15 -DATA(insert OID = 1 ( template1 PGUID ENCODING "LC_COLLATE" "LC_CTYPE" t t -1 0 0 1663 "DB_COMPATIBILITY" _null_ 3)); +DATA(insert OID = 1 ( template1 PGUID ENCODING "LC_COLLATE" "LC_CTYPE" t t -1 0 0 1663 "DB_COMPATIBILITY" _null_ 3 1)); SHDESCR("unmodifiable empty database"); #define TemplateDbOid 1 diff --git a/src/include/catalog/pg_partition.h b/src/include/catalog/pg_partition.h index 8d5812b64..4b7fd915d 100644 --- a/src/include/catalog/pg_partition.h +++ b/src/include/catalog/pg_partition.h @@ -62,6 +62,8 @@ CATALOG(pg_partition,9016) BKI_ROWTYPE_OID(3790) BKI_SCHEMA_MACRO text reloptions[1]; /* access-method-specific options */ #endif TransactionId relfrozenxid64; + TransactionId relminmxid; /* all multixacts in this rel are >= this. + * this is really a MultiXactId */ } FormData_pg_partition; /* Size of fixed part of pg_partition tuples, not counting var-length fields */ #define PARTITION_TUPLE_SIZE \ @@ -91,7 +93,7 @@ typedef FormData_pg_partition *Form_pg_partition; #define PART_OBJ_TYPE_TABLE_PARTITION 'p' #define PART_OBJ_TYPE_INDEX_PARTITION 'x' -#define Natts_pg_partition 28 +#define Natts_pg_partition 29 #define Anum_pg_partition_relname 1 #define Anum_pg_partition_parttype 2 #define Anum_pg_partition_parentid 3 @@ -120,5 +122,6 @@ typedef FormData_pg_partition *Form_pg_partition; #define Anum_pg_partition_transit 26 #define Anum_pg_partition_reloptions 27 #define Anum_pg_partition_relfrozenxid64 28 +#define Anum_pg_partition_relminmxid 29 #endif/*PG_PARTITION_H*/ diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h index 8bd1d57bd..7bf9525a8 100644 --- a/src/include/commands/cluster.h +++ b/src/include/commands/cluster.h @@ -32,15 +32,15 @@ extern double copy_heap_data_internal(Relation OldHeap, Relation OldIndex, Relat TransactionId FreezeXid, bool verbose, bool use_sort, AdaptMem* memUsage); extern double CopyUHeapDataInternal(Relation oldHeap, Relation oldIndex, Relation newHeap, TransactionId oldestXmin, TransactionId freezeXid, bool verbose, bool useSort, const AdaptMem* memUsage); -extern TransactionId getPartitionRelfrozenxid(Relation ordTableRel); -extern TransactionId getRelationRelfrozenxid(Relation ordTableRel); +extern void getPartitionRelxids(Relation ordTableRel, TransactionId* frozenXid, MultiXactId* multiXid = NULL); +extern void getRelationRelxids(Relation ordTableRel, TransactionId* frozenXid, MultiXactId* multiXid = NULL); extern void setRelationRelfrozenxid(Oid relid, TransactionId frozenXid); extern void setPartitionRelfrozenxid(Oid partid, TransactionId frozenXid); extern void finishPartitionHeapSwap(Oid partitionOid, Oid tempTableOid, bool swapToastByContent, - TransactionId frozenXid, bool tempTableIsPartition = false); + TransactionId frozenXid, MultiXactId multiXid, bool tempTableIsPartition = false); extern void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool is_system_catalog, bool swap_toast_by_content, - bool check_constraints, TransactionId frozenXid, AdaptMem* memInfo = NULL); + bool check_constraints, TransactionId frozenXid, MultiXactId frozenMulti, AdaptMem* memInfo = NULL); extern void vacuumFullPart(Oid partOid, VacuumStmt* vacstmt, int freeze_min_age, int freeze_table_age); extern void GpiVacuumFullMainPartiton(Oid parentOid); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 9a74f38c1..a3cc903b5 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -408,9 +408,10 @@ 
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 9a74f38c1..a3cc903b5 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -408,9 +408,10 @@ extern void vac_close_indexes(int nindexes, Relation* Irel, LOCKMODE lockmode);
 extern double vac_estimate_reltuples(
     Relation relation, BlockNumber total_pages, BlockNumber scanned_pages, double scanned_tuples);
 extern void vac_update_relstats(Relation relation, Relation classRel, RelPageType num_pages, double num_tuples,
-    BlockNumber num_all_visible_pages, bool hasindex, TransactionId frozenxid);
+    BlockNumber num_all_visible_pages, bool hasindex, TransactionId frozenxid,
+    MultiXactId minmulti = InvalidMultiXactId);
 extern void vacuum_set_xid_limits(Relation rel, int64 freeze_min_age, int64 freeze_table_age, TransactionId* oldestXmin,
-    TransactionId* freezeLimit, TransactionId* freezeTableLimit);
+    TransactionId* freezeLimit, TransactionId* freezeTableLimit, MultiXactId* multiXactFrzLimit);
 
 extern void vac_update_datfrozenxid(void);
 extern void vacuum_delay_point(void);
@@ -445,19 +446,20 @@ extern void delete_attstats_replication(Oid relid, VacuumStmt* stmt);
 extern int compute_attr_target(Form_pg_attribute attr);
 
 extern void vac_update_partstats(Partition part, BlockNumber num_pages, double num_tuples,
-    BlockNumber num_all_visible_pages, TransactionId frozenxid);
+    BlockNumber num_all_visible_pages, TransactionId frozenxid, MultiXactId minmulti = InvalidMultiXactId);
 extern void vac_open_part_indexes(VacuumStmt* vacstmt, LOCKMODE lockmode, int* nindexes, int* nindexesGlobal,
     Relation** Irel, Relation** indexrel, Partition** indexpart);
 extern void vac_close_part_indexes(
     int nindexes, int nindexesGlobal, Relation* Irel, Relation* indexrel, Partition* indexpart, LOCKMODE lockmode);
-extern void vac_update_pgclass_partitioned_table(Relation partitionRel, bool hasIndex, TransactionId newFrozenXid);
+extern void vac_update_pgclass_partitioned_table(Relation partitionRel, bool hasIndex, TransactionId newFrozenXid,
+    MultiXactId newMultiXid);
 extern void CStoreVacUpdateNormalRelStats(Oid relid, TransactionId frozenxid, Relation pgclassRel);
 extern void CStoreVacUpdatePartitionRelStats(Relation partitionRel, TransactionId newFrozenXid);
 extern void CStoreVacUpdatePartitionStats(Oid relid, TransactionId frozenxid);
 extern void CalculatePartitionedRelStats(_in_ Relation partitionRel, _in_ Relation pgPartitionRel,
     _out_ BlockNumber* totalPages, _out_ BlockNumber* totalVisiblePages, _out_ double* totalTuples,
-    _out_ TransactionId* minFrozenXid);
+    _out_ TransactionId* minFrozenXid, _out_ MultiXactId* minMultiXid);
 extern bool IsToastRelationbyOid(Oid relid);
 extern Oid pg_toast_get_baseid(Oid relOid, bool* isPartToast);
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 2e5ae386e..d7c5c8811 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -226,7 +226,7 @@ extern void ExecConstraints(ResultRelInfo* resultRelInfo, TupleTableSlot* slot,
 extern ExecRowMark* ExecFindRowMark(EState* estate, Index rti);
 extern ExecAuxRowMark* ExecBuildAuxRowMark(ExecRowMark* erm, List* targetlist);
 extern TupleTableSlot* EvalPlanQual(EState* estate, EPQState* epqstate, Relation relation, Index rti,
-    ItemPointer tid, TransactionId priorXmax, bool partRowMoveUpdate);
+    int lockmode, ItemPointer tid, TransactionId priorXmax, bool partRowMoveUpdate);
 extern HeapTuple heap_lock_updated(
     CommandId cid, Relation relation, int lockmode, ItemPointer tid, TransactionId priorXmax);
 extern TupleTableSlot* EvalPlanQualUHeap(EState* estate, EPQState* epqstate, Relation relation, Index rti,
     ItemPointer tid, TransactionId priorXmax);
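
vacuum_set_xid_limits now also reports a multixact freeze limit through the new multiXactFrzLimit out-parameter. The sketch below shows the usual shape of such a cutoff computation (the oldest visible value minus the configured freeze age, clamped to the first valid ID); it is illustrative arithmetic under assumed names, not the function's actual body.

#include <cstdint>
#include <cstdio>
#include <algorithm>

typedef uint64_t MultiXactId;
static const MultiXactId FirstMultiXactId = 1;

/* A freeze limit is "oldest visible multixact minus the configured freeze
 * age", clamped so it never falls below the first valid id. */
static MultiXactId ComputeMultiXactFrzLimit(MultiXactId oldestMulti, uint64_t freezeMinAge)
{
    MultiXactId limit = (oldestMulti > freezeMinAge) ? (oldestMulti - freezeMinAge) : FirstMultiXactId;
    return std::max(limit, FirstMultiXactId);
}

int main()
{
    printf("limit=%llu\n", (unsigned long long)ComputeMultiXactFrzLimit(100000, 5000)); /* 95000 */
    printf("limit=%llu\n", (unsigned long long)ComputeMultiXactFrzLimit(100, 5000));    /* clamped to 1 */
    return 0;
}
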
diff --git a/src/include/knl/knl_session.h b/src/include/knl/knl_session.h
index eef9ce03c..6f40cc02e 100644
--- a/src/include/knl/knl_session.h
+++ b/src/include/knl/knl_session.h
@@ -728,6 +728,7 @@ typedef struct knl_u_commands_context {
     List* PendingLibraryDeletes;
     TransactionId OldestXmin;
     TransactionId FreezeLimit;
+    MultiXactId MultiXactFrzLimit;
     struct SeqTableData* seqtab; /* Head of list of SeqTable items */
     /*
      * last_used_seq is updated by nextval() to point to the last used
diff --git a/src/include/knl/knl_thread.h b/src/include/knl/knl_thread.h
index 1fae36e36..28b7b5d3d 100644
--- a/src/include/knl/knl_thread.h
+++ b/src/include/knl/knl_thread.h
@@ -1404,8 +1404,9 @@ typedef struct knl_t_autovacuum_context {
     volatile sig_atomic_t got_SIGUSR2;
     volatile sig_atomic_t got_SIGTERM;
 
-    /* Comparison point for determining whether freeze_max_age is exceeded */
+    /* Comparison points for determining whether freeze_max_age is exceeded */
     TransactionId recentXid;
+    MultiXactId recentMulti;
 
     /* Default freeze ages to use for autovacuum (varies by database) */
     int64 default_freeze_min_age;
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index cde7fed15..8c1842a30 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -72,6 +72,7 @@ extern void register_backend_version(uint32 backend_version);
 extern bool contain_backend_version(uint32 version_number);
 extern const uint32 ANALYZER_HOOK_VERSION_NUM;
 extern const uint32 SUPPORT_HASH_XLOG_VERSION_NUM;
+extern const uint32 ENHANCED_TUPLE_LOCK_VERSION_NUM;
 
 #define INPLACE_UPGRADE_PRECOMMIT_VERSION 1
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index fbdaa5a08..2ac2eee04 100755
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2375,7 +2375,7 @@ typedef struct SetOpState {
 /* ----------------
  * LockRowsState information
  *
- * LockRows nodes are used to enforce FOR UPDATE/FOR SHARE locking.
+ * LockRows nodes are used to enforce FOR [NO KEY] UPDATE/FOR [KEY] SHARE locking.
  * ----------------
  */
 typedef struct LockRowsState {
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 2c6c4f1a1..51c798ada 100755
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -87,7 +87,7 @@ typedef uint32 AclMode; /* a bitmask of privilege bits */
 
 #define ACL_WRITE (1 << 14) /* for pg_directory */
 #define N_ACL_RIGHTS 15 /* 1 plus the last 1<
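
ENHANCED_TUPLE_LOCK_VERSION_NUM extends the list of working-version constants declared above it, which gate new on-disk or protocol behaviour during rolling upgrades. The mock below shows the typical shape of such a gate; the numeric value and variable names are assumptions for illustration, not values taken from this patch.

#include <cstdint>
#include <cstdio>

/* Hypothetical stand-ins: the real constant is assigned elsewhere in the
 * upgrade framework, and the working version normally lives in shared state. */
static const uint32_t ENHANCED_TUPLE_LOCK_VERSION_NUM = 92350; /* assumed value */
static uint32_t g_workingVersionNum = 92350;                   /* assumed cluster state */

/* Gate pattern: only use the new lock strengths (FOR NO KEY UPDATE /
 * FOR KEY SHARE) once every node understands them; otherwise degrade to
 * the old FOR UPDATE / FOR SHARE behaviour. */
static bool EnhancedTupleLockUsable()
{
    return g_workingVersionNum >= ENHANCED_TUPLE_LOCK_VERSION_NUM;
}

int main()
{
    printf("enhanced tuple locks usable: %s\n", EnhancedTupleLockUsable() ? "yes" : "no");
    return 0;
}
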