Skip to content

Commit e4b1986

Browse files
committed
WAL-log inplace update before revealing it to other sessions.
A buffer lock won't stop a reader having already checked tuple visibility. If a vac_update_datfrozenid() and then a crash happened during inplace update of a relfrozenxid value, datfrozenxid could overtake relfrozenxid. That could lead to "could not access status of transaction" errors. Back-patch to v14 - v17. This is a back-patch of commits: - 8e7e672 (main change, on master, before v18 branched) - 8180136 (defect fix, on master, before v18 branched) It reverses commit bc6bad8, my revert of the original back-patch. In v14, this also back-patches the assertion removal from commit 7fcf2fa. Discussion: https://postgr.es/m/20240620012908.92.nmisch@google.com Backpatch-through: 14-17
1 parent 8114224 commit e4b1986

File tree

3 files changed

+55
-19
lines changed

3 files changed

+55
-19
lines changed

src/backend/access/heap/README.tuplock

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,9 +198,7 @@ Inplace updates create an exception to the rule that tuple data won't change
198198
under a reader holding a pin. A reader of a heap_fetch() result tuple may
199199
witness a torn read. Current inplace-updated fields are aligned and are no
200200
wider than four bytes, and current readers don't need consistency across
201-
fields. Hence, they get by with just fetching each field once. XXX such a
202-
caller may also read a value that has not reached WAL; see
203-
systable_inplace_update_finish().
201+
fields. Hence, they get by with just fetching each field once.
204202

205203
During logical decoding, caches reflect an inplace update no later than the
206204
next XLOG_XACT_INVALIDATIONS. That record witnesses the end of a command.

src/backend/access/heap/heapam.c

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6437,13 +6437,18 @@ heap_inplace_update_and_unlock(Relation relation,
64376437
HeapTupleHeader htup = oldtup->t_data;
64386438
uint32 oldlen;
64396439
uint32 newlen;
6440+
char *dst;
6441+
char *src;
64406442

64416443
Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
64426444
oldlen = oldtup->t_len - htup->t_hoff;
64436445
newlen = tuple->t_len - tuple->t_data->t_hoff;
64446446
if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
64456447
elog(ERROR, "wrong tuple length");
64466448

6449+
dst = (char *) htup + htup->t_hoff;
6450+
src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6451+
64476452
/*
64486453
* Unlink relcache init files as needed. If unlinking, acquire
64496454
* RelCacheInitLock until after associated invalidations. By doing this
@@ -6454,15 +6459,15 @@ heap_inplace_update_and_unlock(Relation relation,
64546459
*/
64556460
PreInplace_Inval();
64566461

6457-
/* NO EREPORT(ERROR) from here till changes are logged */
6458-
START_CRIT_SECTION();
6459-
6460-
memcpy((char *) htup + htup->t_hoff,
6461-
(char *) tuple->t_data + tuple->t_data->t_hoff,
6462-
newlen);
6463-
64646462
/*----------
6465-
* XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
6463+
* NO EREPORT(ERROR) from here till changes are complete
6464+
*
6465+
* Our buffer lock won't stop a reader having already pinned and checked
6466+
* visibility for this tuple. Hence, we write WAL first, then mutate the
6467+
* buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6468+
* checkpoint delay makes that acceptable. With the usual order of
6469+
* changes, a crash after memcpy() and before XLogInsert() could allow
6470+
* datfrozenxid to overtake relfrozenxid:
64666471
*
64676472
* ["D" is a VACUUM (ONLY_DATABASE_STATS)]
64686473
* ["R" is a VACUUM tbl]
@@ -6472,31 +6477,65 @@ heap_inplace_update_and_unlock(Relation relation,
64726477
* D: raise pg_database.datfrozenxid, XLogInsert(), finish
64736478
* [crash]
64746479
* [recovery restores datfrozenxid w/o relfrozenxid]
6475-
*/
6476-
6477-
MarkBufferDirty(buffer);
6480+
*
6481+
* Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
6482+
* Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
6483+
* The stack copy facilitates a FPI of the post-mutation block before we
6484+
* accept other sessions seeing it. DELAY_CHKPT_START allows us to
6485+
* XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint()
6486+
* can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
6487+
* This function, however, likely could avoid it with the following order
6488+
* of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use
6489+
* DELAY_CHKPT_START here, too, as a way to have fewer distinct code
6490+
* patterns to analyze. Inplace update isn't so frequent that it should
6491+
* pursue the small optimization of skipping DELAY_CHKPT_START.
6492+
*/
6493+
Assert(!MyProc->delayChkpt);
6494+
START_CRIT_SECTION();
6495+
MyProc->delayChkpt = true;
64786496

64796497
/* XLOG stuff */
64806498
if (RelationNeedsWAL(relation))
64816499
{
64826500
xl_heap_inplace xlrec;
6501+
PGAlignedBlock copied_buffer;
6502+
char *origdata = (char *) BufferGetBlock(buffer);
6503+
Page page = BufferGetPage(buffer);
6504+
uint16 lower = ((PageHeader) page)->pd_lower;
6505+
uint16 upper = ((PageHeader) page)->pd_upper;
6506+
uintptr_t dst_offset_in_block;
6507+
RelFileNode rnode;
6508+
ForkNumber forkno;
6509+
BlockNumber blkno;
64836510
XLogRecPtr recptr;
64846511

64856512
xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
64866513

64876514
XLogBeginInsert();
64886515
XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
64896516

6490-
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6491-
XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
6517+
/* register block matching what buffer will look like after changes */
6518+
memcpy(copied_buffer.data, origdata, lower);
6519+
memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
6520+
dst_offset_in_block = dst - origdata;
6521+
memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
6522+
BufferGetTag(buffer, &rnode, &forkno, &blkno);
6523+
Assert(forkno == MAIN_FORKNUM);
6524+
XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data,
6525+
REGBUF_STANDARD);
6526+
XLogRegisterBufData(0, src, newlen);
64926527

64936528
/* inplace updates aren't decoded atm, don't log the origin */
64946529

64956530
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
64966531

6497-
PageSetLSN(BufferGetPage(buffer), recptr);
6532+
PageSetLSN(page, recptr);
64986533
}
64996534

6535+
memcpy(dst, src, newlen);
6536+
6537+
MarkBufferDirty(buffer);
6538+
65006539
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
65016540

65026541
/*
@@ -6505,6 +6544,7 @@ heap_inplace_update_and_unlock(Relation relation,
65056544
*/
65066545
AtInplace_Inval();
65076546

6547+
MyProc->delayChkpt = false;
65086548
END_CRIT_SECTION();
65096549
UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
65106550

src/backend/access/transam/xloginsert.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -275,8 +275,6 @@ XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum,
275275
{
276276
registered_buffer *regbuf;
277277

278-
/* This is currently only used to WAL-log a full-page image of a page */
279-
Assert(flags & REGBUF_FORCE_IMAGE);
280278
Assert(begininsert_called);
281279

282280
if (block_id >= max_registered_block_id)

0 commit comments

Comments
 (0)