@@ -6288,13 +6288,18 @@ heap_inplace_update_and_unlock(Relation relation,
62886288 HeapTupleHeader htup = oldtup -> t_data ;
62896289 uint32 oldlen ;
62906290 uint32 newlen ;
6291+ char * dst ;
6292+ char * src ;
62916293
62926294 Assert (ItemPointerEquals (& oldtup -> t_self , & tuple -> t_self ));
62936295 oldlen = oldtup -> t_len - htup -> t_hoff ;
62946296 newlen = tuple -> t_len - tuple -> t_data -> t_hoff ;
62956297 if (oldlen != newlen || htup -> t_hoff != tuple -> t_data -> t_hoff )
62966298 elog (ERROR , "wrong tuple length" );
62976299
6300+ dst = (char * ) htup + htup -> t_hoff ;
6301+ src = (char * ) tuple -> t_data + tuple -> t_data -> t_hoff ;
6302+
62986303 /*
62996304 * Unlink relcache init files as needed. If unlinking, acquire
63006305 * RelCacheInitLock until after associated invalidations. By doing this
@@ -6305,15 +6310,15 @@ heap_inplace_update_and_unlock(Relation relation,
63056310 */
63066311 PreInplace_Inval ();
63076312
6308- /* NO EREPORT(ERROR) from here till changes are logged */
6309- START_CRIT_SECTION ();
6310-
6311- memcpy ((char * ) htup + htup -> t_hoff ,
6312- (char * ) tuple -> t_data + tuple -> t_data -> t_hoff ,
6313- newlen );
6314-
63156313 /*----------
6316- * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
6314+ * NO EREPORT(ERROR) from here till changes are complete
6315+ *
6316+ * Our buffer lock won't stop a reader having already pinned and checked
6317+ * visibility for this tuple. Hence, we write WAL first, then mutate the
6318+ * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6319+ * checkpoint delay makes that acceptable. With the usual order of
6320+ * changes, a crash after memcpy() and before XLogInsert() could allow
6321+ * datfrozenxid to overtake relfrozenxid:
63176322 *
63186323 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
63196324 * ["R" is a VACUUM tbl]
@@ -6323,31 +6328,65 @@ heap_inplace_update_and_unlock(Relation relation,
63236328 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
63246329 * [crash]
63256330 * [recovery restores datfrozenxid w/o relfrozenxid]
6326- */
6327-
6328- MarkBufferDirty (buffer );
6331+ *
6332+ * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
6333+ * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
6334+ * The stack copy facilitates a FPI of the post-mutation block before we
6335+ * accept other sessions seeing it. DELAY_CHKPT_START allows us to
6336+ * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint()
6337+ * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
6338+ * This function, however, likely could avoid it with the following order
6339+ * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use
6340+ * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
6341+ * patterns to analyze. Inplace update isn't so frequent that it should
6342+ * pursue the small optimization of skipping DELAY_CHKPT_START.
6343+ */
6344+ Assert ((MyProc -> delayChkptFlags & DELAY_CHKPT_START ) == 0 );
6345+ START_CRIT_SECTION ();
6346+ MyProc -> delayChkptFlags |= DELAY_CHKPT_START ;
63296347
63306348 /* XLOG stuff */
63316349 if (RelationNeedsWAL (relation ))
63326350 {
63336351 xl_heap_inplace xlrec ;
6352+ PGAlignedBlock copied_buffer ;
6353+ char * origdata = (char * ) BufferGetBlock (buffer );
6354+ Page page = BufferGetPage (buffer );
6355+ uint16 lower = ((PageHeader ) page )-> pd_lower ;
6356+ uint16 upper = ((PageHeader ) page )-> pd_upper ;
6357+ uintptr_t dst_offset_in_block ;
6358+ RelFileLocator rlocator ;
6359+ ForkNumber forkno ;
6360+ BlockNumber blkno ;
63346361 XLogRecPtr recptr ;
63356362
63366363 xlrec .offnum = ItemPointerGetOffsetNumber (& tuple -> t_self );
63376364
63386365 XLogBeginInsert ();
63396366 XLogRegisterData ((char * ) & xlrec , SizeOfHeapInplace );
63406367
6341- XLogRegisterBuffer (0 , buffer , REGBUF_STANDARD );
6342- XLogRegisterBufData (0 , (char * ) htup + htup -> t_hoff , newlen );
6368+ /* register block matching what buffer will look like after changes */
6369+ memcpy (copied_buffer .data , origdata , lower );
6370+ memcpy (copied_buffer .data + upper , origdata + upper , BLCKSZ - upper );
6371+ dst_offset_in_block = dst - origdata ;
6372+ memcpy (copied_buffer .data + dst_offset_in_block , src , newlen );
6373+ BufferGetTag (buffer , & rlocator , & forkno , & blkno );
6374+ Assert (forkno == MAIN_FORKNUM );
6375+ XLogRegisterBlock (0 , & rlocator , forkno , blkno , copied_buffer .data ,
6376+ REGBUF_STANDARD );
6377+ XLogRegisterBufData (0 , src , newlen );
63436378
63446379 /* inplace updates aren't decoded atm, don't log the origin */
63456380
63466381 recptr = XLogInsert (RM_HEAP_ID , XLOG_HEAP_INPLACE );
63476382
6348- PageSetLSN (BufferGetPage ( buffer ) , recptr );
6383+ PageSetLSN (page , recptr );
63496384 }
63506385
6386+ memcpy (dst , src , newlen );
6387+
6388+ MarkBufferDirty (buffer );
6389+
63516390 LockBuffer (buffer , BUFFER_LOCK_UNLOCK );
63526391
63536392 /*
@@ -6356,6 +6395,7 @@ heap_inplace_update_and_unlock(Relation relation,
63566395 */
63576396 AtInplace_Inval ();
63586397
6398+ MyProc -> delayChkptFlags &= ~DELAY_CHKPT_START ;
63596399 END_CRIT_SECTION ();
63606400 UnlockTuple (relation , & tuple -> t_self , InplaceUpdateTupleLock );
63616401
0 commit comments