@@ -6397,13 +6397,18 @@ heap_inplace_update_and_unlock(Relation relation,
63976397 HeapTupleHeader htup = oldtup -> t_data ;
63986398 uint32 oldlen ;
63996399 uint32 newlen ;
6400+ char * dst ;
6401+ char * src ;
64006402
64016403 Assert (ItemPointerEquals (& oldtup -> t_self , & tuple -> t_self ));
64026404 oldlen = oldtup -> t_len - htup -> t_hoff ;
64036405 newlen = tuple -> t_len - tuple -> t_data -> t_hoff ;
64046406 if (oldlen != newlen || htup -> t_hoff != tuple -> t_data -> t_hoff )
64056407 elog (ERROR , "wrong tuple length" );
64066408
6409+ dst = (char * ) htup + htup -> t_hoff ;
6410+ src = (char * ) tuple -> t_data + tuple -> t_data -> t_hoff ;
6411+
64076412 /*
64086413 * Unlink relcache init files as needed. If unlinking, acquire
64096414 * RelCacheInitLock until after associated invalidations. By doing this
@@ -6414,15 +6419,15 @@ heap_inplace_update_and_unlock(Relation relation,
64146419 */
64156420 PreInplace_Inval ();
64166421
6417- /* NO EREPORT(ERROR) from here till changes are logged */
6418- START_CRIT_SECTION ();
6419-
6420- memcpy ((char * ) htup + htup -> t_hoff ,
6421- (char * ) tuple -> t_data + tuple -> t_data -> t_hoff ,
6422- newlen );
6423-
64246422 /*----------
6425- * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
6423+ * NO EREPORT(ERROR) from here till changes are complete
6424+ *
6425+ * Our buffer lock won't stop a reader having already pinned and checked
6426+ * visibility for this tuple. Hence, we write WAL first, then mutate the
6427+ * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6428+ * checkpoint delay makes that acceptable. With the usual order of
6429+ * changes, a crash after memcpy() and before XLogInsert() could allow
6430+ * datfrozenxid to overtake relfrozenxid:
64266431 *
64276432 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
64286433 * ["R" is a VACUUM tbl]
@@ -6432,31 +6437,65 @@ heap_inplace_update_and_unlock(Relation relation,
64326437 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
64336438 * [crash]
64346439 * [recovery restores datfrozenxid w/o relfrozenxid]
6435- */
6436-
6437- MarkBufferDirty (buffer );
6440+ *
6441+ * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
6442+ * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
6443+ * The stack copy facilitates a FPI of the post-mutation block before we
6444+ * accept other sessions seeing it. DELAY_CHKPT_START allows us to
6445+ * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint()
6446+ * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
6447+ * This function, however, likely could avoid it with the following order
6448+ * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use
6449+ * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
6450+ * patterns to analyze. Inplace update isn't so frequent that it should
6451+ * pursue the small optimization of skipping DELAY_CHKPT_START.
6452+ */
6453+ Assert ((MyProc -> delayChkptFlags & DELAY_CHKPT_START ) == 0 );
6454+ START_CRIT_SECTION ();
6455+ MyProc -> delayChkptFlags |= DELAY_CHKPT_START ;
64386456
64396457 /* XLOG stuff */
64406458 if (RelationNeedsWAL (relation ))
64416459 {
64426460 xl_heap_inplace xlrec ;
6461+ PGAlignedBlock copied_buffer ;
6462+ char * origdata = (char * ) BufferGetBlock (buffer );
6463+ Page page = BufferGetPage (buffer );
6464+ uint16 lower = ((PageHeader ) page )-> pd_lower ;
6465+ uint16 upper = ((PageHeader ) page )-> pd_upper ;
6466+ uintptr_t dst_offset_in_block ;
6467+ RelFileNode rnode ;
6468+ ForkNumber forkno ;
6469+ BlockNumber blkno ;
64436470 XLogRecPtr recptr ;
64446471
64456472 xlrec .offnum = ItemPointerGetOffsetNumber (& tuple -> t_self );
64466473
64476474 XLogBeginInsert ();
64486475 XLogRegisterData ((char * ) & xlrec , SizeOfHeapInplace );
64496476
6450- XLogRegisterBuffer (0 , buffer , REGBUF_STANDARD );
6451- XLogRegisterBufData (0 , (char * ) htup + htup -> t_hoff , newlen );
6477+ /* register block matching what buffer will look like after changes */
6478+ memcpy (copied_buffer .data , origdata , lower );
6479+ memcpy (copied_buffer .data + upper , origdata + upper , BLCKSZ - upper );
6480+ dst_offset_in_block = dst - origdata ;
6481+ memcpy (copied_buffer .data + dst_offset_in_block , src , newlen );
6482+ BufferGetTag (buffer , & rnode , & forkno , & blkno );
6483+ Assert (forkno == MAIN_FORKNUM );
6484+ XLogRegisterBlock (0 , & rnode , forkno , blkno , copied_buffer .data ,
6485+ REGBUF_STANDARD );
6486+ XLogRegisterBufData (0 , src , newlen );
64526487
64536488 /* inplace updates aren't decoded atm, don't log the origin */
64546489
64556490 recptr = XLogInsert (RM_HEAP_ID , XLOG_HEAP_INPLACE );
64566491
6457- PageSetLSN (BufferGetPage ( buffer ) , recptr );
6492+ PageSetLSN (page , recptr );
64586493 }
64596494
6495+ memcpy (dst , src , newlen );
6496+
6497+ MarkBufferDirty (buffer );
6498+
64606499 LockBuffer (buffer , BUFFER_LOCK_UNLOCK );
64616500
64626501 /*
@@ -6465,6 +6504,7 @@ heap_inplace_update_and_unlock(Relation relation,
64656504 */
64666505 AtInplace_Inval ();
64676506
6507+ MyProc -> delayChkptFlags &= ~DELAY_CHKPT_START ;
64686508 END_CRIT_SECTION ();
64696509 UnlockTuple (relation , & tuple -> t_self , InplaceUpdateTupleLock );
64706510
0 commit comments