@@ -6437,13 +6437,18 @@ heap_inplace_update_and_unlock(Relation relation,
64376437 HeapTupleHeader htup = oldtup -> t_data ;
64386438 uint32 oldlen ;
64396439 uint32 newlen ;
6440+ char * dst ;
6441+ char * src ;
64406442
64416443 Assert (ItemPointerEquals (& oldtup -> t_self , & tuple -> t_self ));
64426444 oldlen = oldtup -> t_len - htup -> t_hoff ;
64436445 newlen = tuple -> t_len - tuple -> t_data -> t_hoff ;
64446446 if (oldlen != newlen || htup -> t_hoff != tuple -> t_data -> t_hoff )
64456447 elog (ERROR , "wrong tuple length" );
64466448
6449+ dst = (char * ) htup + htup -> t_hoff ;
6450+ src = (char * ) tuple -> t_data + tuple -> t_data -> t_hoff ;
6451+
64476452 /*
64486453 * Unlink relcache init files as needed. If unlinking, acquire
64496454 * RelCacheInitLock until after associated invalidations. By doing this
@@ -6454,15 +6459,15 @@ heap_inplace_update_and_unlock(Relation relation,
64546459 */
64556460 PreInplace_Inval ();
64566461
6457- /* NO EREPORT(ERROR) from here till changes are logged */
6458- START_CRIT_SECTION ();
6459-
6460- memcpy ((char * ) htup + htup -> t_hoff ,
6461- (char * ) tuple -> t_data + tuple -> t_data -> t_hoff ,
6462- newlen );
6463-
64646462 /*----------
6465- * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
6463+ * NO EREPORT(ERROR) from here till changes are complete
6464+ *
6465+ * Our buffer lock won't stop a reader having already pinned and checked
6466+ * visibility for this tuple. Hence, we write WAL first, then mutate the
6467+ * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6468+ * checkpoint delay makes that acceptable. With the usual order of
6469+ * changes, a crash after memcpy() and before XLogInsert() could allow
6470+ * datfrozenxid to overtake relfrozenxid:
64666471 *
64676472 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
64686473 * ["R" is a VACUUM tbl]
@@ -6472,31 +6477,65 @@ heap_inplace_update_and_unlock(Relation relation,
64726477 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
64736478 * [crash]
64746479 * [recovery restores datfrozenxid w/o relfrozenxid]
6475- */
6476-
6477- MarkBufferDirty (buffer );
6480+ *
6481+ * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
6482+ * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
6483+ * The stack copy facilitates a FPI of the post-mutation block before we
6484+ * accept other sessions seeing it. DELAY_CHKPT_START allows us to
6485+ * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint()
6486+ * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
6487+ * This function, however, likely could avoid it with the following order
6488+ * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use
6489+ * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
6490+ * patterns to analyze. Inplace update isn't so frequent that it should
6491+ * pursue the small optimization of skipping DELAY_CHKPT_START.
6492+ */
6493+ Assert (!MyProc -> delayChkpt );
6494+ START_CRIT_SECTION ();
6495+ MyProc -> delayChkpt = true;
64786496
64796497 /* XLOG stuff */
64806498 if (RelationNeedsWAL (relation ))
64816499 {
64826500 xl_heap_inplace xlrec ;
6501+ PGAlignedBlock copied_buffer ;
6502+ char * origdata = (char * ) BufferGetBlock (buffer );
6503+ Page page = BufferGetPage (buffer );
6504+ uint16 lower = ((PageHeader ) page )-> pd_lower ;
6505+ uint16 upper = ((PageHeader ) page )-> pd_upper ;
6506+ uintptr_t dst_offset_in_block ;
6507+ RelFileNode rnode ;
6508+ ForkNumber forkno ;
6509+ BlockNumber blkno ;
64836510 XLogRecPtr recptr ;
64846511
64856512 xlrec .offnum = ItemPointerGetOffsetNumber (& tuple -> t_self );
64866513
64876514 XLogBeginInsert ();
64886515 XLogRegisterData ((char * ) & xlrec , SizeOfHeapInplace );
64896516
6490- XLogRegisterBuffer (0 , buffer , REGBUF_STANDARD );
6491- XLogRegisterBufData (0 , (char * ) htup + htup -> t_hoff , newlen );
6517+ /* register block matching what buffer will look like after changes */
6518+ memcpy (copied_buffer .data , origdata , lower );
6519+ memcpy (copied_buffer .data + upper , origdata + upper , BLCKSZ - upper );
6520+ dst_offset_in_block = dst - origdata ;
6521+ memcpy (copied_buffer .data + dst_offset_in_block , src , newlen );
6522+ BufferGetTag (buffer , & rnode , & forkno , & blkno );
6523+ Assert (forkno == MAIN_FORKNUM );
6524+ XLogRegisterBlock (0 , & rnode , forkno , blkno , copied_buffer .data ,
6525+ REGBUF_STANDARD );
6526+ XLogRegisterBufData (0 , src , newlen );
64926527
64936528 /* inplace updates aren't decoded atm, don't log the origin */
64946529
64956530 recptr = XLogInsert (RM_HEAP_ID , XLOG_HEAP_INPLACE );
64966531
6497- PageSetLSN (BufferGetPage ( buffer ) , recptr );
6532+ PageSetLSN (page , recptr );
64986533 }
64996534
6535+ memcpy (dst , src , newlen );
6536+
6537+ MarkBufferDirty (buffer );
6538+
65006539 LockBuffer (buffer , BUFFER_LOCK_UNLOCK );
65016540
65026541 /*
@@ -6505,6 +6544,7 @@ heap_inplace_update_and_unlock(Relation relation,
65056544 */
65066545 AtInplace_Inval ();
65076546
6547+ MyProc -> delayChkpt = false;
65086548 END_CRIT_SECTION ();
65096549 UnlockTuple (relation , & tuple -> t_self , InplaceUpdateTupleLock );
65106550
0 commit comments