@@ -274,12 +274,6 @@ typedef struct MultiXactStateData
274274 /* support for members anti-wraparound measures */
275275 MultiXactOffset offsetStopLimit ; /* known if oldestOffsetKnown */
276276
277- /*
278- * This is used to sleep until a multixact offset is written when we want
279- * to create the next one.
280- */
281- ConditionVariable nextoff_cv ;
282-
283277 /*
284278 * Per-backend data starts here. We have two arrays stored in the area
285279 * immediately following the MultiXactStateData struct. Each is indexed by
@@ -384,6 +378,9 @@ static MemoryContext MXactContext = NULL;
384378#define debug_elog6 (a ,b ,c ,d ,e ,f )
385379#endif
386380
381+ /* hack to deal with WAL generated with older minor versions */
382+ static int64 pre_initialized_offsets_page = -1 ;
383+
387384/* internal MultiXactId management */
388385static void MultiXactIdSetOldestVisible (void );
389386static void RecordNewMultiXact (MultiXactId multi , MultiXactOffset offset ,
@@ -915,13 +912,65 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
915912 int entryno ;
916913 int slotno ;
917914 MultiXactOffset * offptr ;
918- int i ;
915+ MultiXactId next ;
916+ int64 next_pageno ;
917+ int next_entryno ;
918+ MultiXactOffset * next_offptr ;
919919 LWLock * lock ;
920920 LWLock * prevlock = NULL ;
921921
922+ /* position of this multixid in the offsets SLRU area */
922923 pageno = MultiXactIdToOffsetPage (multi );
923924 entryno = MultiXactIdToOffsetEntry (multi );
924925
926+ /* position of the next multixid */
927+ next = multi + 1 ;
928+ if (next < FirstMultiXactId )
929+ next = FirstMultiXactId ;
930+ next_pageno = MultiXactIdToOffsetPage (next );
931+ next_entryno = MultiXactIdToOffsetEntry (next );
932+
933+ /*
934+ * Older minor versions didn't set the next multixid's offset in this
935+ * function, and therefore didn't initialize the next page until the next
936+ * multixid was assigned. If we're replaying WAL that was generated by
937+ * such a version, the next page might not be initialized yet. Initialize
938+ * it now.
939+ */
940+ if (InRecovery &&
941+ next_pageno != pageno &&
942+ pg_atomic_read_u64 (& MultiXactOffsetCtl -> shared -> latest_page_number ) == pageno )
943+ {
944+ elog (DEBUG1 , "next offsets page is not initialized, initializing it now" );
945+
946+ lock = SimpleLruGetBankLock (MultiXactOffsetCtl , next_pageno );
947+ LWLockAcquire (lock , LW_EXCLUSIVE );
948+
949+ /* Create and zero the page */
950+ slotno = SimpleLruZeroPage (MultiXactOffsetCtl , next_pageno );
951+
952+ /* Make sure it's written out */
953+ SimpleLruWritePage (MultiXactOffsetCtl , slotno );
954+ Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
955+
956+ LWLockRelease (lock );
957+
958+ /*
959+ * Remember that we initialized the page, so that we don't zero it
960+ * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
961+ */
962+ pre_initialized_offsets_page = next_pageno ;
963+ }
964+
965+ /*
966+ * Set the starting offset of this multixid's members.
967+ *
968+ * In the common case, it was already be set by the previous
969+ * RecordNewMultiXact call, as this was the next multixid of the previous
970+ * multixid. But if multiple backends are generating multixids
971+ * concurrently, we might race ahead and get called before the previous
972+ * multixid.
973+ */
925974 lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
926975 LWLockAcquire (lock , LW_EXCLUSIVE );
927976
@@ -936,22 +985,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
936985 offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
937986 offptr += entryno ;
938987
939- * offptr = offset ;
988+ if (* offptr != offset )
989+ {
990+ /* should already be set to the correct value, or not at all */
991+ Assert (* offptr == 0 );
992+ * offptr = offset ;
993+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
994+ }
940995
941- MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
996+ /*
997+ * Set the next multixid's offset to the end of this multixid's members.
998+ */
999+ if (next_pageno == pageno )
1000+ {
1001+ next_offptr = offptr + 1 ;
1002+ }
1003+ else
1004+ {
1005+ /* must be the first entry on the page */
1006+ Assert (next_entryno == 0 || next == FirstMultiXactId );
1007+
1008+ /* Swap the lock for a lock on the next page */
1009+ LWLockRelease (lock );
1010+ lock = SimpleLruGetBankLock (MultiXactOffsetCtl , next_pageno );
1011+ LWLockAcquire (lock , LW_EXCLUSIVE );
1012+
1013+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , next_pageno , true, next );
1014+ next_offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
1015+ next_offptr += next_entryno ;
1016+ }
1017+
1018+ if (* next_offptr != offset + nmembers )
1019+ {
1020+ /* should already be set to the correct value, or not at all */
1021+ Assert (* next_offptr == 0 );
1022+ * next_offptr = offset + nmembers ;
1023+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
1024+ }
9421025
9431026 /* Release MultiXactOffset SLRU lock. */
9441027 LWLockRelease (lock );
9451028
946- /*
947- * If anybody was waiting to know the offset of this multixact ID we just
948- * wrote, they can read it now, so wake them up.
949- */
950- ConditionVariableBroadcast (& MultiXactState -> nextoff_cv );
951-
9521029 prev_pageno = -1 ;
9531030
954- for (i = 0 ; i < nmembers ; i ++ , offset ++ )
1031+ for (int i = 0 ; i < nmembers ; i ++ , offset ++ )
9551032 {
9561033 TransactionId * memberptr ;
9571034 uint32 * flagsptr ;
@@ -1141,8 +1218,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
11411218 result = FirstMultiXactId ;
11421219 }
11431220
1144- /* Make sure there is room for the MXID in the file. */
1145- ExtendMultiXactOffset (result );
1221+ /*
1222+ * Make sure there is room for the next MXID in the file. Assigning this
1223+ * MXID sets the next MXID's offset already.
1224+ */
1225+ ExtendMultiXactOffset (result + 1 );
11461226
11471227 /*
11481228 * Reserve the members space, similarly to above. Also, be careful not to
@@ -1307,7 +1387,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13071387 MultiXactOffset nextOffset ;
13081388 MultiXactMember * ptr ;
13091389 LWLock * lock ;
1310- bool slept = false;
13111390
13121391 debug_elog3 (DEBUG2 , "GetMembers: asked for %u" , multi );
13131392
@@ -1384,23 +1463,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13841463 * one's. However, there are some corner cases to worry about:
13851464 *
13861465 * 1. This multixact may be the latest one created, in which case there is
1387- * no next one to look at. In this case the nextOffset value we just
1388- * saved is the correct endpoint.
1466+ * no next one to look at. The next multixact's offset should be set
1467+ * already, as we set it in RecordNewMultiXact(), but we used to not do
1468+ * that in older minor versions. To cope with that case, if this
1469+ * multixact is the latest one created, use the nextOffset value we read
1470+ * above as the endpoint.
13891471 *
1390- * 2. The next multixact may still be in process of being filled in: that
1391- * is, another process may have done GetNewMultiXactId but not yet written
1392- * the offset entry for that ID. In that scenario, it is guaranteed that
1393- * the offset entry for that multixact exists (because GetNewMultiXactId
1394- * won't release MultiXactGenLock until it does) but contains zero
1395- * (because we are careful to pre-zero offset pages). Because
1396- * GetNewMultiXactId will never return zero as the starting offset for a
1397- * multixact, when we read zero as the next multixact's offset, we know we
1398- * have this case. We handle this by sleeping on the condition variable
1399- * we have just for this; the process in charge will signal the CV as soon
1400- * as it has finished writing the multixact offset.
1401- *
1402- * 3. Because GetNewMultiXactId increments offset zero to offset one to
1403- * handle case #2, there is an ambiguity near the point of offset
1472+ * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
1473+ * for to mean "unset", there is an ambiguity near the point of offset
14041474 * wraparound. If we see next multixact's offset is one, is that our
14051475 * multixact's actual endpoint, or did it end at zero with a subsequent
14061476 * increment? We handle this using the knowledge that if the zero'th
@@ -1412,7 +1482,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14121482 * cases, so it seems better than holding the MultiXactGenLock for a long
14131483 * time on every multixact creation.
14141484 */
1415- retry :
14161485 pageno = MultiXactIdToOffsetPage (multi );
14171486 entryno = MultiXactIdToOffsetEntry (multi );
14181487
@@ -1475,29 +1544,17 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14751544 nextMXOffset = * offptr ;
14761545
14771546 if (nextMXOffset == 0 )
1478- {
1479- /* Corner case 2: next multixact is still being filled in */
1480- LWLockRelease (lock );
1481- CHECK_FOR_INTERRUPTS ();
1482-
1483- ConditionVariableSleep (& MultiXactState -> nextoff_cv ,
1484- WAIT_EVENT_MULTIXACT_CREATION );
1485- slept = true;
1486- goto retry ;
1487- }
1547+ ereport (ERROR ,
1548+ (errcode (ERRCODE_DATA_CORRUPTED ),
1549+ errmsg ("MultiXact %u has invalid next offset" ,
1550+ multi )));
14881551
14891552 length = nextMXOffset - offset ;
14901553 }
14911554
14921555 LWLockRelease (lock );
14931556 lock = NULL ;
14941557
1495- /*
1496- * If we slept above, clean up state; it's no longer needed.
1497- */
1498- if (slept )
1499- ConditionVariableCancelSleep ();
1500-
15011558 ptr = (MultiXactMember * ) palloc (length * sizeof (MultiXactMember ));
15021559
15031560 truelength = 0 ;
@@ -1540,7 +1597,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
15401597
15411598 if (!TransactionIdIsValid (* xactptr ))
15421599 {
1543- /* Corner case 3 : we must be looking at unused slot zero */
1600+ /* Corner case 2 : we must be looking at unused slot zero */
15441601 Assert (offset == 0 );
15451602 continue ;
15461603 }
@@ -1987,7 +2044,6 @@ MultiXactShmemInit(void)
19872044
19882045 /* Make sure we zero out the per-backend state */
19892046 MemSet (MultiXactState , 0 , SHARED_MULTIXACT_STATE_SIZE );
1990- ConditionVariableInit (& MultiXactState -> nextoff_cv );
19912047 }
19922048 else
19932049 Assert (found );
@@ -2194,26 +2250,34 @@ TrimMultiXact(void)
21942250 pageno );
21952251
21962252 /*
2197- * Zero out the remainder of the current offsets page. See notes in
2198- * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2199- * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2200- * rule "write xlog before data," nextMXact successors may carry obsolete,
2201- * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2202- * operates normally.
2253+ * Set the offset of nextMXact on the offsets page. This is normally done
2254+ * in RecordNewMultiXact() of the previous multixact, but we used to not
2255+ * do that in older minor versions. To ensure that the next offset is set
2256+ * if the binary was just upgraded from an older minor version, do it now.
2257+ *
2258+ * Zero out the remainder of the page. See notes in TrimCLOG() for
2259+ * background. Unlike CLOG, some WAL record covers every pg_multixact
2260+ * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2261+ * xlog before data," nextMXact successors may carry obsolete, nonzero
2262+ * offset values.
22032263 */
22042264 entryno = MultiXactIdToOffsetEntry (nextMXact );
2205- if (entryno != 0 )
22062265 {
22072266 int slotno ;
22082267 MultiXactOffset * offptr ;
22092268 LWLock * lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
22102269
22112270 LWLockAcquire (lock , LW_EXCLUSIVE );
2212- slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
2271+ if (entryno == 0 )
2272+ slotno = SimpleLruZeroPage (MultiXactOffsetCtl , pageno );
2273+ else
2274+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
22132275 offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
22142276 offptr += entryno ;
22152277
2216- MemSet (offptr , 0 , BLCKSZ - (entryno * sizeof (MultiXactOffset )));
2278+ * offptr = offset ;
2279+ if (entryno != 0 && (entryno + 1 ) * sizeof (MultiXactOffset ) != BLCKSZ )
2280+ MemSet (offptr + 1 , 0 , BLCKSZ - (entryno + 1 ) * sizeof (MultiXactOffset ));
22172281
22182282 MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
22192283 LWLockRelease (lock );
@@ -3398,14 +3462,24 @@ multixact_redo(XLogReaderState *record)
33983462
33993463 memcpy (& pageno , XLogRecGetData (record ), sizeof (pageno ));
34003464
3401- lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
3402- LWLockAcquire (lock , LW_EXCLUSIVE );
3465+ /*
3466+ * Skip the record if we already initialized the page at the previous
3467+ * XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact().
3468+ */
3469+ if (pre_initialized_offsets_page != pageno )
3470+ {
3471+ lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
3472+ LWLockAcquire (lock , LW_EXCLUSIVE );
34033473
3404- slotno = ZeroMultiXactOffsetPage (pageno , false);
3405- SimpleLruWritePage (MultiXactOffsetCtl , slotno );
3406- Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
3474+ slotno = ZeroMultiXactOffsetPage (pageno , false);
3475+ SimpleLruWritePage (MultiXactOffsetCtl , slotno );
3476+ Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
34073477
3408- LWLockRelease (lock );
3478+ LWLockRelease (lock );
3479+ }
3480+ else
3481+ elog (DEBUG1 , "skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation" , pageno );
3482+ pre_initialized_offsets_page = -1 ;
34093483 }
34103484 else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE )
34113485 {
@@ -3431,6 +3505,22 @@ multixact_redo(XLogReaderState *record)
34313505 TransactionId max_xid ;
34323506 int i ;
34333507
3508+ if (pre_initialized_offsets_page != -1 )
3509+ {
3510+ /*
3511+ * If we implicitly initialized the next offsets page while
3512+ * replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
3513+ * with an older minor version, we still expect to see an
3514+ * XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
3515+ * XLOG_MULTIXACT_CREATE_ID records. Therefore this case should
3516+ * not happen. If it does, we'll continue with the replay, but
3517+ * log a message to note that something's funny.
3518+ */
3519+ elog (LOG , "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page " INT64_FORMAT " that was implicitly initialized earlier" ,
3520+ pre_initialized_offsets_page );
3521+ pre_initialized_offsets_page = -1 ;
3522+ }
3523+
34343524 /* Store the data back into the SLRU files */
34353525 RecordNewMultiXact (xlrec -> mid , xlrec -> moff , xlrec -> nmembers ,
34363526 xlrec -> members );
0 commit comments