@@ -338,6 +338,9 @@ static MemoryContext MXactContext = NULL;
338338#define debug_elog6 (a ,b ,c ,d ,e ,f )
339339#endif
340340
341+ /* hack to deal with WAL generated with older minor versions */
342+ static int pre_initialized_offsets_page = -1 ;
343+
341344/* internal MultiXactId management */
342345static void MultiXactIdSetOldestVisible (void );
343346static void RecordNewMultiXact (MultiXactId multi , MultiXactOffset offset ,
@@ -869,13 +872,61 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
869872 int entryno ;
870873 int slotno ;
871874 MultiXactOffset * offptr ;
872- int i ;
875+ MultiXactId next ;
876+ int next_pageno ;
877+ int next_entryno ;
878+ MultiXactOffset * next_offptr ;
873879
874880 LWLockAcquire (MultiXactOffsetSLRULock , LW_EXCLUSIVE );
875881
882+ /* position of this multixid in the offsets SLRU area */
876883 pageno = MultiXactIdToOffsetPage (multi );
877884 entryno = MultiXactIdToOffsetEntry (multi );
878885
886+ /* position of the next multixid */
887+ next = multi + 1 ;
888+ if (next < FirstMultiXactId )
889+ next = FirstMultiXactId ;
890+ next_pageno = MultiXactIdToOffsetPage (next );
891+ next_entryno = MultiXactIdToOffsetEntry (next );
892+
893+ /*
894+ * Older minor versions didn't set the next multixid's offset in this
895+ * function, and therefore didn't initialize the next page until the next
896+ * multixid was assigned. If we're replaying WAL that was generated by
897+ * such a version, the next page might not be initialized yet. Initialize
898+ * it now.
899+ */
900+ if (InRecovery &&
901+ next_pageno != pageno &&
902+ MultiXactOffsetCtl -> shared -> latest_page_number == pageno )
903+ {
904+ elog (DEBUG1 , "next offsets page is not initialized, initializing it now" );
905+
906+ /* Create and zero the page */
907+ slotno = SimpleLruZeroPage (MultiXactOffsetCtl , next_pageno );
908+
909+ /* Make sure it's written out */
910+ SimpleLruWritePage (MultiXactOffsetCtl , slotno );
911+ Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
912+
913+ /*
914+ * Remember that we initialized the page, so that we don't zero it
915+ * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
916+ */
917+ pre_initialized_offsets_page = next_pageno ;
918+ }
919+
920+ /*
921+ * Set the starting offset of this multixid's members.
922+ *
923+ * In the common case, it was already be set by the previous
924+ * RecordNewMultiXact call, as this was the next multixid of the previous
925+ * multixid. But if multiple backends are generating multixids
926+ * concurrently, we might race ahead and get called before the previous
927+ * multixid.
928+ */
929+
879930 /*
880931 * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
881932 * to complain about if there's any I/O error. This is kinda bogus, but
@@ -887,9 +938,37 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
887938 offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
888939 offptr += entryno ;
889940
890- * offptr = offset ;
941+ if (* offptr != offset )
942+ {
943+ /* should already be set to the correct value, or not at all */
944+ Assert (* offptr == 0 );
945+ * offptr = offset ;
946+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
947+ }
891948
892- MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
949+ /*
950+ * Set the next multixid's offset to the end of this multixid's members.
951+ */
952+ if (next_pageno == pageno )
953+ {
954+ next_offptr = offptr + 1 ;
955+ }
956+ else
957+ {
958+ /* must be the first entry on the page */
959+ Assert (next_entryno == 0 || next == FirstMultiXactId );
960+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , next_pageno , true, next );
961+ next_offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
962+ next_offptr += next_entryno ;
963+ }
964+
965+ if (* next_offptr != offset + nmembers )
966+ {
967+ /* should already be set to the correct value, or not at all */
968+ Assert (* next_offptr == 0 );
969+ * next_offptr = offset + nmembers ;
970+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
971+ }
893972
894973 /* Exchange our lock */
895974 LWLockRelease (MultiXactOffsetSLRULock );
@@ -898,7 +977,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
898977
899978 prev_pageno = -1 ;
900979
901- for (i = 0 ; i < nmembers ; i ++ , offset ++ )
980+ for (int i = 0 ; i < nmembers ; i ++ , offset ++ )
902981 {
903982 TransactionId * memberptr ;
904983 uint32 * flagsptr ;
@@ -1073,8 +1152,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
10731152 result = FirstMultiXactId ;
10741153 }
10751154
1076- /* Make sure there is room for the MXID in the file. */
1077- ExtendMultiXactOffset (result );
1155+ /*
1156+ * Make sure there is room for the next MXID in the file. Assigning this
1157+ * MXID sets the next MXID's offset already.
1158+ */
1159+ ExtendMultiXactOffset (result + 1 );
10781160
10791161 /*
10801162 * Reserve the members space, similarly to above. Also, be careful not to
@@ -1315,21 +1397,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13151397 * one's. However, there are some corner cases to worry about:
13161398 *
13171399 * 1. This multixact may be the latest one created, in which case there is
1318- * no next one to look at. In this case the nextOffset value we just
1319- * saved is the correct endpoint.
1320- *
1321- * 2. The next multixact may still be in process of being filled in: that
1322- * is, another process may have done GetNewMultiXactId but not yet written
1323- * the offset entry for that ID. In that scenario, it is guaranteed that
1324- * the offset entry for that multixact exists (because GetNewMultiXactId
1325- * won't release MultiXactGenLock until it does) but contains zero
1326- * (because we are careful to pre-zero offset pages). Because
1327- * GetNewMultiXactId will never return zero as the starting offset for a
1328- * multixact, when we read zero as the next multixact's offset, we know we
1329- * have this case. We sleep for a bit and try again.
1400+ * no next one to look at. The next multixact's offset should be set
1401+ * already, as we set it in RecordNewMultiXact(), but we used to not do
1402+ * that in older minor versions. To cope with that case, if this
1403+ * multixact is the latest one created, use the nextOffset value we read
1404+ * above as the endpoint.
13301405 *
1331- * 3 . Because GetNewMultiXactId increments offset zero to offset one to
1332- * handle case #2 , there is an ambiguity near the point of offset
1406+ * 2 . Because GetNewMultiXactId skips over offset zero, to reserve zero
1407+ * for to mean "unset" , there is an ambiguity near the point of offset
13331408 * wraparound. If we see next multixact's offset is one, is that our
13341409 * multixact's actual endpoint, or did it end at zero with a subsequent
13351410 * increment? We handle this using the knowledge that if the zero'th
@@ -1341,7 +1416,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13411416 * cases, so it seems better than holding the MultiXactGenLock for a long
13421417 * time on every multixact creation.
13431418 */
1344- retry :
13451419 LWLockAcquire (MultiXactOffsetSLRULock , LW_EXCLUSIVE );
13461420
13471421 pageno = MultiXactIdToOffsetPage (multi );
@@ -1386,13 +1460,10 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13861460 nextMXOffset = * offptr ;
13871461
13881462 if (nextMXOffset == 0 )
1389- {
1390- /* Corner case 2: next multixact is still being filled in */
1391- LWLockRelease (MultiXactOffsetSLRULock );
1392- CHECK_FOR_INTERRUPTS ();
1393- pg_usleep (1000L );
1394- goto retry ;
1395- }
1463+ ereport (ERROR ,
1464+ (errcode (ERRCODE_DATA_CORRUPTED ),
1465+ errmsg ("MultiXact %u has invalid next offset" ,
1466+ multi )));
13961467
13971468 length = nextMXOffset - offset ;
13981469 }
@@ -1428,7 +1499,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14281499
14291500 if (!TransactionIdIsValid (* xactptr ))
14301501 {
1431- /* Corner case 3 : we must be looking at unused slot zero */
1502+ /* Corner case 2 : we must be looking at unused slot zero */
14321503 Assert (offset == 0 );
14331504 continue ;
14341505 }
@@ -2055,24 +2126,32 @@ TrimMultiXact(void)
20552126 MultiXactOffsetCtl -> shared -> latest_page_number = pageno ;
20562127
20572128 /*
2058- * Zero out the remainder of the current offsets page. See notes in
2059- * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2060- * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2061- * rule "write xlog before data," nextMXact successors may carry obsolete,
2062- * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2063- * operates normally.
2129+ * Set the offset of nextMXact on the offsets page. This is normally done
2130+ * in RecordNewMultiXact() of the previous multixact, but we used to not
2131+ * do that in older minor versions. To ensure that the next offset is set
2132+ * if the binary was just upgraded from an older minor version, do it now.
2133+ *
2134+ * Zero out the remainder of the page. See notes in TrimCLOG() for
2135+ * background. Unlike CLOG, some WAL record covers every pg_multixact
2136+ * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2137+ * xlog before data," nextMXact successors may carry obsolete, nonzero
2138+ * offset values.
20642139 */
20652140 entryno = MultiXactIdToOffsetEntry (nextMXact );
2066- if (entryno != 0 )
20672141 {
20682142 int slotno ;
20692143 MultiXactOffset * offptr ;
20702144
2071- slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
2145+ if (entryno == 0 )
2146+ slotno = SimpleLruZeroPage (MultiXactOffsetCtl , pageno );
2147+ else
2148+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
20722149 offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
20732150 offptr += entryno ;
20742151
2075- MemSet (offptr , 0 , BLCKSZ - (entryno * sizeof (MultiXactOffset )));
2152+ * offptr = offset ;
2153+ if (entryno != 0 && (entryno + 1 ) * sizeof (MultiXactOffset ) != BLCKSZ )
2154+ MemSet (offptr + 1 , 0 , BLCKSZ - (entryno + 1 ) * sizeof (MultiXactOffset ));
20762155
20772156 MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
20782157 }
@@ -3251,13 +3330,21 @@ multixact_redo(XLogReaderState *record)
32513330
32523331 memcpy (& pageno , XLogRecGetData (record ), sizeof (int ));
32533332
3254- LWLockAcquire (MultiXactOffsetSLRULock , LW_EXCLUSIVE );
3255-
3256- slotno = ZeroMultiXactOffsetPage (pageno , false);
3257- SimpleLruWritePage (MultiXactOffsetCtl , slotno );
3258- Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
3259-
3260- LWLockRelease (MultiXactOffsetSLRULock );
3333+ /*
3334+ * Skip the record if we already initialized the page at the previous
3335+ * XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact().
3336+ */
3337+ if (pre_initialized_offsets_page != pageno )
3338+ {
3339+ LWLockAcquire (MultiXactOffsetSLRULock , LW_EXCLUSIVE );
3340+ slotno = ZeroMultiXactOffsetPage (pageno , false);
3341+ SimpleLruWritePage (MultiXactOffsetCtl , slotno );
3342+ Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
3343+ LWLockRelease (MultiXactOffsetSLRULock );
3344+ }
3345+ else
3346+ elog (DEBUG1 , "skipping initialization of offsets page %d because it was already initialized on multixid creation" , pageno );
3347+ pre_initialized_offsets_page = -1 ;
32613348 }
32623349 else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE )
32633350 {
@@ -3281,6 +3368,22 @@ multixact_redo(XLogReaderState *record)
32813368 TransactionId max_xid ;
32823369 int i ;
32833370
3371+ if (pre_initialized_offsets_page != -1 )
3372+ {
3373+ /*
3374+ * If we implicitly initialized the next offsets page while
3375+ * replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
3376+ * with an older minor version, we still expect to see an
3377+ * XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
3378+ * XLOG_MULTIXACT_CREATE_ID records. Therefore this case should
3379+ * not happen. If it does, we'll continue with the replay, but
3380+ * log a message to note that something's funny.
3381+ */
3382+ elog (LOG , "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page %d that was implicitly initialized earlier" ,
3383+ pre_initialized_offsets_page );
3384+ pre_initialized_offsets_page = -1 ;
3385+ }
3386+
32843387 /* Store the data back into the SLRU files */
32853388 RecordNewMultiXact (xlrec -> mid , xlrec -> moff , xlrec -> nmembers ,
32863389 xlrec -> members );
0 commit comments