Skip to content

Commit 6351669

Browse files
committed
Set next multixid's offset when creating a new multixid
With this commit, the next multixid's offset will always be set on the offsets page, by the time that a backend might try to read it, so we no longer need the waiting mechanism with the condition variable. In other words, this eliminates "corner case 2" mentioned in the comments. The waiting mechanism was broken in a few scenarios: - When nextMulti was advanced without WAL-logging the next multixid. For example, if a later multixid was already assigned and WAL-logged before the previous one was WAL-logged, and then the server crashed. In that case the next offset would never be set in the offsets SLRU, and a query trying to read it would get stuck waiting for it. Same thing could happen if pg_resetwal was used to forcibly advance nextMulti. - In hot standby mode, a deadlock could happen where one backend waits for the next multixid assignment record, but WAL replay is not advancing because of a recovery conflict with the waiting backend. The old TAP test used carefully placed injection points to exercise the old waiting code, but now that the waiting code is gone, much of the old test is no longer relevant. Rewrite the test to reproduce the IPC/MultixactCreation hang after crash recovery instead, and to verify that previously recorded multixids stay readable. Backpatch to all supported versions. In back-branches, we still need to be able to read WAL that was generated before this fix, so in the back-branches this includes a hack to initialize the next offsets page when replaying XLOG_MULTIXACT_CREATE_ID for the last multixid on a page. On 'master', bump XLOG_PAGE_MAGIC instead to indicate that the WAL is not compatible. Author: Andrey Borodin <amborodin@acm.org> Reviewed-by: Dmitry Yurichev <dsy.075@yandex.ru> Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de> Reviewed-by: Kirill Reshke <reshkekirill@gmail.com> Reviewed-by: Ivan Bykov <i.bykov@modernsys.ru> Reviewed-by: Chao Li <li.evan.chao@gmail.com> Discussion: https://www.postgresql.org/message-id/172e5723-d65f-4eec-b512-14beacb326ce@yandex.ru Backpatch-through: 14
1 parent 1829016 commit 6351669

File tree

1 file changed

+148
-45
lines changed

1 file changed

+148
-45
lines changed

src/backend/access/transam/multixact.c

Lines changed: 148 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,9 @@ static MemoryContext MXactContext = NULL;
338338
#define debug_elog6(a,b,c,d,e,f)
339339
#endif
340340

341+
/* hack to deal with WAL generated with older minor versions */
342+
static int pre_initialized_offsets_page = -1;
343+
341344
/* internal MultiXactId management */
342345
static void MultiXactIdSetOldestVisible(void);
343346
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
@@ -869,13 +872,61 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
869872
int entryno;
870873
int slotno;
871874
MultiXactOffset *offptr;
872-
int i;
875+
MultiXactId next;
876+
int next_pageno;
877+
int next_entryno;
878+
MultiXactOffset *next_offptr;
873879

874880
LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
875881

882+
/* position of this multixid in the offsets SLRU area */
876883
pageno = MultiXactIdToOffsetPage(multi);
877884
entryno = MultiXactIdToOffsetEntry(multi);
878885

886+
/* position of the next multixid */
887+
next = multi + 1;
888+
if (next < FirstMultiXactId)
889+
next = FirstMultiXactId;
890+
next_pageno = MultiXactIdToOffsetPage(next);
891+
next_entryno = MultiXactIdToOffsetEntry(next);
892+
893+
/*
894+
* Older minor versions didn't set the next multixid's offset in this
895+
* function, and therefore didn't initialize the next page until the next
896+
* multixid was assigned. If we're replaying WAL that was generated by
897+
* such a version, the next page might not be initialized yet. Initialize
898+
* it now.
899+
*/
900+
if (InRecovery &&
901+
next_pageno != pageno &&
902+
MultiXactOffsetCtl->shared->latest_page_number == pageno)
903+
{
904+
elog(DEBUG1, "next offsets page is not initialized, initializing it now");
905+
906+
/* Create and zero the page */
907+
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
908+
909+
/* Make sure it's written out */
910+
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
911+
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
912+
913+
/*
914+
* Remember that we initialized the page, so that we don't zero it
915+
* again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
916+
*/
917+
pre_initialized_offsets_page = next_pageno;
918+
}
919+
920+
/*
921+
* Set the starting offset of this multixid's members.
922+
*
923+
* In the common case, it was already be set by the previous
924+
* RecordNewMultiXact call, as this was the next multixid of the previous
925+
* multixid. But if multiple backends are generating multixids
926+
* concurrently, we might race ahead and get called before the previous
927+
* multixid.
928+
*/
929+
879930
/*
880931
* Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
881932
* to complain about if there's any I/O error. This is kinda bogus, but
@@ -887,9 +938,37 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
887938
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
888939
offptr += entryno;
889940

890-
*offptr = offset;
941+
if (*offptr != offset)
942+
{
943+
/* should already be set to the correct value, or not at all */
944+
Assert(*offptr == 0);
945+
*offptr = offset;
946+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
947+
}
891948

892-
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
949+
/*
950+
* Set the next multixid's offset to the end of this multixid's members.
951+
*/
952+
if (next_pageno == pageno)
953+
{
954+
next_offptr = offptr + 1;
955+
}
956+
else
957+
{
958+
/* must be the first entry on the page */
959+
Assert(next_entryno == 0 || next == FirstMultiXactId);
960+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
961+
next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
962+
next_offptr += next_entryno;
963+
}
964+
965+
if (*next_offptr != offset + nmembers)
966+
{
967+
/* should already be set to the correct value, or not at all */
968+
Assert(*next_offptr == 0);
969+
*next_offptr = offset + nmembers;
970+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
971+
}
893972

894973
/* Exchange our lock */
895974
LWLockRelease(MultiXactOffsetSLRULock);
@@ -898,7 +977,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
898977

899978
prev_pageno = -1;
900979

901-
for (i = 0; i < nmembers; i++, offset++)
980+
for (int i = 0; i < nmembers; i++, offset++)
902981
{
903982
TransactionId *memberptr;
904983
uint32 *flagsptr;
@@ -1073,8 +1152,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
10731152
result = FirstMultiXactId;
10741153
}
10751154

1076-
/* Make sure there is room for the MXID in the file. */
1077-
ExtendMultiXactOffset(result);
1155+
/*
1156+
* Make sure there is room for the next MXID in the file. Assigning this
1157+
* MXID sets the next MXID's offset already.
1158+
*/
1159+
ExtendMultiXactOffset(result + 1);
10781160

10791161
/*
10801162
* Reserve the members space, similarly to above. Also, be careful not to
@@ -1315,21 +1397,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13151397
* one's. However, there are some corner cases to worry about:
13161398
*
13171399
* 1. This multixact may be the latest one created, in which case there is
1318-
* no next one to look at. In this case the nextOffset value we just
1319-
* saved is the correct endpoint.
1320-
*
1321-
* 2. The next multixact may still be in process of being filled in: that
1322-
* is, another process may have done GetNewMultiXactId but not yet written
1323-
* the offset entry for that ID. In that scenario, it is guaranteed that
1324-
* the offset entry for that multixact exists (because GetNewMultiXactId
1325-
* won't release MultiXactGenLock until it does) but contains zero
1326-
* (because we are careful to pre-zero offset pages). Because
1327-
* GetNewMultiXactId will never return zero as the starting offset for a
1328-
* multixact, when we read zero as the next multixact's offset, we know we
1329-
* have this case. We sleep for a bit and try again.
1400+
* no next one to look at. The next multixact's offset should be set
1401+
* already, as we set it in RecordNewMultiXact(), but we used to not do
1402+
* that in older minor versions. To cope with that case, if this
1403+
* multixact is the latest one created, use the nextOffset value we read
1404+
* above as the endpoint.
13301405
*
1331-
* 3. Because GetNewMultiXactId increments offset zero to offset one to
1332-
* handle case #2, there is an ambiguity near the point of offset
1406+
* 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
1407+
* for to mean "unset", there is an ambiguity near the point of offset
13331408
* wraparound. If we see next multixact's offset is one, is that our
13341409
* multixact's actual endpoint, or did it end at zero with a subsequent
13351410
* increment? We handle this using the knowledge that if the zero'th
@@ -1341,7 +1416,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13411416
* cases, so it seems better than holding the MultiXactGenLock for a long
13421417
* time on every multixact creation.
13431418
*/
1344-
retry:
13451419
LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
13461420

13471421
pageno = MultiXactIdToOffsetPage(multi);
@@ -1386,13 +1460,10 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13861460
nextMXOffset = *offptr;
13871461

13881462
if (nextMXOffset == 0)
1389-
{
1390-
/* Corner case 2: next multixact is still being filled in */
1391-
LWLockRelease(MultiXactOffsetSLRULock);
1392-
CHECK_FOR_INTERRUPTS();
1393-
pg_usleep(1000L);
1394-
goto retry;
1395-
}
1463+
ereport(ERROR,
1464+
(errcode(ERRCODE_DATA_CORRUPTED),
1465+
errmsg("MultiXact %u has invalid next offset",
1466+
multi)));
13961467

13971468
length = nextMXOffset - offset;
13981469
}
@@ -1428,7 +1499,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14281499

14291500
if (!TransactionIdIsValid(*xactptr))
14301501
{
1431-
/* Corner case 3: we must be looking at unused slot zero */
1502+
/* Corner case 2: we must be looking at unused slot zero */
14321503
Assert(offset == 0);
14331504
continue;
14341505
}
@@ -2055,24 +2126,32 @@ TrimMultiXact(void)
20552126
MultiXactOffsetCtl->shared->latest_page_number = pageno;
20562127

20572128
/*
2058-
* Zero out the remainder of the current offsets page. See notes in
2059-
* TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2060-
* pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2061-
* rule "write xlog before data," nextMXact successors may carry obsolete,
2062-
* nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2063-
* operates normally.
2129+
* Set the offset of nextMXact on the offsets page. This is normally done
2130+
* in RecordNewMultiXact() of the previous multixact, but we used to not
2131+
* do that in older minor versions. To ensure that the next offset is set
2132+
* if the binary was just upgraded from an older minor version, do it now.
2133+
*
2134+
* Zero out the remainder of the page. See notes in TrimCLOG() for
2135+
* background. Unlike CLOG, some WAL record covers every pg_multixact
2136+
* SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2137+
* xlog before data," nextMXact successors may carry obsolete, nonzero
2138+
* offset values.
20642139
*/
20652140
entryno = MultiXactIdToOffsetEntry(nextMXact);
2066-
if (entryno != 0)
20672141
{
20682142
int slotno;
20692143
MultiXactOffset *offptr;
20702144

2071-
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2145+
if (entryno == 0)
2146+
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2147+
else
2148+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
20722149
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
20732150
offptr += entryno;
20742151

2075-
MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2152+
*offptr = offset;
2153+
if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
2154+
MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
20762155

20772156
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
20782157
}
@@ -3251,13 +3330,21 @@ multixact_redo(XLogReaderState *record)
32513330

32523331
memcpy(&pageno, XLogRecGetData(record), sizeof(int));
32533332

3254-
LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
3255-
3256-
slotno = ZeroMultiXactOffsetPage(pageno, false);
3257-
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3258-
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3259-
3260-
LWLockRelease(MultiXactOffsetSLRULock);
3333+
/*
3334+
* Skip the record if we already initialized the page at the previous
3335+
* XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact().
3336+
*/
3337+
if (pre_initialized_offsets_page != pageno)
3338+
{
3339+
LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE);
3340+
slotno = ZeroMultiXactOffsetPage(pageno, false);
3341+
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3342+
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3343+
LWLockRelease(MultiXactOffsetSLRULock);
3344+
}
3345+
else
3346+
elog(DEBUG1, "skipping initialization of offsets page %d because it was already initialized on multixid creation", pageno);
3347+
pre_initialized_offsets_page = -1;
32613348
}
32623349
else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
32633350
{
@@ -3281,6 +3368,22 @@ multixact_redo(XLogReaderState *record)
32813368
TransactionId max_xid;
32823369
int i;
32833370

3371+
if (pre_initialized_offsets_page != -1)
3372+
{
3373+
/*
3374+
* If we implicitly initialized the next offsets page while
3375+
* replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
3376+
* with an older minor version, we still expect to see an
3377+
* XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
3378+
* XLOG_MULTIXACT_CREATE_ID records. Therefore this case should
3379+
* not happen. If it does, we'll continue with the replay, but
3380+
* log a message to note that something's funny.
3381+
*/
3382+
elog(LOG, "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page %d that was implicitly initialized earlier",
3383+
pre_initialized_offsets_page);
3384+
pre_initialized_offsets_page = -1;
3385+
}
3386+
32843387
/* Store the data back into the SLRU files */
32853388
RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
32863389
xlrec->members);

0 commit comments

Comments
 (0)