Skip to content

Commit 8ba61bc

Browse files
committed
Set next multixid's offset when creating a new multixid
With this commit, the next multixid's offset will always be set on the offsets page, by the time that a backend might try to read it, so we no longer need the waiting mechanism with the condition variable. In other words, this eliminates "corner case 2" mentioned in the comments. The waiting mechanism was broken in a few scenarios: - When nextMulti was advanced without WAL-logging the next multixid. For example, if a later multixid was already assigned and WAL-logged before the previous one was WAL-logged, and then the server crashed. In that case the next offset would never be set in the offsets SLRU, and a query trying to read it would get stuck waiting for it. Same thing could happen if pg_resetwal was used to forcibly advance nextMulti. - In hot standby mode, a deadlock could happen where one backend waits for the next multixid assignment record, but WAL replay is not advancing because of a recovery conflict with the waiting backend. The old TAP test used carefully placed injection points to exercise the old waiting code, but now that the waiting code is gone, much of the old test is no longer relevant. Rewrite the test to reproduce the IPC/MultixactCreation hang after crash recovery instead, and to verify that previously recorded multixids stay readable. Backpatch to all supported versions. In back-branches, we still need to be able to read WAL that was generated before this fix, so in the back-branches this includes a hack to initialize the next offsets page when replaying XLOG_MULTIXACT_CREATE_ID for the last multixid on a page. On 'master', bump XLOG_PAGE_MAGIC instead to indicate that the WAL is not compatible. Author: Andrey Borodin <amborodin@acm.org> Reviewed-by: Dmitry Yurichev <dsy.075@yandex.ru> Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de> Reviewed-by: Kirill Reshke <reshkekirill@gmail.com> Reviewed-by: Ivan Bykov <i.bykov@modernsys.ru> Reviewed-by: Chao Li <li.evan.chao@gmail.com> Discussion: https://www.postgresql.org/message-id/172e5723-d65f-4eec-b512-14beacb326ce@yandex.ru Backpatch-through: 14
1 parent e8ae594 commit 8ba61bc

File tree

1 file changed

+159
-69
lines changed

1 file changed

+159
-69
lines changed

src/backend/access/transam/multixact.c

Lines changed: 159 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -274,12 +274,6 @@ typedef struct MultiXactStateData
274274
/* support for members anti-wraparound measures */
275275
MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
276276

277-
/*
278-
* This is used to sleep until a multixact offset is written when we want
279-
* to create the next one.
280-
*/
281-
ConditionVariable nextoff_cv;
282-
283277
/*
284278
* Per-backend data starts here. We have two arrays stored in the area
285279
* immediately following the MultiXactStateData struct. Each is indexed by
@@ -384,6 +378,9 @@ static MemoryContext MXactContext = NULL;
384378
#define debug_elog6(a,b,c,d,e,f)
385379
#endif
386380

381+
/* hack to deal with WAL generated with older minor versions */
382+
static int64 pre_initialized_offsets_page = -1;
383+
387384
/* internal MultiXactId management */
388385
static void MultiXactIdSetOldestVisible(void);
389386
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
@@ -915,13 +912,65 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
915912
int entryno;
916913
int slotno;
917914
MultiXactOffset *offptr;
918-
int i;
915+
MultiXactId next;
916+
int64 next_pageno;
917+
int next_entryno;
918+
MultiXactOffset *next_offptr;
919919
LWLock *lock;
920920
LWLock *prevlock = NULL;
921921

922+
/* position of this multixid in the offsets SLRU area */
922923
pageno = MultiXactIdToOffsetPage(multi);
923924
entryno = MultiXactIdToOffsetEntry(multi);
924925

926+
/* position of the next multixid */
927+
next = multi + 1;
928+
if (next < FirstMultiXactId)
929+
next = FirstMultiXactId;
930+
next_pageno = MultiXactIdToOffsetPage(next);
931+
next_entryno = MultiXactIdToOffsetEntry(next);
932+
933+
/*
934+
* Older minor versions didn't set the next multixid's offset in this
935+
* function, and therefore didn't initialize the next page until the next
936+
* multixid was assigned. If we're replaying WAL that was generated by
937+
* such a version, the next page might not be initialized yet. Initialize
938+
* it now.
939+
*/
940+
if (InRecovery &&
941+
next_pageno != pageno &&
942+
pg_atomic_read_u64(&MultiXactOffsetCtl->shared->latest_page_number) == pageno)
943+
{
944+
elog(DEBUG1, "next offsets page is not initialized, initializing it now");
945+
946+
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
947+
LWLockAcquire(lock, LW_EXCLUSIVE);
948+
949+
/* Create and zero the page */
950+
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
951+
952+
/* Make sure it's written out */
953+
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
954+
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
955+
956+
LWLockRelease(lock);
957+
958+
/*
959+
* Remember that we initialized the page, so that we don't zero it
960+
* again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
961+
*/
962+
pre_initialized_offsets_page = next_pageno;
963+
}
964+
965+
/*
966+
* Set the starting offset of this multixid's members.
967+
*
968+
* In the common case, it was already be set by the previous
969+
* RecordNewMultiXact call, as this was the next multixid of the previous
970+
* multixid. But if multiple backends are generating multixids
971+
* concurrently, we might race ahead and get called before the previous
972+
* multixid.
973+
*/
925974
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
926975
LWLockAcquire(lock, LW_EXCLUSIVE);
927976

@@ -936,22 +985,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
936985
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
937986
offptr += entryno;
938987

939-
*offptr = offset;
988+
if (*offptr != offset)
989+
{
990+
/* should already be set to the correct value, or not at all */
991+
Assert(*offptr == 0);
992+
*offptr = offset;
993+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
994+
}
940995

941-
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
996+
/*
997+
* Set the next multixid's offset to the end of this multixid's members.
998+
*/
999+
if (next_pageno == pageno)
1000+
{
1001+
next_offptr = offptr + 1;
1002+
}
1003+
else
1004+
{
1005+
/* must be the first entry on the page */
1006+
Assert(next_entryno == 0 || next == FirstMultiXactId);
1007+
1008+
/* Swap the lock for a lock on the next page */
1009+
LWLockRelease(lock);
1010+
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
1011+
LWLockAcquire(lock, LW_EXCLUSIVE);
1012+
1013+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
1014+
next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1015+
next_offptr += next_entryno;
1016+
}
1017+
1018+
if (*next_offptr != offset + nmembers)
1019+
{
1020+
/* should already be set to the correct value, or not at all */
1021+
Assert(*next_offptr == 0);
1022+
*next_offptr = offset + nmembers;
1023+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
1024+
}
9421025

9431026
/* Release MultiXactOffset SLRU lock. */
9441027
LWLockRelease(lock);
9451028

946-
/*
947-
* If anybody was waiting to know the offset of this multixact ID we just
948-
* wrote, they can read it now, so wake them up.
949-
*/
950-
ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
951-
9521029
prev_pageno = -1;
9531030

954-
for (i = 0; i < nmembers; i++, offset++)
1031+
for (int i = 0; i < nmembers; i++, offset++)
9551032
{
9561033
TransactionId *memberptr;
9571034
uint32 *flagsptr;
@@ -1141,8 +1218,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
11411218
result = FirstMultiXactId;
11421219
}
11431220

1144-
/* Make sure there is room for the MXID in the file. */
1145-
ExtendMultiXactOffset(result);
1221+
/*
1222+
* Make sure there is room for the next MXID in the file. Assigning this
1223+
* MXID sets the next MXID's offset already.
1224+
*/
1225+
ExtendMultiXactOffset(result + 1);
11461226

11471227
/*
11481228
* Reserve the members space, similarly to above. Also, be careful not to
@@ -1307,7 +1387,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13071387
MultiXactOffset nextOffset;
13081388
MultiXactMember *ptr;
13091389
LWLock *lock;
1310-
bool slept = false;
13111390

13121391
debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
13131392

@@ -1384,23 +1463,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13841463
* one's. However, there are some corner cases to worry about:
13851464
*
13861465
* 1. This multixact may be the latest one created, in which case there is
1387-
* no next one to look at. In this case the nextOffset value we just
1388-
* saved is the correct endpoint.
1466+
* no next one to look at. The next multixact's offset should be set
1467+
* already, as we set it in RecordNewMultiXact(), but we used to not do
1468+
* that in older minor versions. To cope with that case, if this
1469+
* multixact is the latest one created, use the nextOffset value we read
1470+
* above as the endpoint.
13891471
*
1390-
* 2. The next multixact may still be in process of being filled in: that
1391-
* is, another process may have done GetNewMultiXactId but not yet written
1392-
* the offset entry for that ID. In that scenario, it is guaranteed that
1393-
* the offset entry for that multixact exists (because GetNewMultiXactId
1394-
* won't release MultiXactGenLock until it does) but contains zero
1395-
* (because we are careful to pre-zero offset pages). Because
1396-
* GetNewMultiXactId will never return zero as the starting offset for a
1397-
* multixact, when we read zero as the next multixact's offset, we know we
1398-
* have this case. We handle this by sleeping on the condition variable
1399-
* we have just for this; the process in charge will signal the CV as soon
1400-
* as it has finished writing the multixact offset.
1401-
*
1402-
* 3. Because GetNewMultiXactId increments offset zero to offset one to
1403-
* handle case #2, there is an ambiguity near the point of offset
1472+
* 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
1473+
* for to mean "unset", there is an ambiguity near the point of offset
14041474
* wraparound. If we see next multixact's offset is one, is that our
14051475
* multixact's actual endpoint, or did it end at zero with a subsequent
14061476
* increment? We handle this using the knowledge that if the zero'th
@@ -1412,7 +1482,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14121482
* cases, so it seems better than holding the MultiXactGenLock for a long
14131483
* time on every multixact creation.
14141484
*/
1415-
retry:
14161485
pageno = MultiXactIdToOffsetPage(multi);
14171486
entryno = MultiXactIdToOffsetEntry(multi);
14181487

@@ -1475,29 +1544,17 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14751544
nextMXOffset = *offptr;
14761545

14771546
if (nextMXOffset == 0)
1478-
{
1479-
/* Corner case 2: next multixact is still being filled in */
1480-
LWLockRelease(lock);
1481-
CHECK_FOR_INTERRUPTS();
1482-
1483-
ConditionVariableSleep(&MultiXactState->nextoff_cv,
1484-
WAIT_EVENT_MULTIXACT_CREATION);
1485-
slept = true;
1486-
goto retry;
1487-
}
1547+
ereport(ERROR,
1548+
(errcode(ERRCODE_DATA_CORRUPTED),
1549+
errmsg("MultiXact %u has invalid next offset",
1550+
multi)));
14881551

14891552
length = nextMXOffset - offset;
14901553
}
14911554

14921555
LWLockRelease(lock);
14931556
lock = NULL;
14941557

1495-
/*
1496-
* If we slept above, clean up state; it's no longer needed.
1497-
*/
1498-
if (slept)
1499-
ConditionVariableCancelSleep();
1500-
15011558
ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
15021559

15031560
truelength = 0;
@@ -1540,7 +1597,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
15401597

15411598
if (!TransactionIdIsValid(*xactptr))
15421599
{
1543-
/* Corner case 3: we must be looking at unused slot zero */
1600+
/* Corner case 2: we must be looking at unused slot zero */
15441601
Assert(offset == 0);
15451602
continue;
15461603
}
@@ -1987,7 +2044,6 @@ MultiXactShmemInit(void)
19872044

19882045
/* Make sure we zero out the per-backend state */
19892046
MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
1990-
ConditionVariableInit(&MultiXactState->nextoff_cv);
19912047
}
19922048
else
19932049
Assert(found);
@@ -2194,26 +2250,34 @@ TrimMultiXact(void)
21942250
pageno);
21952251

21962252
/*
2197-
* Zero out the remainder of the current offsets page. See notes in
2198-
* TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2199-
* pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2200-
* rule "write xlog before data," nextMXact successors may carry obsolete,
2201-
* nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2202-
* operates normally.
2253+
* Set the offset of nextMXact on the offsets page. This is normally done
2254+
* in RecordNewMultiXact() of the previous multixact, but we used to not
2255+
* do that in older minor versions. To ensure that the next offset is set
2256+
* if the binary was just upgraded from an older minor version, do it now.
2257+
*
2258+
* Zero out the remainder of the page. See notes in TrimCLOG() for
2259+
* background. Unlike CLOG, some WAL record covers every pg_multixact
2260+
* SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2261+
* xlog before data," nextMXact successors may carry obsolete, nonzero
2262+
* offset values.
22032263
*/
22042264
entryno = MultiXactIdToOffsetEntry(nextMXact);
2205-
if (entryno != 0)
22062265
{
22072266
int slotno;
22082267
MultiXactOffset *offptr;
22092268
LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
22102269

22112270
LWLockAcquire(lock, LW_EXCLUSIVE);
2212-
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2271+
if (entryno == 0)
2272+
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2273+
else
2274+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
22132275
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
22142276
offptr += entryno;
22152277

2216-
MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2278+
*offptr = offset;
2279+
if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
2280+
MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
22172281

22182282
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
22192283
LWLockRelease(lock);
@@ -3398,14 +3462,24 @@ multixact_redo(XLogReaderState *record)
33983462

33993463
memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
34003464

3401-
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
3402-
LWLockAcquire(lock, LW_EXCLUSIVE);
3465+
/*
3466+
* Skip the record if we already initialized the page at the previous
3467+
* XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact().
3468+
*/
3469+
if (pre_initialized_offsets_page != pageno)
3470+
{
3471+
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
3472+
LWLockAcquire(lock, LW_EXCLUSIVE);
34033473

3404-
slotno = ZeroMultiXactOffsetPage(pageno, false);
3405-
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3406-
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3474+
slotno = ZeroMultiXactOffsetPage(pageno, false);
3475+
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3476+
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
34073477

3408-
LWLockRelease(lock);
3478+
LWLockRelease(lock);
3479+
}
3480+
else
3481+
elog(DEBUG1, "skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation", pageno);
3482+
pre_initialized_offsets_page = -1;
34093483
}
34103484
else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
34113485
{
@@ -3431,6 +3505,22 @@ multixact_redo(XLogReaderState *record)
34313505
TransactionId max_xid;
34323506
int i;
34333507

3508+
if (pre_initialized_offsets_page != -1)
3509+
{
3510+
/*
3511+
* If we implicitly initialized the next offsets page while
3512+
* replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
3513+
* with an older minor version, we still expect to see an
3514+
* XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
3515+
* XLOG_MULTIXACT_CREATE_ID records. Therefore this case should
3516+
* not happen. If it does, we'll continue with the replay, but
3517+
* log a message to note that something's funny.
3518+
*/
3519+
elog(LOG, "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page " INT64_FORMAT " that was implicitly initialized earlier",
3520+
pre_initialized_offsets_page);
3521+
pre_initialized_offsets_page = -1;
3522+
}
3523+
34343524
/* Store the data back into the SLRU files */
34353525
RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
34363526
xlrec->members);

0 commit comments

Comments
 (0)