Skip to content

Commit 0c86264

Browse files
committed
Fix remaining race condition with CLOG truncation and LISTEN/NOTIFY
Previous commit fixed a bug where VACUUM would truncate the CLOG that's still needed to check the commit status of XIDs in the async notify queue, but as mentioned in the commit message, it wasn't a full fix. If a backend is executing asyncQueueReadAllNotifications() and has just made a local copy of an async SLRU page which contains old XIDs, vacuum can concurrently truncate the CLOG covering those XIDs, and the backend still gets an error when it calls TransactionIdDidCommit() on those XIDs in the local copy. This commit fixes that race condition. To fix, hold the SLRU bank lock across the TransactionIdDidCommit() calls in NOTIFY processing. Per Tom Lane's idea. Backpatch to all supported versions. Reviewed-by: Joel Jacobson <joel@compiler.org> Reviewed-by: Arseniy Mukhin <arseniy.mukhin.dev@gmail.com> Discussion: https://www.postgresql.org/message-id/2759499.1761756503@sss.pgh.pa.us Backpatch-through: 14
1 parent 1a469d7 commit 0c86264

File tree

1 file changed

+62
-61
lines changed

1 file changed

+62
-61
lines changed

src/backend/commands/async.c

Lines changed: 62 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,6 @@ static void SignalBackends(void);
463463
static void asyncQueueReadAllNotifications(void);
464464
static bool asyncQueueProcessPageEntries(QueuePosition *current,
465465
QueuePosition stop,
466-
char *page_buffer,
467466
Snapshot snapshot);
468467
static void asyncQueueAdvanceTail(void);
469468
static void ProcessIncomingNotify(bool flush);
@@ -1903,13 +1902,6 @@ asyncQueueReadAllNotifications(void)
19031902
QueuePosition head;
19041903
Snapshot snapshot;
19051904

1906-
/* page_buffer must be adequately aligned, so use a union */
1907-
union
1908-
{
1909-
char buf[QUEUE_PAGESIZE];
1910-
AsyncQueueEntry align;
1911-
} page_buffer;
1912-
19131905
/* Fetch current state */
19141906
LWLockAcquire(NotifyQueueLock, LW_SHARED);
19151907
/* Assert checks that we have a valid state entry */
@@ -1990,37 +1982,6 @@ asyncQueueReadAllNotifications(void)
19901982

19911983
do
19921984
{
1993-
int curpage = QUEUE_POS_PAGE(pos);
1994-
int curoffset = QUEUE_POS_OFFSET(pos);
1995-
int slotno;
1996-
int copysize;
1997-
1998-
/*
1999-
* We copy the data from SLRU into a local buffer, so as to avoid
2000-
* holding the NotifySLRULock while we are examining the entries
2001-
* and possibly transmitting them to our frontend. Copy only the
2002-
* part of the page we will actually inspect.
2003-
*/
2004-
slotno = SimpleLruReadPage_ReadOnly(NotifyCtl, curpage,
2005-
InvalidTransactionId);
2006-
if (curpage == QUEUE_POS_PAGE(head))
2007-
{
2008-
/* we only want to read as far as head */
2009-
copysize = QUEUE_POS_OFFSET(head) - curoffset;
2010-
if (copysize < 0)
2011-
copysize = 0; /* just for safety */
2012-
}
2013-
else
2014-
{
2015-
/* fetch all the rest of the page */
2016-
copysize = QUEUE_PAGESIZE - curoffset;
2017-
}
2018-
memcpy(page_buffer.buf + curoffset,
2019-
NotifyCtl->shared->page_buffer[slotno] + curoffset,
2020-
copysize);
2021-
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
2022-
LWLockRelease(NotifySLRULock);
2023-
20241985
/*
20251986
* Process messages up to the stop position, end of page, or an
20261987
* uncommitted message.
@@ -2036,9 +1997,7 @@ asyncQueueReadAllNotifications(void)
20361997
* rewrite pages under us. Especially we don't want to hold a lock
20371998
* while sending the notifications to the frontend.
20381999
*/
2039-
reachedStop = asyncQueueProcessPageEntries(&pos, head,
2040-
page_buffer.buf,
2041-
snapshot);
2000+
reachedStop = asyncQueueProcessPageEntries(&pos, head, snapshot);
20422001
} while (!reachedStop);
20432002

20442003
/* Update shared state */
@@ -2057,13 +2016,6 @@ asyncQueueReadAllNotifications(void)
20572016
* Fetch notifications from the shared queue, beginning at position current,
20582017
* and deliver relevant ones to my frontend.
20592018
*
2060-
* The current page must have been fetched into page_buffer from shared
2061-
* memory. (We could access the page right in shared memory, but that
2062-
* would imply holding the NotifySLRULock throughout this routine.)
2063-
*
2064-
* We stop if we reach the "stop" position, or reach a notification from an
2065-
* uncommitted transaction, or reach the end of the page.
2066-
*
20672019
* The function returns true once we have reached the stop position or an
20682020
* uncommitted notification, and false if we have finished with the page.
20692021
* In other words: once it returns true there is no need to look further.
@@ -2072,16 +2024,34 @@ asyncQueueReadAllNotifications(void)
20722024
static bool
20732025
asyncQueueProcessPageEntries(QueuePosition *current,
20742026
QueuePosition stop,
2075-
char *page_buffer,
20762027
Snapshot snapshot)
20772028
{
2029+
int64 curpage = QUEUE_POS_PAGE(*current);
2030+
int slotno;
2031+
char *page_buffer;
20782032
bool reachedStop = false;
20792033
bool reachedEndOfPage;
2080-
AsyncQueueEntry *qe;
2034+
2035+
/*
2036+
* We copy the entries into a local buffer to avoid holding the SLRU lock
2037+
* while we transmit them to our frontend. The local buffer must be
2038+
* adequately aligned, so use a union.
2039+
*/
2040+
union
2041+
{
2042+
char buf[QUEUE_PAGESIZE];
2043+
AsyncQueueEntry align;
2044+
} local_buf;
2045+
char *local_buf_end = local_buf.buf;
2046+
2047+
slotno = SimpleLruReadPage_ReadOnly(NotifyCtl, curpage,
2048+
InvalidTransactionId);
2049+
page_buffer = NotifyCtl->shared->page_buffer[slotno];
20812050

20822051
do
20832052
{
20842053
QueuePosition thisentry = *current;
2054+
AsyncQueueEntry *qe;
20852055

20862056
if (QUEUE_POS_EQUAL(thisentry, stop))
20872057
break;
@@ -2123,18 +2093,23 @@ asyncQueueProcessPageEntries(QueuePosition *current,
21232093
reachedStop = true;
21242094
break;
21252095
}
2126-
else if (TransactionIdDidCommit(qe->xid))
2127-
{
2128-
/* qe->data is the null-terminated channel name */
2129-
char *channel = qe->data;
21302096

2131-
if (IsListeningOn(channel))
2132-
{
2133-
/* payload follows channel name */
2134-
char *payload = qe->data + strlen(channel) + 1;
2097+
/*
2098+
* Quick check for the case that we're not listening on any
2099+
* channels, before calling TransactionIdDidCommit(). This makes
2100+
* that case a little faster, but more importantly, it ensures
2101+
* that if there's a bad entry in the queue for which
2102+
* TransactionIdDidCommit() fails for some reason, we can skip
2103+
* over it on the first LISTEN in a session, and not get stuck on
2104+
* it indefinitely.
2105+
*/
2106+
if (listenChannels == NIL)
2107+
continue;
21352108

2136-
NotifyMyFrontEnd(channel, payload, qe->srcPid);
2137-
}
2109+
if (TransactionIdDidCommit(qe->xid))
2110+
{
2111+
memcpy(local_buf_end, qe, qe->length);
2112+
local_buf_end += qe->length;
21382113
}
21392114
else
21402115
{
@@ -2148,6 +2123,32 @@ asyncQueueProcessPageEntries(QueuePosition *current,
21482123
/* Loop back if we're not at end of page */
21492124
} while (!reachedEndOfPage);
21502125

2126+
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
2127+
LWLockRelease(NotifySLRULock);
2128+
2129+
/*
2130+
* Now that we have let go of the SLRU bank lock, send the notifications
2131+
* to our backend
2132+
*/
2133+
Assert(local_buf_end - local_buf.buf <= BLCKSZ);
2134+
for (char *p = local_buf.buf; p < local_buf_end;)
2135+
{
2136+
AsyncQueueEntry *qe = (AsyncQueueEntry *) p;
2137+
2138+
/* qe->data is the null-terminated channel name */
2139+
char *channel = qe->data;
2140+
2141+
if (IsListeningOn(channel))
2142+
{
2143+
/* payload follows channel name */
2144+
char *payload = qe->data + strlen(channel) + 1;
2145+
2146+
NotifyMyFrontEnd(channel, payload, qe->srcPid);
2147+
}
2148+
2149+
p += qe->length;
2150+
}
2151+
21512152
if (QUEUE_POS_EQUAL(*current, stop))
21522153
reachedStop = true;
21532154

0 commit comments

Comments
 (0)