Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 164 additions & 72 deletions src/backend/access/transam/multixact.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procarray.h"
Expand Down Expand Up @@ -276,12 +275,6 @@ typedef struct MultiXactStateData
/* support for members anti-wraparound measures */
MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */

/*
* This is used to sleep until a multixact offset is written when we want
* to create the next one.
*/
ConditionVariable nextoff_cv;

/*
* Per-backend data starts here. We have two arrays stored in the area
* immediately following the MultiXactStateData struct. Each is indexed by
Expand Down Expand Up @@ -386,6 +379,9 @@ static MemoryContext MXactContext = NULL;
#define debug_elog6(a,b,c,d,e,f)
#endif

/*
 * Hack to deal with WAL generated with older minor versions.
 *
 * Holds the page number of an offsets SLRU page that RecordNewMultiXact()
 * zeroed on its own during recovery, or -1 if there is no such page.
 * multixact_redo() compares it against the page named in an
 * XLOG_MULTIXACT_ZERO_OFF_PAGE record so that the page is not zeroed a
 * second time, and resets it to -1 afterwards.
 */
static int64 pre_initialized_offsets_page = -1;

/* internal MultiXactId management */
static void MultiXactIdSetOldestVisible(void);
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
Expand Down Expand Up @@ -922,13 +918,66 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
int entryno;
int slotno;
MultiXactOffset *offptr;
int i;
MultiXactId next;
int64 next_pageno;
int next_entryno;
MultiXactOffset *next_offptr;
MultiXactOffset next_offset;
LWLock *lock;
LWLock *prevlock = NULL;

/* position of this multixid in the offsets SLRU area */
pageno = MultiXactIdToOffsetPage(multi);
entryno = MultiXactIdToOffsetEntry(multi);

/* position of the next multixid */
next = multi + 1;
if (next < FirstMultiXactId)
next = FirstMultiXactId;
next_pageno = MultiXactIdToOffsetPage(next);
next_entryno = MultiXactIdToOffsetEntry(next);

/*
* Older minor versions didn't set the next multixid's offset in this
* function, and therefore didn't initialize the next page until the next
* multixid was assigned. If we're replaying WAL that was generated by
* such a version, the next page might not be initialized yet. Initialize
* it now.
*/
if (InRecovery &&
next_pageno != pageno &&
pg_atomic_read_u64(&MultiXactOffsetCtl->shared->latest_page_number) == pageno)
{
elog(DEBUG1, "next offsets page is not initialized, initializing it now");

lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
LWLockAcquire(lock, LW_EXCLUSIVE);

/* Create and zero the page */
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);

/* Make sure it's written out */
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);

LWLockRelease(lock);

/*
* Remember that we initialized the page, so that we don't zero it
* again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
*/
pre_initialized_offsets_page = next_pageno;
}

/*
* Set the starting offset of this multixid's members.
*
* In the common case, it has already been set by the previous
* RecordNewMultiXact call, as this was the next multixid of the previous
* multixid. But if multiple backends are generating multixids
* concurrently, we might race ahead and get called before the previous
* multixid.
*/
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
LWLockAcquire(lock, LW_EXCLUSIVE);

Expand All @@ -943,22 +992,54 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
offptr += entryno;

*offptr = offset;
if (*offptr != offset)
{
/* should already be set to the correct value, or not at all */
Assert(*offptr == 0);
*offptr = offset;
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
}

/*
* Set the next multixid's offset to the end of this multixid's members.
*/
if (next_pageno == pageno)
{
next_offptr = offptr + 1;
}
else
{
/* must be the first entry on the page */
Assert(next_entryno == 0 || next == FirstMultiXactId);

/* Swap the lock for a lock on the next page */
LWLockRelease(lock);
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
LWLockAcquire(lock, LW_EXCLUSIVE);

slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
next_offptr += next_entryno;
}

MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
/* Like in GetNewMultiXactId(), skip over offset 0 */
next_offset = offset + nmembers;
if (next_offset == 0)
next_offset = 1;
if (*next_offptr != next_offset)
{
/* should already be set to the correct value, or not at all */
Assert(*next_offptr == 0);
*next_offptr = next_offset;
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
}

/* Release MultiXactOffset SLRU lock. */
LWLockRelease(lock);

/*
* If anybody was waiting to know the offset of this multixact ID we just
* wrote, they can read it now, so wake them up.
*/
ConditionVariableBroadcast(&MultiXactState->nextoff_cv);

prev_pageno = -1;

for (i = 0; i < nmembers; i++, offset++)
for (int i = 0; i < nmembers; i++, offset++)
{
TransactionId *memberptr;
uint32 *flagsptr;
Expand Down Expand Up @@ -1148,8 +1229,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
result = FirstMultiXactId;
}

/* Make sure there is room for the MXID in the file. */
ExtendMultiXactOffset(result);
/*
* Make sure there is room for the next MXID in the file. Assigning this
* MXID sets the next MXID's offset already.
*/
ExtendMultiXactOffset(result + 1);

/*
* Reserve the members space, similarly to above. Also, be careful not to
Expand Down Expand Up @@ -1314,7 +1398,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
MultiXactOffset nextOffset;
MultiXactMember *ptr;
LWLock *lock;
bool slept = false;

debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);

Expand Down Expand Up @@ -1391,23 +1474,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
* one's. However, there are some corner cases to worry about:
*
* 1. This multixact may be the latest one created, in which case there is
* no next one to look at. In this case the nextOffset value we just
* saved is the correct endpoint.
* no next one to look at. The next multixact's offset should be set
* already, as we set it in RecordNewMultiXact(), but we used to not do
* that in older minor versions. To cope with that case, if this
* multixact is the latest one created, use the nextOffset value we read
* above as the endpoint.
*
* 2. The next multixact may still be in process of being filled in: that
* is, another process may have done GetNewMultiXactId but not yet written
* the offset entry for that ID. In that scenario, it is guaranteed that
* the offset entry for that multixact exists (because GetNewMultiXactId
* won't release MultiXactGenLock until it does) but contains zero
* (because we are careful to pre-zero offset pages). Because
* GetNewMultiXactId will never return zero as the starting offset for a
* multixact, when we read zero as the next multixact's offset, we know we
* have this case. We handle this by sleeping on the condition variable
* we have just for this; the process in charge will signal the CV as soon
* as it has finished writing the multixact offset.
*
* 3. Because GetNewMultiXactId increments offset zero to offset one to
* handle case #2, there is an ambiguity near the point of offset
* 2. Because GetNewMultiXactId skips over offset zero, reserving zero
* to mean "unset", there is an ambiguity near the point of offset
* wraparound. If we see next multixact's offset is one, is that our
* multixact's actual endpoint, or did it end at zero with a subsequent
* increment? We handle this using the knowledge that if the zero'th
Expand All @@ -1419,7 +1493,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
* cases, so it seems better than holding the MultiXactGenLock for a long
* time on every multixact creation.
*/
retry:
pageno = MultiXactIdToOffsetPage(multi);
entryno = MultiXactIdToOffsetEntry(multi);

Expand Down Expand Up @@ -1482,31 +1555,17 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
nextMXOffset = *offptr;

if (nextMXOffset == 0)
{
/* Corner case 2: next multixact is still being filled in */
LWLockRelease(lock);
CHECK_FOR_INTERRUPTS();

INJECTION_POINT("multixact-get-members-cv-sleep", NULL);

ConditionVariableSleep(&MultiXactState->nextoff_cv,
WAIT_EVENT_MULTIXACT_CREATION);
slept = true;
goto retry;
}
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("MultiXact %u has invalid next offset",
multi)));

length = nextMXOffset - offset;
}

LWLockRelease(lock);
lock = NULL;

/*
* If we slept above, clean up state; it's no longer needed.
*/
if (slept)
ConditionVariableCancelSleep();

ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));

truelength = 0;
Expand Down Expand Up @@ -1549,7 +1608,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,

if (!TransactionIdIsValid(*xactptr))
{
/* Corner case 3: we must be looking at unused slot zero */
/* Corner case 2: we must be looking at unused slot zero */
Assert(offset == 0);
continue;
}
Expand Down Expand Up @@ -1996,7 +2055,6 @@ MultiXactShmemInit(void)

/* Make sure we zero out the per-backend state */
MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
ConditionVariableInit(&MultiXactState->nextoff_cv);
}
else
Assert(found);
Expand Down Expand Up @@ -2203,26 +2261,34 @@ TrimMultiXact(void)
pageno);

/*
* Zero out the remainder of the current offsets page. See notes in
* TrimCLOG() for background. Unlike CLOG, some WAL record covers every
* pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
* rule "write xlog before data," nextMXact successors may carry obsolete,
* nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
* operates normally.
* Set the offset of nextMXact on the offsets page. This is normally done
* in RecordNewMultiXact() of the previous multixact, but we used to not
* do that in older minor versions. To ensure that the next offset is set
* if the binary was just upgraded from an older minor version, do it now.
*
* Zero out the remainder of the page. See notes in TrimCLOG() for
* background. Unlike CLOG, some WAL record covers every pg_multixact
* SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
* xlog before data," nextMXact successors may carry obsolete, nonzero
* offset values.
*/
entryno = MultiXactIdToOffsetEntry(nextMXact);
if (entryno != 0)
{
int slotno;
MultiXactOffset *offptr;
LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);

LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
if (entryno == 0)
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
else
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
offptr += entryno;

MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
*offptr = offset;
if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));

MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
LWLockRelease(lock);
Expand Down Expand Up @@ -3407,14 +3473,24 @@ multixact_redo(XLogReaderState *record)

memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));

lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
LWLockAcquire(lock, LW_EXCLUSIVE);
/*
* Skip the record if we already initialized the page at the previous
* XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact().
*/
if (pre_initialized_offsets_page != pageno)
{
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
LWLockAcquire(lock, LW_EXCLUSIVE);

slotno = ZeroMultiXactOffsetPage(pageno, false);
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
slotno = ZeroMultiXactOffsetPage(pageno, false);
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);

LWLockRelease(lock);
LWLockRelease(lock);
}
else
elog(DEBUG1, "skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation", pageno);
pre_initialized_offsets_page = -1;
}
else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
{
Expand All @@ -3440,6 +3516,22 @@ multixact_redo(XLogReaderState *record)
TransactionId max_xid;
int i;

if (pre_initialized_offsets_page != -1)
{
/*
* If we implicitly initialized the next offsets page while
* replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
* with an older minor version, we still expect to see an
* XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
* XLOG_MULTIXACT_CREATE_ID records. Therefore this case should
* not happen. If it does, we'll continue with the replay, but
* log a message to note that something's funny.
*/
elog(LOG, "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page " INT64_FORMAT " that was implicitly initialized earlier",
pre_initialized_offsets_page);
pre_initialized_offsets_page = -1;
}

/* Store the data back into the SLRU files */
RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
xlrec->members);
Expand Down
Loading