Skip to content

Commit

Permalink
WAL-log inplace update before revealing it to other sessions.
Browse files Browse the repository at this point in the history
A buffer lock won't stop a reader having already checked tuple
visibility.  If a vac_update_datfrozenxid() and then a crash happened
during inplace update of a relfrozenxid value, datfrozenxid could
overtake relfrozenxid.  That could lead to "could not access status of
transaction" errors.  Back-patch to v12 (all supported versions).  In
v14 and earlier, this also back-patches the assertion removal from
commit 7fcf2fa.

Discussion: https://postgr.es/m/[email protected]
  • Loading branch information
nmisch committed Oct 25, 2024
1 parent 95c5acb commit bfd5c6e
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 16 deletions.
4 changes: 1 addition & 3 deletions src/backend/access/heap/README.tuplock
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,4 @@ Inplace updates create an exception to the rule that tuple data won't change
under a reader holding a pin. A reader of a heap_fetch() result tuple may
witness a torn read. Current inplace-updated fields are aligned and are no
wider than four bytes, and current readers don't need consistency across
fields. Hence, they get by with just fetching each field once. XXX such a
caller may also read a value that has not reached WAL; see
systable_inplace_update_finish().
fields. Hence, they get by with just fetching each field once.
58 changes: 45 additions & 13 deletions src/backend/access/heap/heapam.c
Original file line number Diff line number Diff line change
Expand Up @@ -6344,13 +6344,18 @@ heap_inplace_update_and_unlock(Relation relation,
HeapTupleHeader htup = oldtup->t_data;
uint32 oldlen;
uint32 newlen;
char *dst;
char *src;

Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
oldlen = oldtup->t_len - htup->t_hoff;
newlen = tuple->t_len - tuple->t_data->t_hoff;
if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
elog(ERROR, "wrong tuple length");

dst = (char *) htup + htup->t_hoff;
src = (char *) tuple->t_data + tuple->t_data->t_hoff;

/*
* Construct shared cache inval if necessary. Note that because we only
* pass the new version of the tuple, this mustn't be used for any
Expand All @@ -6369,15 +6374,15 @@ heap_inplace_update_and_unlock(Relation relation,
*/
PreInplace_Inval();

/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();

memcpy((char *) htup + htup->t_hoff,
(char *) tuple->t_data + tuple->t_data->t_hoff,
newlen);

/*----------
* XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
* NO EREPORT(ERROR) from here till changes are complete
*
* Our buffer lock won't stop a reader having already pinned and checked
* visibility for this tuple. Hence, we write WAL first, then mutate the
* buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
* checkpoint delay makes that acceptable. With the usual order of
* changes, a crash after memcpy() and before XLogInsert() could allow
* datfrozenxid to overtake relfrozenxid:
*
* ["D" is a VACUUM (ONLY_DATABASE_STATS)]
* ["R" is a VACUUM tbl]
Expand All @@ -6387,31 +6392,57 @@ heap_inplace_update_and_unlock(Relation relation,
* D: raise pg_database.datfrozenxid, XLogInsert(), finish
* [crash]
* [recovery restores datfrozenxid w/o relfrozenxid]
*
* Like in MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(), copy
* the buffer to the stack before logging. Here, that facilitates a FPI
* of the post-mutation block before we accept other sessions seeing it.
*/

MarkBufferDirty(buffer);
Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
START_CRIT_SECTION();
MyProc->delayChkptFlags |= DELAY_CHKPT_START;

/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
xl_heap_inplace xlrec;
PGAlignedBlock copied_buffer;
char *origdata = (char *) BufferGetBlock(buffer);
Page page = BufferGetPage(buffer);
uint16 lower = ((PageHeader) page)->pd_lower;
uint16 upper = ((PageHeader) page)->pd_upper;
uintptr_t dst_offset_in_block;
RelFileLocator rlocator;
ForkNumber forkno;
BlockNumber blkno;
XLogRecPtr recptr;

xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);

XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);

XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
/* register block matching what buffer will look like after changes */
memcpy(copied_buffer.data, origdata, lower);
memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
dst_offset_in_block = dst - origdata;
memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
BufferGetTag(buffer, &rlocator, &forkno, &blkno);
Assert(forkno == MAIN_FORKNUM);
XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
REGBUF_STANDARD);
XLogRegisterBufData(0, src, newlen);

/* inplace updates aren't decoded atm, don't log the origin */

recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);

PageSetLSN(BufferGetPage(buffer), recptr);
PageSetLSN(page, recptr);
}

memcpy(dst, src, newlen);

MarkBufferDirty(buffer);

LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

/*
Expand All @@ -6424,6 +6455,7 @@ heap_inplace_update_and_unlock(Relation relation,
*/
AtInplace_Inval();

MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
END_CRIT_SECTION();
UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);

Expand Down

0 comments on commit bfd5c6e

Please sign in to comment.