Skip to content

Commit

Permalink
pgstat: Track more detailed relation IO statistics
Browse files Browse the repository at this point in the history
Commit 28e626b introduced the infrastructure for tracking more detailed IO
statistics. This commit adds the actual collection of the new IO statistics
for relations and temporary relations. See aforementioned commit for goals and
high-level design.

The changes in this commit are fairly straightforward. The bulk of the change
is passing sufficient information to the callsites of pgstat_count_io_op().

A somewhat unsightly detail is that it currently is hard to find a better
place to count fsyncs than in md.c, whereas the other pgstat_count_io_op()
calls are in bufmgr.c/localbuf.c. As the number of fsyncs is tied to md.c
implementation details, it's not obvious there is a better answer.

Author: Melanie Plageman <[email protected]>
Reviewed-by: Andres Freund <[email protected]>
Discussion: https://postgr.es/m/[email protected]
  • Loading branch information
anarazel committed Feb 10, 2023
1 parent 40d0b2d commit f30d62c
Show file tree
Hide file tree
Showing 6 changed files with 184 additions and 36 deletions.
110 changes: 94 additions & 16 deletions src/backend/storage/buffer/bufmgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -472,8 +472,9 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
ForkNumber forkNum,
BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
bool *foundPtr, IOContext *io_context);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
IOObject io_object, IOContext io_context);
static void FindAndDropRelationBuffers(RelFileLocator rlocator,
ForkNumber forkNum,
BlockNumber nForkBlock,
Expand Down Expand Up @@ -814,6 +815,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BufferDesc *bufHdr;
Block bufBlock;
bool found;
IOContext io_context;
IOObject io_object;
bool isExtend;
bool isLocalBuf = SmgrIsTemp(smgr);

Expand Down Expand Up @@ -846,7 +849,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,

if (isLocalBuf)
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
/*
* LocalBufferAlloc() will set the io_context to IOCONTEXT_NORMAL. We
* do not use a BufferAccessStrategy for I/O of temporary tables.
* However, in some cases, the "strategy" may not be NULL, so we can't
* rely on IOContextForStrategy() to set the right IOContext for us.
* This may happen in cases like CREATE TEMPORARY TABLE AS...
*/
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found, &io_context);
if (found)
pgBufferUsage.local_blks_hit++;
else if (isExtend)
Expand All @@ -862,7 +872,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* not currently in memory.
*/
bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
strategy, &found);
strategy, &found, &io_context);
if (found)
pgBufferUsage.shared_blks_hit++;
else if (isExtend)
Expand Down Expand Up @@ -977,7 +987,16 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */

bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (isLocalBuf)
{
bufBlock = LocalBufHdrGetBlock(bufHdr);
io_object = IOOBJECT_TEMP_RELATION;
}
else
{
bufBlock = BufHdrGetBlock(bufHdr);
io_object = IOOBJECT_RELATION;
}

if (isExtend)
{
Expand All @@ -986,6 +1005,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
/* don't set checksum for all-zero page */
smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);

pgstat_count_io_op(io_object, io_context, IOOP_EXTEND);

/*
* NB: we're *not* doing a ScheduleBufferTagForWriteback here;
* although we're essentially performing a write. At least on linux
Expand Down Expand Up @@ -1013,6 +1034,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,

smgrread(smgr, forkNum, blockNum, (char *) bufBlock);

pgstat_count_io_op(io_object, io_context, IOOP_READ);

if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
Expand Down Expand Up @@ -1106,14 +1129,19 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* *foundPtr is actually redundant with the buffer's BM_VALID flag, but
* we keep it for simplicity in ReadBuffer.
*
* io_context is passed as an output parameter to avoid calling
* IOContextForStrategy() when there is a shared buffers hit and no IO
* statistics need be captured.
*
* No locks are held either at entry or exit.
*/
static BufferDesc *
BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr)
bool *foundPtr, IOContext *io_context)
{
bool from_ring;
BufferTag newTag; /* identity of requested block */
uint32 newHash; /* hash value for newTag */
LWLock *newPartitionLock; /* buffer partition lock for it */
Expand Down Expand Up @@ -1165,8 +1193,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
{
/*
* If we get here, previous attempts to read the buffer must
* have failed ... but we shall bravely try again.
* have failed ... but we shall bravely try again. Set
* io_context since we will in fact need to count an IO
* operation.
*/
*io_context = IOContextForStrategy(strategy);
*foundPtr = false;
}
}
Expand All @@ -1180,6 +1211,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
LWLockRelease(newPartitionLock);

*io_context = IOContextForStrategy(strategy);

/* Loop here in case we have to try another victim buffer */
for (;;)
{
Expand All @@ -1193,7 +1226,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* Select a victim buffer. The buffer is returned with its header
* spinlock still held!
*/
buf = StrategyGetBuffer(strategy, &buf_state);
buf = StrategyGetBuffer(strategy, &buf_state, &from_ring);

Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);

Expand Down Expand Up @@ -1247,7 +1280,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
UnlockBufHdr(buf, buf_state);

if (XLogNeedsFlush(lsn) &&
StrategyRejectBuffer(strategy, buf))
StrategyRejectBuffer(strategy, buf, from_ring))
{
/* Drop lock/pin and loop around for another buffer */
LWLockRelease(BufferDescriptorGetContentLock(buf));
Expand All @@ -1262,7 +1295,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
smgr->smgr_rlocator.locator.dbOid,
smgr->smgr_rlocator.locator.relNumber);

FlushBuffer(buf, NULL);
FlushBuffer(buf, NULL, IOOBJECT_RELATION, *io_context);
LWLockRelease(BufferDescriptorGetContentLock(buf));

ScheduleBufferTagForWriteback(&BackendWritebackContext,
Expand Down Expand Up @@ -1443,6 +1476,28 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,

LWLockRelease(newPartitionLock);

if (oldFlags & BM_VALID)
{
/*
* When a BufferAccessStrategy is in use, blocks evicted from shared
* buffers are counted as IOOP_EVICT in the corresponding context
* (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
* strategy in two cases: 1) while initially claiming buffers for the
* strategy ring 2) to replace an existing strategy ring buffer
* because it is pinned or in use and cannot be reused.
*
* Blocks evicted from buffers already in the strategy ring are
* counted as IOOP_REUSE in the corresponding strategy context.
*
* At this point, we can accurately count evictions and reuses,
* because we have successfully claimed the valid buffer. Previously,
* we may have been forced to release the buffer due to concurrent
* pinners or erroring out.
*/
pgstat_count_io_op(IOOBJECT_RELATION, *io_context,
from_ring ? IOOP_REUSE : IOOP_EVICT);
}

/*
* Buffer contents are currently invalid. Try to obtain the right to
* start I/O. If StartBufferIO returns false, then someone else managed
Expand Down Expand Up @@ -2563,7 +2618,7 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
PinBuffer_Locked(bufHdr);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);

FlushBuffer(bufHdr, NULL);
FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);

LWLockRelease(BufferDescriptorGetContentLock(bufHdr));

Expand Down Expand Up @@ -2813,7 +2868,8 @@ BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
* as the second parameter. If not, pass NULL.
*/
static void
FlushBuffer(BufferDesc *buf, SMgrRelation reln)
FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
IOContext io_context)
{
XLogRecPtr recptr;
ErrorContextCallback errcallback;
Expand Down Expand Up @@ -2907,6 +2963,26 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
bufToWrite,
false);

/*
* When a strategy is in use, only flushes of dirty buffers already in the
* strategy ring are counted as strategy writes (IOCONTEXT
* [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
* statistics tracking.
*
* If a shared buffer initially added to the ring must be flushed before
* being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
*
* If a shared buffer which was added to the ring later because the
* current strategy buffer is pinned or in use or because all strategy
* buffers were dirty and rejected (for BAS_BULKREAD operations only)
* requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
* (from_ring will be false).
*
* When a strategy is not in use, the write can only be a "regular" write
* of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
*/
pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);

if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
Expand Down Expand Up @@ -3549,6 +3625,8 @@ FlushRelationBuffers(Relation rel)
buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);

pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);

/* Pop the error context stack */
error_context_stack = errcallback.previous;
}
Expand Down Expand Up @@ -3581,7 +3659,7 @@ FlushRelationBuffers(Relation rel)
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, RelationGetSmgr(rel));
FlushBuffer(bufHdr, RelationGetSmgr(rel), IOOBJECT_RELATION, IOCONTEXT_NORMAL);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr);
}
Expand Down Expand Up @@ -3679,7 +3757,7 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, srelent->srel);
FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr);
}
Expand Down Expand Up @@ -3889,7 +3967,7 @@ FlushDatabaseBuffers(Oid dbid)
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
FlushBuffer(bufHdr, NULL);
FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
UnpinBuffer(bufHdr);
}
Expand All @@ -3916,7 +3994,7 @@ FlushOneBuffer(Buffer buffer)

Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));

FlushBuffer(bufHdr, NULL);
FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
}

/*
Expand Down
Loading

0 comments on commit f30d62c

Please sign in to comment.