diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index b246ddc6341..dca50707ad4 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -469,6 +469,16 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + pg_stat_iopg_stat_io + + One row for each combination of backend type, context, and target object + containing cluster-wide I/O statistics. + See + pg_stat_io for details. + + + pg_stat_replication_slotspg_stat_replication_slots One row per replication slot, showing statistics about the @@ -665,20 +675,16 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser - The pg_statio_ views are primarily useful to - determine the effectiveness of the buffer cache. When the number - of actual disk reads is much smaller than the number of buffer - hits, then the cache is satisfying most read requests without - invoking a kernel call. However, these statistics do not give the - entire story: due to the way in which PostgreSQL - handles disk I/O, data that is not in the - PostgreSQL buffer cache might still reside in the - kernel's I/O cache, and might therefore still be fetched without - requiring a physical read. Users interested in obtaining more - detailed information on PostgreSQL I/O behavior are - advised to use the PostgreSQL statistics views - in combination with operating system utilities that allow insight - into the kernel's handling of I/O. + The pg_stat_io and + pg_statio_ set of views are useful for determining + the effectiveness of the buffer cache. They can be used to calculate a cache + hit ratio. Note that while PostgreSQL's I/O + statistics capture most instances in which the kernel was invoked in order + to perform I/O, they do not differentiate between data which had to be + fetched from disk and that which already resided in the kernel page cache. + Users are advised to use the PostgreSQL + statistics views in combination with operating system utilities for a more + complete picture of their database's I/O performance. @@ -3669,6 +3675,293 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i last_archived_wal have also been successfully archived. + + + + <structname>pg_stat_io</structname> + + + pg_stat_io + + + + The pg_stat_io view will contain one row for each + combination of backend type, target I/O object, and I/O context, showing + cluster-wide I/O statistics. Combinations which do not make sense are + omitted. + + + + Currently, I/O on relations (e.g. tables, indexes) is tracked. However, + relation I/O which bypasses shared buffers (e.g. when moving a table from one + tablespace to another) is currently not tracked. + + + + <structname>pg_stat_io</structname> View + + + + + + Column Type + + + Description + + + + + + + + + backend_type text + + + Type of backend (e.g. background worker, autovacuum worker). See + pg_stat_activity for more information + on backend_types. Some + backend_types do not accumulate I/O operation + statistics and will not be included in the view. + + + + + + + + io_object text + + + Target object of an I/O operation. Possible values are: + + + + relation: Permanent relations. + + + + + temp relation: Temporary relations. + + + + + + + + + + + io_context text + + + The context of an I/O operation. Possible values are: + + + + + normal: The default or standard + io_context for a type of I/O operation. For + example, by default, relation data is read into and written out from + shared buffers. Thus, reads and writes of relation data to and from + shared buffers are tracked in io_context + normal. + + + + + vacuum: I/O operations performed outside of shared + buffers while vacuuming and analyzing permanent relations. Temporary + table vacuums use the same local buffer pool as other temporary table + IO operations and are tracked in io_context + normal. + + + + + bulkread: Certain large read I/O operations + done outside of shared buffers, for example, a sequential scan of a + large table. + + + + + bulkwrite: Certain large write I/O operations + done outside of shared buffers, such as COPY. + + + + + + + + + + reads bigint + + + Number of read operations, each of the size specified in + op_bytes. + + + + + + + + writes bigint + + + Number of write operations, each of the size specified in + op_bytes. + + + + + + + + extends bigint + + + Number of relation extend operations, each of the size specified in + op_bytes. + + + + + + + + op_bytes bigint + + + The number of bytes per unit of I/O read, written, or extended. + + + Relation data reads, writes, and extends are done in + block_size units, derived from the build-time + parameter BLCKSZ, which is 8192 by + default. + + + + + + + + evictions bigint + + + Number of times a block has been written out from a shared or local + buffer in order to make it available for another use. + + + In io_context normal, this counts + the number of times a block was evicted from a buffer and replaced with + another block. In io_contexts + bulkwrite, bulkread, and + vacuum, this counts the number of times a block was + evicted from shared buffers in order to add the shared buffer to a + separate, size-limited ring buffer for use in a bulk I/O operation. + + + + + + + + reuses bigint + + + The number of times an existing buffer in a size-limited ring buffer + outside of shared buffers was reused as part of an I/O operation in the + bulkread, bulkwrite, or + vacuum io_contexts. + + + + + + + + fsyncs bigint + + + Number of fsync calls. These are only tracked in + io_context normal. + + + + + + + + stats_reset timestamp with time zone + + + Time at which these statistics were last reset. + + + + + +
+ + + Some backend types never perform I/O operations on some I/O objects and/or + in some I/O contexts. These rows are omitted from the view. For example, the + checkpointer does not checkpoint temporary tables, so there will be no rows + for backend_type checkpointer and + io_object temp relation. + + + + In addition, some I/O operations will never be performed either by certain + backend types or on certain I/O objects and/or in certain I/O contexts. + These cells will be NULL. For example, temporary tables are not + fsynced, so fsyncs will be NULL for + io_object temp relation. Also, the + background writer does not perform reads, so reads will + be NULL in rows for backend_type background + writer. + + + + pg_stat_io can be used to inform database tuning. + For example: + + + + A high evictions count can indicate that shared + buffers should be increased. + + + + + Client backends rely on the checkpointer to ensure data is persisted to + permanent storage. Large numbers of fsyncs by + client backends could indicate a misconfiguration of + shared buffers or of the checkpointer. More information on configuring + the checkpointer can be found in . + + + + + Normally, client backends should be able to rely on auxiliary processes + like the checkpointer and the background writer to write out dirty data + as much as possible. Large numbers of writes by client backends could + indicate a misconfiguration of shared buffers or of the checkpointer. + More information on configuring the checkpointer can be found in . + + + + +
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 8608e3fa5b1..34ca0e739f5 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1117,6 +1117,21 @@ CREATE VIEW pg_stat_bgwriter AS pg_stat_get_buf_alloc() AS buffers_alloc, pg_stat_get_bgwriter_stat_reset_time() AS stats_reset; +CREATE VIEW pg_stat_io AS +SELECT + b.backend_type, + b.io_object, + b.io_context, + b.reads, + b.writes, + b.extends, + b.op_bytes, + b.evictions, + b.reuses, + b.fsyncs, + b.stats_reset +FROM pg_stat_get_io() b; + CREATE VIEW pg_stat_wal AS SELECT w.wal_records, diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 924698e6ae4..9d707c35216 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1245,6 +1245,147 @@ pg_stat_get_buf_alloc(PG_FUNCTION_ARGS) PG_RETURN_INT64(pgstat_fetch_stat_bgwriter()->buf_alloc); } +/* +* When adding a new column to the pg_stat_io view, add a new enum value +* here above IO_NUM_COLUMNS. +*/ +typedef enum io_stat_col +{ + IO_COL_BACKEND_TYPE, + IO_COL_IO_OBJECT, + IO_COL_IO_CONTEXT, + IO_COL_READS, + IO_COL_WRITES, + IO_COL_EXTENDS, + IO_COL_CONVERSION, + IO_COL_EVICTIONS, + IO_COL_REUSES, + IO_COL_FSYNCS, + IO_COL_RESET_TIME, + IO_NUM_COLUMNS, +} io_stat_col; + +/* + * When adding a new IOOp, add a new io_stat_col and add a case to this + * function returning the corresponding io_stat_col. + */ +static io_stat_col +pgstat_get_io_op_index(IOOp io_op) +{ + switch (io_op) + { + case IOOP_EVICT: + return IO_COL_EVICTIONS; + case IOOP_READ: + return IO_COL_READS; + case IOOP_REUSE: + return IO_COL_REUSES; + case IOOP_WRITE: + return IO_COL_WRITES; + case IOOP_EXTEND: + return IO_COL_EXTENDS; + case IOOP_FSYNC: + return IO_COL_FSYNCS; + } + + elog(ERROR, "unrecognized IOOp value: %d", io_op); + pg_unreachable(); +} + +Datum +pg_stat_get_io(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo; + PgStat_IO *backends_io_stats; + Datum reset_time; + + InitMaterializedSRF(fcinfo, 0); + rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + backends_io_stats = pgstat_fetch_stat_io(); + + reset_time = TimestampTzGetDatum(backends_io_stats->stat_reset_timestamp); + + for (BackendType bktype = B_INVALID; bktype < BACKEND_NUM_TYPES; bktype++) + { + Datum bktype_desc = CStringGetTextDatum(GetBackendTypeDesc(bktype)); + PgStat_BktypeIO *bktype_stats = &backends_io_stats->stats[bktype]; + + /* + * In Assert builds, we can afford an extra loop through all of the + * counters checking that only expected stats are non-zero, since it + * keeps the non-Assert code cleaner. + */ + Assert(pgstat_bktype_io_stats_valid(bktype_stats, bktype)); + + /* + * For those BackendTypes without IO Operation stats, skip + * representing them in the view altogether. + */ + if (!pgstat_tracks_io_bktype(bktype)) + continue; + + for (IOObject io_obj = IOOBJECT_FIRST; + io_obj < IOOBJECT_NUM_TYPES; io_obj++) + { + const char *obj_name = pgstat_get_io_object_name(io_obj); + + for (IOContext io_context = IOCONTEXT_FIRST; + io_context < IOCONTEXT_NUM_TYPES; io_context++) + { + const char *context_name = pgstat_get_io_context_name(io_context); + + Datum values[IO_NUM_COLUMNS] = {0}; + bool nulls[IO_NUM_COLUMNS] = {0}; + + /* + * Some combinations of BackendType, IOObject, and IOContext + * are not valid for any type of IOOp. In such cases, omit the + * entire row from the view. + */ + if (!pgstat_tracks_io_object(bktype, io_obj, io_context)) + continue; + + values[IO_COL_BACKEND_TYPE] = bktype_desc; + values[IO_COL_IO_CONTEXT] = CStringGetTextDatum(context_name); + values[IO_COL_IO_OBJECT] = CStringGetTextDatum(obj_name); + values[IO_COL_RESET_TIME] = TimestampTzGetDatum(reset_time); + + /* + * Hard-code this to the value of BLCKSZ for now. Future + * values could include XLOG_BLCKSZ, once WAL IO is tracked, + * and constant multipliers, once non-block-oriented IO (e.g. + * temporary file IO) is tracked. + */ + values[IO_COL_CONVERSION] = Int64GetDatum(BLCKSZ); + + for (IOOp io_op = IOOP_FIRST; io_op < IOOP_NUM_TYPES; io_op++) + { + int col_idx = pgstat_get_io_op_index(io_op); + + /* + * Some combinations of BackendType and IOOp, of IOContext + * and IOOp, and of IOObject and IOOp are not tracked. Set + * these cells in the view NULL. + */ + nulls[col_idx] = !pgstat_tracks_io_op(bktype, io_obj, io_context, io_op); + + if (nulls[col_idx]) + continue; + + values[col_idx] = + Int64GetDatum(bktype_stats->data[io_obj][io_context][io_op]); + } + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + } + } + + return (Datum) 0; +} + /* * Returns statistics of WAL activity */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index c1ce0b76e14..9c298cb1253 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202301301 +#define CATALOG_VERSION_NO 202302111 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index c0f2a8a77c8..66b73c3900d 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5717,6 +5717,15 @@ proname => 'pg_stat_get_buf_alloc', provolatile => 's', proparallel => 'r', prorettype => 'int8', proargtypes => '', prosrc => 'pg_stat_get_buf_alloc' }, +{ oid => '8459', descr => 'statistics: per backend type IO statistics', + proname => 'pg_stat_get_io', provolatile => 'v', + prorows => '30', proretset => 't', + proparallel => 'r', prorettype => 'record', proargtypes => '', + proallargtypes => '{text,text,text,int8,int8,int8,int8,int8,int8,int8,timestamptz}', + proargmodes => '{o,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{backend_type,io_object,io_context,reads,writes,extends,op_bytes,evictions,reuses,fsyncs,stats_reset}', + prosrc => 'pg_stat_get_io' }, + { oid => '1136', descr => 'statistics: information about WAL activity', proname => 'pg_stat_get_wal', proisstrict => 'f', provolatile => 's', proparallel => 'r', prorettype => 'record', proargtypes => '', diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index e7a2f5856aa..174b725fff1 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1876,6 +1876,18 @@ pg_stat_gssapi| SELECT pid, gss_enc AS encrypted FROM pg_stat_get_activity(NULL::integer) s(datid, pid, usesysid, application_name, state, query, wait_event_type, wait_event, xact_start, query_start, backend_start, state_change, client_addr, client_hostname, client_port, backend_xid, backend_xmin, backend_type, ssl, sslversion, sslcipher, sslbits, ssl_client_dn, ssl_client_serial, ssl_issuer_dn, gss_auth, gss_princ, gss_enc, leader_pid, query_id) WHERE (client_port IS NOT NULL); +pg_stat_io| SELECT backend_type, + io_object, + io_context, + reads, + writes, + extends, + op_bytes, + evictions, + reuses, + fsyncs, + stats_reset + FROM pg_stat_get_io() b(backend_type, io_object, io_context, reads, writes, extends, op_bytes, evictions, reuses, fsyncs, stats_reset); pg_stat_progress_analyze| SELECT s.pid, s.datid, d.datname, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 36d1dc01177..45fc5759ced 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3378,6 +3378,7 @@ intset_internal_node intset_leaf_node intset_node intvKEY +io_stat_col itemIdCompact itemIdCompactData iterator