From: Tomas Vondra Date: Tue, 7 Apr 2026 20:33:32 +0000 (+0200) Subject: Add EXPLAIN (IO) infrastructure with BitmapHeapScan support X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=681daed93169ff5bed2796dd5c8c76ee2fae1ebd;p=thirdparty%2Fpostgresql.git Add EXPLAIN (IO) infrastructure with BitmapHeapScan support Allows collecting details about AIO / prefetch for scan nodes backed by a ReadStream. This may be enabled by a new "IO" option in EXPLAIN, and it shows information about the prefetch distance and I/O requests. As of this commit this applies only to BitmapHeapScan, because that's the only scan node using a ReadStream and collecting instrumentation from workers in a parallel query. Support for SeqScan and TidRangeScan, the other scan nodes using ReadStream, will be added in subsequent commits. The stats are collected only when required by EXPLAIN ANALYZE, with the IO option (disabled by default). The amount of collected statistics is very limited, but we don't want to clutter EXPLAIN with too much data. The IOStats struct is stored in the scan descriptor as a field, next to other fields used by table AMs. A pointer to the field is passed to the ReadStream, and updated directly. It's the responsibility of the table AM to allocate the struct (e.g. in ambeginscan) whenever the flag SO_SCAN_INSTRUMENT flag is passed to the scan, so that the executor and ReadStream has access to it. The collected stats are designed for ReadStream, but are meant to be reasonably generic in case a TAM manages I/Os in different ways. Author: Tomas Vondra Reviewed-by: Melanie Plageman Reviewed-by: Lukas Fittl Reviewed-by: Andres Freund Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me --- diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 5b8b521802e..e95e19081e1 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -46,6 +46,7 @@ EXPLAIN [ ( option [, ...] ) ] boolean ] SUMMARY [ boolean ] MEMORY [ boolean ] + IO [ boolean ] FORMAT { TEXT | XML | JSON | YAML } @@ -298,6 +299,17 @@ ROLLBACK; + + IO + + + Include information on I/O performed by scan nodes proving such information. + This parameter may only be used when ANALYZE is also + enabled. It defaults to FALSE. + + + + FORMAT diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index f6ac5a0897c..abfd8e8970a 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -43,6 +43,7 @@ #include "catalog/pg_database.h" #include "catalog/pg_database_d.h" #include "commands/vacuum.h" +#include "executor/instrument_node.h" #include "pgstat.h" #include "port/pg_bitutils.h" #include "storage/lmgr.h" @@ -1200,6 +1201,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, scan->rs_base.rs_nkeys = nkeys; scan->rs_base.rs_flags = flags; scan->rs_base.rs_parallel = parallel_scan; + scan->rs_base.rs_instrument = NULL; scan->rs_strategy = NULL; /* set in initscan */ scan->rs_cbuf = InvalidBuffer; @@ -1312,6 +1314,14 @@ heap_beginscan(Relation relation, Snapshot snapshot, sizeof(TBMIterateResult)); } + /* enable read stream instrumentation */ + if ((flags & SO_SCAN_INSTRUMENT) && (scan->rs_read_stream != NULL)) + { + scan->rs_base.rs_instrument = palloc0_object(TableScanInstrumentation); + read_stream_enable_stats(scan->rs_read_stream, + &scan->rs_base.rs_instrument->io); + } + scan->rs_vmbuffer = InvalidBuffer; return (TableScanDesc) scan; @@ -1415,6 +1425,9 @@ heap_endscan(TableScanDesc sscan) if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) UnregisterSnapshot(scan->rs_base.rs_snapshot); + if (scan->rs_base.rs_instrument) + pfree(scan->rs_base.rs_instrument); + pfree(scan); } diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index f151f21f9b3..20e0bc8f232 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/relscan.h" #include "access/xact.h" #include "catalog/pg_type.h" #include "commands/createas.h" @@ -139,6 +140,8 @@ static void show_hashagg_info(AggState *aggstate, ExplainState *es); static void show_indexsearches_info(PlanState *planstate, ExplainState *es); static void show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es); +static void show_scan_io_usage(ScanState *planstate, + ExplainState *es); static void show_instrumentation_count(const char *qlabel, int which, PlanState *planstate, ExplainState *es); static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); @@ -519,6 +522,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, instrument_option |= INSTRUMENT_BUFFERS; if (es->wal) instrument_option |= INSTRUMENT_WAL; + if (es->io) + instrument_option |= INSTRUMENT_IO; /* * We always collect timing for the entire statement, even when node-level @@ -2008,6 +2013,7 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_tidbitmap_info((BitmapHeapScanState *) planstate, es); + show_scan_io_usage((ScanState *) planstate, es); break; case T_SampleScan: show_tablesample(((SampleScan *) plan)->tablesample, @@ -3984,6 +3990,128 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es) } } +/* + * Print I/O stats - prefetching and I/O performed + * + * This prints two types of stats - "prefetch" about the prefetching done by + * ReadStream, and "I/O" issued by the stream. The prefetch stats are based + * on buffers pulled from the stream (even if no I/O is needed). The I/O + * information is related to I/O requests issued by the stream. + * + * The prefetch stats are printed if any buffer was pulled from the stream. + * For the I/O stats it depend on the output format. In non-text formats the + * information is printed if prefetch stats were printed. In text format it + * gets printed only if there were any I/O requests. + */ +static void +print_io_usage(ExplainState *es, IOStats *stats) +{ + /* don't print prefetch stats if there's nothing to report */ + if (stats->prefetch_count > 0) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + /* prefetch distance info */ + ExplainIndentText(es); + appendStringInfo(es->str, "Prefetch: avg=%.2f max=%d capacity=%d\n", + (stats->distance_sum * 1.0 / stats->prefetch_count), + stats->distance_max, + stats->distance_capacity); + + /* prefetch I/O info (only if there were actual I/Os) */ + if (stats->io_count > 0) + { + ExplainIndentText(es); + appendStringInfo(es->str, "I/O: count=%" PRIu64 " waits=%" PRIu64 + " size=%.2f in-progress=%.2f\n", + stats->io_count, stats->wait_count, + (stats->io_nblocks * 1.0 / stats->io_count), + (stats->io_in_progress * 1.0 / stats->io_count)); + } + } + else + { + ExplainPropertyFloat("Average Prefetch Distance", NULL, + (stats->distance_sum * 1.0 / stats->prefetch_count), 3, es); + ExplainPropertyInteger("Max Prefetch Distance", NULL, + stats->distance_max, es); + ExplainPropertyInteger("Prefetch Capacity", NULL, + stats->distance_capacity, es); + + ExplainPropertyUInteger("I/O Count", NULL, + stats->io_count, es); + ExplainPropertyUInteger("I/O Waits", NULL, + stats->wait_count, es); + ExplainPropertyFloat("Average I/O Size", NULL, + (stats->io_nblocks * 1.0 / Max(1, stats->io_count)), 3, es); + ExplainPropertyFloat("Average I/Os In Progress", NULL, + (stats->io_in_progress * 1.0 / Max(1, stats->io_count)), 3, es); + } + } +} + +/* + * Show information about prefetch and I/O in a scan node. + */ +static void +show_scan_io_usage(ScanState *planstate, ExplainState *es) +{ + Plan *plan = planstate->ps.plan; + IOStats stats = {0}; + + if (!es->io) + return; + + /* + * Initialize counters with stats from the local process first. + * + * The scan descriptor may not exist, e.g. if the scan did not start, or + * because of debug_parallel_query=regress. We still want to collect data + * from workers. + */ + if (planstate->ss_currentScanDesc && + planstate->ss_currentScanDesc->rs_instrument) + { + stats = planstate->ss_currentScanDesc->rs_instrument->io; + } + + /* + * Accumulate data from parallel workers (if any). + */ + switch (nodeTag(plan)) + { + case T_BitmapHeapScan: + { + SharedBitmapHeapInstrumentation *sinstrument + = ((BitmapHeapScanState *) planstate)->sinstrument; + + if (sinstrument) + { + for (int i = 0; i < sinstrument->num_workers; ++i) + { + BitmapHeapScanInstrumentation *winstrument = &sinstrument->sinstrument[i]; + + AccumulateIOStats(&stats, &winstrument->stats.io); + + if (!es->workers_state) + continue; + + ExplainOpenWorker(i, es); + print_io_usage(es, &winstrument->stats.io); + ExplainCloseWorker(i, es); + } + } + + break; + } + default: + /* ignore other plans */ + return; + } + + print_io_usage(es, &stats); +} + /* * If it's EXPLAIN ANALYZE, show instrumentation information for a plan node * diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c index 65dd4111459..0e07a63fca6 100644 --- a/src/backend/commands/explain_state.c +++ b/src/backend/commands/explain_state.c @@ -162,6 +162,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate) "EXPLAIN", opt->defname, p), parser_errposition(pstate, opt->location))); } + else if (strcmp(opt->defname, "io") == 0) + es->io = defGetBoolean(opt); else if (!ApplyExtensionExplainOption(es, opt, pstate)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -188,6 +190,12 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("EXPLAIN option %s requires ANALYZE", "TIMING"))); + /* check that IO is used with EXPLAIN ANALYZE */ + if (es->io && !es->analyze) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("EXPLAIN option %s requires ANALYZE", "IO"))); + /* check that serialize is used with EXPLAIN ANALYZE */ if (es->serialize != EXPLAIN_SERIALIZE_NONE && !es->analyze) ereport(ERROR, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index d65e2a87b42..83d6478bc2b 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -144,13 +144,20 @@ BitmapTableScanSetup(BitmapHeapScanState *node) */ if (!node->ss.ss_currentScanDesc) { + uint32 flags = SO_NONE; + + if (ScanRelIsReadOnly(&node->ss)) + flags |= SO_HINT_REL_READ_ONLY; + + if (node->ss.ps.state->es_instrument & INSTRUMENT_IO) + flags |= SO_SCAN_INSTRUMENT; + node->ss.ss_currentScanDesc = table_beginscan_bm(node->ss.ss_currentRelation, node->ss.ps.state->es_snapshot, 0, NULL, - ScanRelIsReadOnly(&node->ss) ? - SO_HINT_REL_READ_ONLY : SO_NONE); + flags); } node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator; @@ -330,6 +337,14 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) */ si->exact_pages += node->stats.exact_pages; si->lossy_pages += node->stats.lossy_pages; + + /* collect I/O instrumentation for this process */ + if (node->ss.ss_currentScanDesc && + node->ss.ss_currentScanDesc->rs_instrument) + { + AccumulateIOStats(&si->stats.io, + &node->ss.ss_currentScanDesc->rs_instrument->io); + } } /* diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index 0b6cdf7c873..b6fce4e7cc6 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -74,6 +74,7 @@ #include "postgres.h" #include "miscadmin.h" +#include "executor/instrument_node.h" #include "storage/aio.h" #include "storage/fd.h" #include "storage/smgr.h" @@ -123,6 +124,9 @@ struct ReadStream bool advice_enabled; bool temporary; + /* scan stats counters */ + IOStats *stats; + /* * One-block buffer to support 'ungetting' a block number, to resolve flow * control problems when I/Os are split. @@ -188,6 +192,73 @@ block_range_read_stream_cb(ReadStream *stream, return InvalidBlockNumber; } +/* + * Update stream stats with current pinned buffer depth. + * + * Called once per buffer returned to the consumer in read_stream_next_buffer(). + * Records the number of pinned buffers at that moment, so we can compute the + * average look-ahead depth. + */ +static inline void +read_stream_count_prefetch(ReadStream *stream) +{ + IOStats *stats = stream->stats; + + if (stats == NULL) + return; + + stats->prefetch_count++; + stats->distance_sum += stream->pinned_buffers; + if (stream->pinned_buffers > stats->distance_max) + stats->distance_max = stream->pinned_buffers; +} + +/* + * Update stream stats about size of I/O requests. + * + * We count the number of I/O requests, size of requests (counted in blocks) + * and number of in-progress I/Os. + */ +static inline void +read_stream_count_io(ReadStream *stream, int nblocks, int in_progress) +{ + IOStats *stats = stream->stats; + + if (stats == NULL) + return; + + stats->io_count++; + stats->io_nblocks += nblocks; + stats->io_in_progress += in_progress; +} + +/* + * Update stream stats about waits for I/O when consuming buffers. + * + * We count the number of I/O waits while pulling buffers out of a stream. + */ +static inline void +read_stream_count_wait(ReadStream *stream) +{ + IOStats *stats = stream->stats; + + if (stats == NULL) + return; + + stats->wait_count++; +} + +/* + * Enable collection of stats into the provided IOStats. + */ +void +read_stream_enable_stats(ReadStream *stream, IOStats *stats) +{ + stream->stats = stats; + if (stream->stats) + stream->stats->distance_capacity = stream->max_pinned_buffers; +} + /* * Ask the callback which block it would like us to read next, with a one block * buffer in front to allow read_stream_unget_block() to work. @@ -426,6 +497,9 @@ read_stream_start_pending_read(ReadStream *stream) Assert(stream->ios_in_progress < stream->max_ios); stream->ios_in_progress++; stream->seq_blocknum = stream->pending_read_blocknum + nblocks; + + /* update I/O stats */ + read_stream_count_io(stream, nblocks, stream->ios_in_progress); } /* @@ -1021,6 +1095,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) flags))) { /* Fast return. */ + read_stream_count_prefetch(stream); return buffer; } @@ -1036,6 +1111,12 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) * to avoid having to effectively do another synchronous IO for * the next block (if it were also a miss). */ + + /* update I/O stats */ + read_stream_count_io(stream, 1, stream->ios_in_progress); + + /* update prefetch distance */ + read_stream_count_prefetch(stream); } else { @@ -1116,6 +1197,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) if (stream->ios[io_index].op.flags & READ_BUFFERS_SYNCHRONOUSLY) needed_wait = true; + /* Count it as a wait if we need to wait for IO */ + if (needed_wait) + read_stream_count_wait(stream); + /* * Have the read-ahead distance ramp up rapidly after we needed to * wait for IO. We only increase the read-ahead-distance when we @@ -1228,6 +1313,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) } #endif + read_stream_count_prefetch(stream); + /* Pin transferred to caller. */ Assert(stream->pinned_buffers > 0); stream->pinned_buffers--; diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index fd2076c582a..2ea06a67a63 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -24,6 +24,7 @@ struct ParallelTableScanDescData; +struct TableScanInstrumentation; /* * Generic descriptor for table scans. This is the base-class for table scans, @@ -64,6 +65,11 @@ typedef struct TableScanDescData struct ParallelTableScanDescData *rs_parallel; /* parallel scan * information */ + + /* + * Instrumentation counters maintained by all table AMs. + */ + struct TableScanInstrumentation *rs_instrument; } TableScanDescData; typedef struct TableScanDescData *TableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index a21c7db5439..c13f05d39db 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -69,6 +69,9 @@ typedef enum ScanOptions /* set if the query doesn't modify the relation */ SO_HINT_REL_READ_ONLY = 1 << 10, + + /* collect scan instrumentation */ + SO_SCAN_INSTRUMENT = 1 << 11, } ScanOptions; /* diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h index 6252fe11f15..97bc7ed49f6 100644 --- a/src/include/commands/explain_state.h +++ b/src/include/commands/explain_state.h @@ -55,6 +55,7 @@ typedef struct ExplainState bool summary; /* print total planning and execution timing */ bool memory; /* print planner's memory usage information */ bool settings; /* print modified settings */ + bool io; /* print info about IO (prefetch, ...) */ bool generic; /* generate a generic plan */ ExplainSerializeOption serialize; /* serialize the query's output? */ ExplainFormat format; /* output format */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index cc9fbb0e2f0..f093a52aae0 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -64,6 +64,7 @@ typedef enum InstrumentOption INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */ INSTRUMENT_ROWS = 1 << 2, /* needs row count */ INSTRUMENT_WAL = 1 << 3, /* needs WAL usage */ + INSTRUMENT_IO = 1 << 4, /* needs IO usage */ INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index e6a3f9f1941..22a75ccd863 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -48,6 +48,55 @@ typedef struct SharedAggInfo } SharedAggInfo; +/* --------------------- + * Instrumentation information about read streams and I/O + * --------------------- + */ +typedef struct IOStats +{ + /* number of buffers returned to consumer (for averaging distance) */ + uint64 prefetch_count; + + /* sum of pinned_buffers sampled at each buffer return */ + uint64 distance_sum; + + /* maximum actual pinned_buffers observed during the scan */ + int16 distance_max; + + /* maximum possible look-ahead distance (max_pinned_buffers) */ + int16 distance_capacity; + + /* number of waits for a read (for the I/O) */ + uint64 wait_count; + + /* I/O stats */ + uint64 io_count; /* number of I/Os */ + uint64 io_nblocks; /* sum of blocks for all I/Os */ + uint64 io_in_progress; /* sum of in-progress I/Os */ +} IOStats; + +typedef struct TableScanInstrumentation +{ + IOStats io; +} TableScanInstrumentation; + +/* merge IO statistics from 'src' into 'dst' */ +static inline void +AccumulateIOStats(IOStats *dst, IOStats *src) +{ + dst->prefetch_count += src->prefetch_count; + dst->distance_sum += src->distance_sum; + if (src->distance_max > dst->distance_max) + dst->distance_max = src->distance_max; + if (src->distance_capacity > dst->distance_capacity) + dst->distance_capacity = src->distance_capacity; + dst->wait_count += src->wait_count; + dst->io_count += src->io_count; + dst->io_nblocks += src->io_nblocks; + dst->io_in_progress += src->io_in_progress; +} + + /* --------------------- * Instrumentation information for indexscans (amgettuple and amgetbitmap) * --------------------- @@ -79,6 +128,7 @@ typedef struct BitmapHeapScanInstrumentation { uint64 exact_pages; uint64 lossy_pages; + TableScanInstrumentation stats; } BitmapHeapScanInstrumentation; /* diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h index c9359b29b0f..48995c6d534 100644 --- a/src/include/storage/read_stream.h +++ b/src/include/storage/read_stream.h @@ -65,6 +65,7 @@ struct ReadStream; typedef struct ReadStream ReadStream; +struct IOStats; /* for block_range_read_stream_cb */ typedef struct BlockRangeReadStreamPrivate @@ -103,5 +104,6 @@ extern BlockNumber read_stream_pause(ReadStream *stream); extern void read_stream_resume(ReadStream *stream); extern void read_stream_reset(ReadStream *stream); extern void read_stream_end(ReadStream *stream); +extern void read_stream_enable_stats(ReadStream *stream, struct IOStats *stats); #endif /* READ_STREAM_H */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index a998bb5e882..ca54c783647 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1279,6 +1279,7 @@ IOContext IOFuncSelector IOObject IOOp +IOStats IO_STATUS_BLOCK IPCompareMethod ITEM @@ -3127,6 +3128,7 @@ TableLikeClause TableSampleClause TableScanDesc TableScanDescData +TableScanInstrumentation TableSpaceCacheEntry TableSpaceOpts TableToProcess