git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Add EXPLAIN (IO) instrumentation for SeqScan
author Tomas Vondra <tomas.vondra@postgresql.org>
Tue, 7 Apr 2026 21:06:43 +0000 (23:06 +0200)
committer Tomas Vondra <tomas.vondra@postgresql.org>
Tue, 7 Apr 2026 21:07:03 +0000 (23:07 +0200)
Adds support for EXPLAIN (IO) instrumentation for sequential scans. This
requires adding shared instrumentation, using the separate DSM approach
introduced by dd78e69cfc33.

Author: Tomas Vondra <tomas@vondra.me>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Lukas Fittl <lukas@fittl.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me

src/backend/commands/explain.c
src/backend/executor/execParallel.c
src/backend/executor/nodeSeqscan.c
src/include/executor/instrument_node.h
src/include/executor/nodeSeqscan.h
src/include/nodes/execnodes.h
src/test/regress/expected/explain.out
src/test/regress/sql/explain.sql
src/tools/pgindent/typedefs.list

index 20e0bc8f2322546585c5856cf620cb8bac547c01..af32f09b3a47ecaa5a4d0bb1cc5e399197c1bd96 100644 (file)
@@ -2032,6 +2032,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
                                                                                   planstate, es);
                        if (IsA(plan, CteScan))
                                show_ctescan_info(castNode(CteScanState, planstate), es);
+                       show_scan_io_usage((ScanState *) planstate, es);
                        break;
                case T_Gather:
                        {
@@ -4102,6 +4103,30 @@ show_scan_io_usage(ScanState *planstate, ExplainState *es)
                                        }
                                }
 
+                               break;
+                       }
+               case T_SeqScan:
+                       {
+                               SharedSeqScanInstrumentation *sinstrument
+                               = ((SeqScanState *) planstate)->sinstrument;
+
+                               if (sinstrument)
+                               {
+                                       for (int i = 0; i < sinstrument->num_workers; ++i)
+                                       {
+                                               SeqScanInstrumentation *winstrument = &sinstrument->sinstrument[i];
+
+                                               AccumulateIOStats(&stats, &winstrument->stats.io);
+
+                                               if (!es->workers_state)
+                                                       continue;
+
+                                               ExplainOpenWorker(i, es);
+                                               print_io_usage(es, &winstrument->stats.io);
+                                               ExplainCloseWorker(i, es);
+                                       }
+                               }
+
                                break;
                        }
                default:
index 1a5ec0c305f04af6b9289c89d8c403c186938f04..9690f0938ae7a872ad9342d46820911cc46477c4 100644 (file)
@@ -257,6 +257,9 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
                        if (planstate->plan->parallel_aware)
                                ExecSeqScanEstimate((SeqScanState *) planstate,
                                                                        e->pcxt);
+                       /* even when not parallel-aware, for EXPLAIN ANALYZE */
+                       ExecSeqScanInstrumentEstimate((SeqScanState *) planstate,
+                                                                                 e->pcxt);
                        break;
                case T_IndexScanState:
                        if (planstate->plan->parallel_aware)
@@ -500,6 +503,9 @@ ExecParallelInitializeDSM(PlanState *planstate,
                        if (planstate->plan->parallel_aware)
                                ExecSeqScanInitializeDSM((SeqScanState *) planstate,
                                                                                 d->pcxt);
+                       /* even when not parallel-aware, for EXPLAIN ANALYZE */
+                       ExecSeqScanInstrumentInitDSM((SeqScanState *) planstate,
+                                                                                d->pcxt);
                        break;
                case T_IndexScanState:
                        if (planstate->plan->parallel_aware)
@@ -1148,6 +1154,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
                case T_BitmapHeapScanState:
                        ExecBitmapHeapRetrieveInstrumentation((BitmapHeapScanState *) planstate);
                        break;
+               case T_SeqScanState:
+                       ExecSeqScanRetrieveInstrumentation((SeqScanState *) planstate);
+                       break;
                default:
                        break;
        }
@@ -1388,6 +1397,8 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
                case T_SeqScanState:
                        if (planstate->plan->parallel_aware)
                                ExecSeqScanInitializeWorker((SeqScanState *) planstate, pwcxt);
+                       /* even when not parallel-aware, for EXPLAIN ANALYZE */
+                       ExecSeqScanInstrumentInitWorker((SeqScanState *) planstate, pwcxt);
                        break;
                case T_IndexScanState:
                        if (planstate->plan->parallel_aware)
index 04803b0e37d43a702a8cc787e0fdf0180a6897c6..5bcb0a861d74e339ab5146a9658729daa0063231 100644 (file)
@@ -29,6 +29,7 @@
 
 #include "access/relscan.h"
 #include "access/tableam.h"
+#include "executor/execParallel.h"
 #include "executor/execScan.h"
 #include "executor/executor.h"
 #include "executor/nodeSeqscan.h"
@@ -65,15 +66,21 @@ SeqNext(SeqScanState *node)
 
        if (scandesc == NULL)
        {
+               uint32          flags = SO_NONE;
+
+               if (ScanRelIsReadOnly(&node->ss))
+                       flags |= SO_HINT_REL_READ_ONLY;
+
+               if (estate->es_instrument & INSTRUMENT_IO)
+                       flags |= SO_SCAN_INSTRUMENT;
+
                /*
                 * We reach here if the scan is not parallel, or if we're serially
                 * executing a scan that was planned to be parallel.
                 */
                scandesc = table_beginscan(node->ss.ss_currentRelation,
                                                                   estate->es_snapshot,
-                                                                  0, NULL,
-                                                                  ScanRelIsReadOnly(&node->ss) ?
-                                                                  SO_HINT_REL_READ_ONLY : SO_NONE);
+                                                                  0, NULL, flags);
                node->ss.ss_currentScanDesc = scandesc;
        }
 
@@ -302,6 +309,22 @@ ExecEndSeqScan(SeqScanState *node)
         */
        scanDesc = node->ss.ss_currentScanDesc;
 
+       /*
+        * Collect I/O stats for this process into shared instrumentation.
+        */
+       if (node->sinstrument != NULL && IsParallelWorker())
+       {
+               SeqScanInstrumentation *si;
+
+               Assert(ParallelWorkerNumber < node->sinstrument->num_workers);
+               si = &node->sinstrument->sinstrument[ParallelWorkerNumber];
+
+               if (scanDesc && scanDesc->rs_instrument)
+               {
+                       AccumulateIOStats(&si->stats.io, &scanDesc->rs_instrument->io);
+               }
+       }
+
        /*
         * close heap scan
         */
@@ -370,6 +393,13 @@ ExecSeqScanInitializeDSM(SeqScanState *node,
 {
        EState     *estate = node->ss.ps.state;
        ParallelTableScanDesc pscan;
+       uint32          flags = SO_NONE;
+
+       if (ScanRelIsReadOnly(&node->ss))
+               flags |= SO_HINT_REL_READ_ONLY;
+
+       if (estate->es_instrument & INSTRUMENT_IO)
+               flags |= SO_SCAN_INSTRUMENT;
 
        pscan = shm_toc_allocate(pcxt->toc, node->pscan_len);
        table_parallelscan_initialize(node->ss.ss_currentRelation,
@@ -378,9 +408,7 @@ ExecSeqScanInitializeDSM(SeqScanState *node,
        shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan);
 
        node->ss.ss_currentScanDesc =
-               table_beginscan_parallel(node->ss.ss_currentRelation, pscan,
-                                                                ScanRelIsReadOnly(&node->ss) ?
-                                                                SO_HINT_REL_READ_ONLY : SO_NONE);
+               table_beginscan_parallel(node->ss.ss_currentRelation, pscan, flags);
 }
 
 /* ----------------------------------------------------------------
@@ -410,10 +438,97 @@ ExecSeqScanInitializeWorker(SeqScanState *node,
                                                        ParallelWorkerContext *pwcxt)
 {
        ParallelTableScanDesc pscan;
+       uint32          flags = SO_NONE;
+
+       if (ScanRelIsReadOnly(&node->ss))
+               flags |= SO_HINT_REL_READ_ONLY;
+
+       if (node->ss.ps.state->es_instrument & INSTRUMENT_IO)
+               flags |= SO_SCAN_INSTRUMENT;
 
        pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
        node->ss.ss_currentScanDesc =
-               table_beginscan_parallel(node->ss.ss_currentRelation, pscan,
-                                                                ScanRelIsReadOnly(&node->ss) ?
-                                                                SO_HINT_REL_READ_ONLY : SO_NONE);
+               table_beginscan_parallel(node->ss.ss_currentRelation, pscan, flags);
+}
+
+/*
+ * Compute the amount of space we'll need for the shared instrumentation and
+ * inform pcxt->estimator.
+ *
+ * The space is the SharedSeqScanInstrumentation header plus one
+ * SeqScanInstrumentation slot per pcxt->nworkers; one chunk of that size
+ * and one TOC key are reserved.  The same formula is used by
+ * ExecSeqScanInstrumentInitDSM for the actual allocation.
+ *
+ * Does nothing unless INSTRUMENT_IO is set in es_instrument and
+ * pcxt->nworkers is nonzero.
+ */
+void
+ExecSeqScanInstrumentEstimate(SeqScanState *node, ParallelContext *pcxt)
+{
+       EState     *estate = node->ss.ps.state;
+       Size            size;
+
+       if ((estate->es_instrument & INSTRUMENT_IO) == 0 || pcxt->nworkers == 0)
+               return;
+
+       size = add_size(offsetof(SharedSeqScanInstrumentation, sinstrument),
+                                       mul_size(pcxt->nworkers, sizeof(SeqScanInstrumentation)));
+
+       shm_toc_estimate_chunk(&pcxt->estimator, size);
+       shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/*
+ * Set up parallel sequential scan instrumentation.
+ *
+ * Allocates a SharedSeqScanInstrumentation with pcxt->nworkers slots from
+ * pcxt->toc, zeroes it, records the worker count in num_workers, inserts it
+ * into the TOC under the key
+ * plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET, and remembers the
+ * pointer in node->sinstrument.
+ *
+ * Does nothing (leaving node->sinstrument untouched) unless INSTRUMENT_IO is
+ * set in es_instrument and pcxt->nworkers is nonzero.
+ */
+void
+ExecSeqScanInstrumentInitDSM(SeqScanState *node, ParallelContext *pcxt)
+{
+       EState     *estate = node->ss.ps.state;
+       SharedSeqScanInstrumentation *sinstrument;
+       Size            size;
+
+       if ((estate->es_instrument & INSTRUMENT_IO) == 0 || pcxt->nworkers == 0)
+               return;
+
+       size = add_size(offsetof(SharedSeqScanInstrumentation, sinstrument),
+                                       mul_size(pcxt->nworkers, sizeof(SeqScanInstrumentation)));
+       sinstrument = shm_toc_allocate(pcxt->toc, size);
+       memset(sinstrument, 0, size);
+       sinstrument->num_workers = pcxt->nworkers;
+       shm_toc_insert(pcxt->toc,
+                                  node->ss.ps.plan->plan_node_id +
+                                  PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
+                                  sinstrument);
+       node->sinstrument = sinstrument;
+}
+
+/*
+ * Look up and save the location of the shared instrumentation.
+ *
+ * Invoked from ExecParallelInitializeWorker.  Finds the chunk stored under
+ * plan_node_id + PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET in pwcxt->toc and saves
+ * the pointer in node->sinstrument.  Does nothing unless INSTRUMENT_IO is
+ * set in es_instrument.
+ *
+ * NOTE(review): the final "false" argument to shm_toc_lookup is presumably
+ * noError, i.e. a missing entry is treated as an error -- confirm against
+ * shm_toc_lookup's declaration.
+ */
+void
+ExecSeqScanInstrumentInitWorker(SeqScanState *node,
+                                                               ParallelWorkerContext *pwcxt)
+{
+       EState     *estate = node->ss.ps.state;
+
+       if ((estate->es_instrument & INSTRUMENT_IO) == 0)
+               return;
+
+       node->sinstrument = shm_toc_lookup(pwcxt->toc,
+                                                                          node->ss.ps.plan->plan_node_id +
+                                                                          PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
+                                                                          false);
+}
+
+/*
+ * Transfer sequential scan instrumentation from DSM to private memory.
+ *
+ * Replaces node->sinstrument with a palloc'd byte-for-byte copy of the
+ * structure it currently points to (header plus num_workers slots).  Does
+ * nothing if node->sinstrument is NULL.
+ *
+ * NOTE(review): presumably needed so the stats stay readable after the DSM
+ * segment is released -- confirm against the caller's lifetime rules.
+ */
+void
+ExecSeqScanRetrieveInstrumentation(SeqScanState *node)
+{
+       SharedSeqScanInstrumentation *sinstrument = node->sinstrument;
+       Size            size;
+
+       if (sinstrument == NULL)
+               return;
+
+       size = offsetof(SharedSeqScanInstrumentation, sinstrument)
+               + sinstrument->num_workers * sizeof(SeqScanInstrumentation);
+
+       node->sinstrument = palloc(size);
+       memcpy(node->sinstrument, sinstrument, size);
 }
index 22a75ccd863644c46e8c77982c499e072b75fa27..003dc262b5d83ef433b3eeaf029b3f39aabe0298 100644 (file)
@@ -266,4 +266,23 @@ typedef struct SharedIncrementalSortInfo
        IncrementalSortInfo sinfo[FLEXIBLE_ARRAY_MEMBER];
 } SharedIncrementalSortInfo;
 
+
+/* ---------------------
+ *     Instrumentation information for sequential scans
+ *
+ *     Wraps the TableScanInstrumentation gathered for one process; the
+ *     sequential scan and EXPLAIN code in this change read and accumulate
+ *     its stats.io member.
+ * ---------------------
+ */
+typedef struct SeqScanInstrumentation
+{
+       TableScanInstrumentation stats;
+} SeqScanInstrumentation;
+
+/*
+ * Shared memory container for per-worker information
+ *
+ * num_workers is the number of entries in sinstrument[].  Each parallel
+ * worker writes the entry indexed by its ParallelWorkerNumber, and EXPLAIN
+ * iterates over entries 0 .. num_workers - 1.
+ */
+typedef struct SharedSeqScanInstrumentation
+{
+       int                     num_workers;
+       SeqScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
+} SharedSeqScanInstrumentation;
+
 #endif                                                 /* INSTRUMENT_NODE_H */
index 7a1490596fb373a79aaa215aae06d8ace906830a..9c0ad4879d7cbcf4b874c9e975e55d8943ed75af 100644 (file)
@@ -28,4 +28,13 @@ extern void ExecSeqScanReInitializeDSM(SeqScanState *node, ParallelContext *pcxt
 extern void ExecSeqScanInitializeWorker(SeqScanState *node,
                                                                                ParallelWorkerContext *pwcxt);
 
+/*
+ * instrument support: shared-memory plumbing for per-worker sequential scan
+ * instrumentation (estimate the space, set it up in the DSM, attach to it
+ * in a worker, and copy the results back into private memory)
+ */
+extern void ExecSeqScanInstrumentEstimate(SeqScanState *node,
+                                                                                 ParallelContext *pcxt);
+extern void ExecSeqScanInstrumentInitDSM(SeqScanState *node,
+                                                                                ParallelContext *pcxt);
+extern void ExecSeqScanInstrumentInitWorker(SeqScanState *node,
+                                                                                       ParallelWorkerContext *pwcxt);
+extern void ExecSeqScanRetrieveInstrumentation(SeqScanState *node);
+
 #endif                                                 /* NODESEQSCAN_H */
index 3ecae7552fc712e0a1d72b386490cd6be03c9c58..56febb3204c96f3febebdde9b9e8194978f0be48 100644 (file)
@@ -1670,6 +1670,7 @@ typedef struct SeqScanState
 {
        ScanState       ss;                             /* its first field is NodeTag */
        Size            pscan_len;              /* size of parallel heap scan descriptor */
+       struct SharedSeqScanInstrumentation *sinstrument;
 } SeqScanState;
 
 /* ----------------
index dc31c7ce9f9764b5d5155abc4766001d27c37a4b..74a4d87801e69457228d6780bef162b7fa23e2cd 100644 (file)
@@ -100,7 +100,7 @@ select explain_filter('explain (buffers, format text) select * from int8_tbl i8'
 (1 row)
 
 \a
-select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8');
 explain_filter
 <explain xmlns="http://www.postgresql.org/N/explain">
   <Query>
@@ -119,6 +119,13 @@ explain_filter
       <Actual-Rows>N.N</Actual-Rows>
       <Actual-Loops>N</Actual-Loops>
       <Disabled>false</Disabled>
+      <Average-Prefetch-Distance>N.N</Average-Prefetch-Distance>
+      <Max-Prefetch-Distance>N</Max-Prefetch-Distance>
+      <Prefetch-Capacity>N</Prefetch-Capacity>
+      <I-O-Count>N</I-O-Count>
+      <I-O-Waits>N</I-O-Waits>
+      <Average-I-O-Size>N.N</Average-I-O-Size>
+      <Average-I-Os-In-Progress>N.N</Average-I-Os-In-Progress>
       <Shared-Hit-Blocks>N</Shared-Hit-Blocks>
       <Shared-Read-Blocks>N</Shared-Read-Blocks>
       <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks>
@@ -149,7 +156,7 @@ explain_filter
   </Query>
 </explain>
 (1 row)
-select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, serialize, buffers, io, format yaml) select * from int8_tbl i8');
 explain_filter
 - Plan: 
     Node Type: "Seq Scan"
@@ -166,6 +173,13 @@ explain_filter
     Actual Rows: N.N
     Actual Loops: N
     Disabled: false
+    Average Prefetch Distance: N.N
+    Max Prefetch Distance: N
+    Prefetch Capacity: N
+    I/O Count: N
+    I/O Waits: N
+    Average I/O Size: N.N
+    Average I/Os In Progress: N.N
     Shared Hit Blocks: N
     Shared Read Blocks: N
     Shared Dirtied Blocks: N
index 8f10e1aff55ca4d238151e3596ac45e31964802e..2f163c64bf6d38d7e8bed93d44e7a0d740ea0976 100644 (file)
@@ -69,8 +69,8 @@ select explain_filter('explain (analyze, buffers, format text) select * from int
 select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
 
 \a
-select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
-select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, serialize, buffers, io, format yaml) select * from int8_tbl i8');
 select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
 \a
 
index ca54c783647c03e488b617404a3c4be3203e6b02..f323b9d758be6b7907cad0085c3b3f2d6f838736 100644 (file)
@@ -2800,6 +2800,7 @@ SelfJoinCandidate
 SemTPadded
 SemiAntiJoinFactors
 SeqScan
+SeqScanInstrumentation
 SeqScanState
 SeqTable
 SeqTableData
@@ -2864,6 +2865,7 @@ SharedMemoizeInfo
 SharedRecordTableEntry
 SharedRecordTableKey
 SharedRecordTypmodRegistry
+SharedSeqScanInstrumentation
 SharedSortInfo
 SharedTuplestore
 SharedTuplestoreAccessor