]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Add system view pg_stat_recovery
authorMichael Paquier <michael@paquier.xyz>
Fri, 6 Mar 2026 03:37:40 +0000 (12:37 +0900)
committerMichael Paquier <michael@paquier.xyz>
Fri, 6 Mar 2026 03:37:40 +0000 (12:37 +0900)
This commit introduces pg_stat_recovery, that exposes at SQL level the
state of recovery as tracked by XLogRecoveryCtlData in shared memory,
maintained by the startup process.  This new view includes the following
fields, that are useful for monitoring purposes on a standby, once it
has reached a consistent state (making the execution of the SQL function
possible):
- Last-successfully replayed WAL record LSN boundaries and its timeline.
- Currently replaying WAL record end LSN and its timeline.
- Current WAL chunk start time.
- Promotion trigger state.
- Timestamp of latest processed commit/abort.
- Recovery pause state.

Some of this data can already be recovered from different system
functions, but not all of it.  See pg_get_wal_replay_pause_state or
pg_last_xact_replay_timestamp.  This new view offers a stronger
consistency guarantee, by grabbing the recovery state for all fields
through one spinlock acquisition.

The system view relies on a new function, called pg_stat_get_recovery().
Querying this data requires the pg_read_all_stats privilege.  The view
returns no rows if the node is not in recovery.

This feature originates from a suggestion I have made while discussing
the addition of a CONNECTING state to the WAL receiver's shared memory
state, because we lacked access to some of the state data.  The author
has taken the time to implement it, so thanks for that.

Bump catalog version.

Author: Xuneng Zhou <xunengzhou@gmail.com>
Discussion: https://postgr.es/m/CABPTF7W+Nody-+P9y4PNk37-QWuLpfUrEonHuEhrX+Vx9Kq+Kw@mail.gmail.com
Discussion: https://postgr.es/m/aW13GJn_RfTJIFCa@paquier.xyz

doc/src/sgml/monitoring.sgml
src/backend/access/transam/xlogfuncs.c
src/backend/catalog/system_views.sql
src/include/catalog/catversion.h
src/include/catalog/pg_proc.dat
src/test/recovery/t/001_stream_rep.pl
src/test/regress/expected/rules.out
src/test/regress/expected/sysviews.out
src/test/regress/sql/sysviews.sql

index dcf6e6a2f48a2d6a1c46de87376fabff0fbc9644..b3d5355068801bc89773e7f60cd7c48158270e11 100644 (file)
@@ -338,6 +338,14 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
       </entry>
      </row>
 
+     <row>
+      <entry><structname>pg_stat_recovery</structname><indexterm><primary>pg_stat_recovery</primary></indexterm></entry>
+      <entry>Only one row, showing statistics about the state of recovery.
+       See <link linkend="monitoring-pg-stat-recovery-view">
+       <structname>pg_stat_recovery</structname></link> for details.
+      </entry>
+     </row>
+
      <row>
       <entry><structname>pg_stat_recovery_prefetch</structname><indexterm><primary>pg_stat_recovery_prefetch</primary></indexterm></entry>
       <entry>Only one row, showing statistics about blocks prefetched during recovery.
@@ -1912,6 +1920,149 @@ description | Waiting for a newly initialized WAL file to reach durable storage
 
  </sect2>
 
+ <sect2 id="monitoring-pg-stat-recovery-view">
+  <title><structname>pg_stat_recovery</structname></title>
+
+  <indexterm>
+   <primary>pg_stat_recovery</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_stat_recovery</structname> view will contain only
+   one row, showing statistics about the recovery state of the startup
+   process. This view returns no row when the server is not in recovery.
+  </para>
+
+  <table id="pg-stat-recovery-view" xreflabel="pg_stat_recovery">
+   <title><structname>pg_stat_recovery</structname> View</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>promote_triggered</structfield> <type>boolean</type>
+      </para>
+      <para>
+       True if a promotion has been triggered.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>last_replayed_read_lsn</structfield> <type>pg_lsn</type>
+      </para>
+      <para>
+       Start write-ahead log location of the last successfully replayed
+       WAL record.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>last_replayed_end_lsn</structfield> <type>pg_lsn</type>
+      </para>
+      <para>
+       End write-ahead log location of the last successfully replayed
+       WAL record.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>last_replayed_tli</structfield> <type>integer</type>
+      </para>
+      <para>
+       Timeline of the last successfully replayed WAL record.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>replay_end_lsn</structfield> <type>pg_lsn</type>
+      </para>
+      <para>
+       Write-ahead log location of the record currently being replayed
+       (end position plus one).  When no record is being actively replayed,
+       equals <structfield>last_replayed_end_lsn</structfield>.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>replay_end_tli</structfield> <type>integer</type>
+      </para>
+      <para>
+       Timeline of the WAL record currently being replayed.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+        <structfield>recovery_last_xact_time</structfield> <type>timestamp with time zone</type>
+       </para>
+       <para>
+        Timestamp of the last transaction commit or abort replayed during
+        recovery. This is the time at which the commit or abort WAL record
+        for that transaction was generated on the primary.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>current_chunk_start_time</structfield> <type>timestamp with time zone</type>
+      </para>
+      <para>
+       Time when the startup process observed that replay had caught up
+       with the latest received WAL chunk.  Used in recovery-conflict
+       timing and replay/apply-lag diagnostics.  NULL if not yet
+       available.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>pause_state</structfield> <type>text</type>
+      </para>
+      <para>
+       Recovery pause state. Possible values are:
+      </para>
+       <itemizedlist>
+        <listitem>
+         <para>
+          <literal>not paused</literal>: Recovery is proceeding normally.
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>pause requested</literal>: A pause has been requested
+          but recovery has not yet paused.
+         </para>
+        </listitem>
+        <listitem>
+         <para>
+          <literal>paused</literal>: Recovery is paused.
+         </para>
+        </listitem>
+       </itemizedlist>
+      </entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+ </sect2>
+
  <sect2 id="monitoring-pg-stat-recovery-prefetch">
   <title><structname>pg_stat_recovery_prefetch</structname></title>
 
index 785430558953b713928b6ca263d196422fa8ed87..7c0e430b690939e07dc9db84aac3bb0418ed2ddd 100644 (file)
 #include "access/xlog_internal.h"
 #include "access/xlogbackup.h"
 #include "access/xlogrecovery.h"
+#include "catalog/pg_authid.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "utils/acl.h"
 #include "replication/walreceiver.h"
 #include "storage/fd.h"
 #include "storage/latch.h"
@@ -763,3 +765,95 @@ pg_promote(PG_FUNCTION_ARGS)
                                                   wait_seconds)));
        PG_RETURN_BOOL(false);
 }
+
+/*
+ * pg_stat_get_recovery - returns information about WAL recovery state
+ *
+ * Returns NULL when not in recovery or when the caller lacks
+ * pg_read_all_stats privileges; one row otherwise.
+ */
+Datum
+pg_stat_get_recovery(PG_FUNCTION_ARGS)
+{
+       TupleDesc       tupdesc;
+       Datum      *values;
+       bool       *nulls;
+
+       /* Local copies of shared state */
+       bool            promote_triggered;
+       XLogRecPtr      last_replayed_read_lsn;
+       XLogRecPtr      last_replayed_end_lsn;
+       TimeLineID      last_replayed_tli;
+       XLogRecPtr      replay_end_lsn;
+       TimeLineID      replay_end_tli;
+       TimestampTz recovery_last_xact_time;
+       TimestampTz current_chunk_start_time;
+       RecoveryPauseState pause_state;
+
+       if (!RecoveryInProgress())
+               PG_RETURN_NULL();
+
+       if (!has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS))
+               PG_RETURN_NULL();
+
+       /* Take a lock to ensure value consistency */
+       SpinLockAcquire(&XLogRecoveryCtl->info_lck);
+       promote_triggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
+       last_replayed_read_lsn = XLogRecoveryCtl->lastReplayedReadRecPtr;
+       last_replayed_end_lsn = XLogRecoveryCtl->lastReplayedEndRecPtr;
+       last_replayed_tli = XLogRecoveryCtl->lastReplayedTLI;
+       replay_end_lsn = XLogRecoveryCtl->replayEndRecPtr;
+       replay_end_tli = XLogRecoveryCtl->replayEndTLI;
+       recovery_last_xact_time = XLogRecoveryCtl->recoveryLastXTime;
+       current_chunk_start_time = XLogRecoveryCtl->currentChunkStartTime;
+       pause_state = XLogRecoveryCtl->recoveryPauseState;
+       SpinLockRelease(&XLogRecoveryCtl->info_lck);
+
+       if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+               elog(ERROR, "return type must be a row type");
+
+       values = palloc0_array(Datum, tupdesc->natts);
+       nulls = palloc0_array(bool, tupdesc->natts);
+
+       /* Fill columns in the order of the OUT arguments (see pg_proc.dat) */
+       values[0] = BoolGetDatum(promote_triggered);
+
+       if (XLogRecPtrIsValid(last_replayed_read_lsn))
+               values[1] = LSNGetDatum(last_replayed_read_lsn);
+       else
+               nulls[1] = true;
+
+       if (XLogRecPtrIsValid(last_replayed_end_lsn))
+               values[2] = LSNGetDatum(last_replayed_end_lsn);
+       else
+               nulls[2] = true;
+
+       if (XLogRecPtrIsValid(last_replayed_end_lsn))
+               values[3] = Int32GetDatum(last_replayed_tli);
+       else
+               nulls[3] = true;
+
+       if (XLogRecPtrIsValid(replay_end_lsn))
+               values[4] = LSNGetDatum(replay_end_lsn);
+       else
+               nulls[4] = true;
+
+       if (XLogRecPtrIsValid(replay_end_lsn))
+               values[5] = Int32GetDatum(replay_end_tli);
+       else
+               nulls[5] = true;
+
+       if (recovery_last_xact_time != 0)
+               values[6] = TimestampTzGetDatum(recovery_last_xact_time);
+       else
+               nulls[6] = true;
+
+       if (current_chunk_start_time != 0)
+               values[7] = TimestampTzGetDatum(current_chunk_start_time);
+       else
+               nulls[7] = true;
+
+       values[8] = CStringGetTextDatum(GetRecoveryPauseStateString(pause_state));
+
+       PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
index e5c3e1855c1554bd8878629d084395ebc174d104..2eda7d80d022fac48ba872ec8537bed29d93b7eb 100644 (file)
@@ -998,6 +998,20 @@ CREATE VIEW pg_stat_wal_receiver AS
     FROM pg_stat_get_wal_receiver() s
     WHERE s.pid IS NOT NULL;
 
+CREATE VIEW pg_stat_recovery AS
+    SELECT
+            s.promote_triggered,
+            s.last_replayed_read_lsn,
+            s.last_replayed_end_lsn,
+            s.last_replayed_tli,
+            s.replay_end_lsn,
+            s.replay_end_tli,
+            s.recovery_last_xact_time,
+            s.current_chunk_start_time,
+            s.pause_state
+    FROM pg_stat_get_recovery() s
+    WHERE s.promote_triggered IS NOT NULL;  -- no row when the function returns NULL
+
 CREATE VIEW pg_stat_recovery_prefetch AS
     SELECT
             s.stats_reset,
index 123e7c4261b8cfea73266b032c1cc9cd8fda589a..b863edfabdad89afc13847ff83f5138aa3bbcdbe 100644 (file)
@@ -57,6 +57,6 @@
  */
 
 /*                                                     yyyymmddN */
-#define CATALOG_VERSION_NO     202603051
+#define CATALOG_VERSION_NO     202603061
 
 #endif
index 4950bff280453289a3569e7e028b1d3409396602..361e2cfffebe94f39f4671bd56ee01ffab23ed52 100644 (file)
   proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
   proargnames => '{pid,status,receive_start_lsn,receive_start_tli,written_lsn,flushed_lsn,received_tli,last_msg_send_time,last_msg_receipt_time,latest_end_lsn,latest_end_time,slot_name,sender_host,sender_port,conninfo}',
   prosrc => 'pg_stat_get_wal_receiver' },
+{ oid => '9949', descr => 'statistics: information about WAL recovery',
+  proname => 'pg_stat_get_recovery', proisstrict => 'f', provolatile => 's',
+  proparallel => 'r', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{bool,pg_lsn,pg_lsn,int4,pg_lsn,int4,timestamptz,timestamptz,text}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o}',
+  proargnames => '{promote_triggered,last_replayed_read_lsn,last_replayed_end_lsn,last_replayed_tli,replay_end_lsn,replay_end_tli,recovery_last_xact_time,current_chunk_start_time,pause_state}',
+  prosrc => 'pg_stat_get_recovery' },
 { oid => '6169', descr => 'statistics: information about replication slot',
   proname => 'pg_stat_get_replication_slot', provolatile => 's',
   proparallel => 'r', prorettype => 'record', proargtypes => 'text',
index e9ac67813c70dc1dceaf67e16167cba539873307..a4fa4b96c61f3390075252bd7a1fd5e34f33ca29 100644 (file)
@@ -82,6 +82,11 @@ $result =
 print "standby 2: $result\n";
 is($result, qq(1002), 'check streamed content on standby 2');
 
+$result = $node_standby_1->safe_psql('postgres',
+       "SELECT count(*) FROM pg_stat_recovery WHERE promote_triggered IS NOT NULL"
+);
+is($result, qq(1), 'check recovery state on standby 1');
+
 # Likewise, but for a sequence
 $node_primary->safe_psql('postgres',
        "CREATE SEQUENCE seq1; SELECT nextval('seq1')");
index 78a37d9fc8f8c2f07a5e7fb245e2865a3ac99c6a..deb6e2ad6a94b354935885cc0279d265607c87ef 100644 (file)
@@ -2127,6 +2127,17 @@ pg_stat_progress_vacuum| SELECT s.pid,
         END AS started_by
    FROM (pg_stat_get_progress_info('VACUUM'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20)
      LEFT JOIN pg_database d ON ((s.datid = d.oid)));
+pg_stat_recovery| SELECT promote_triggered,
+    last_replayed_read_lsn,
+    last_replayed_end_lsn,
+    last_replayed_tli,
+    replay_end_lsn,
+    replay_end_tli,
+    recovery_last_xact_time,
+    current_chunk_start_time,
+    pause_state
+   FROM pg_stat_get_recovery() s(promote_triggered, last_replayed_read_lsn, last_replayed_end_lsn, last_replayed_tli, replay_end_lsn, replay_end_tli, recovery_last_xact_time, current_chunk_start_time, pause_state)
+  WHERE (promote_triggered IS NOT NULL);
 pg_stat_recovery_prefetch| SELECT stats_reset,
     prefetch,
     hit,
index 3dd63fd88ed486070009035e437f99853be2a190..132b56a5864ca62c86b8adb2aa97425fac9c0178 100644 (file)
@@ -143,6 +143,13 @@ select count(*) = 0 as ok from pg_stat_wal_receiver;
  t
 (1 row)
 
+-- We expect no recovery state in this test (running on primary)
+select count(*) = 0 as ok from pg_stat_recovery;
+ ok 
+----
+ t
+(1 row)
+
 -- This is to record the prevailing planner enable_foo settings during
 -- a regression test run.
 select name, setting from pg_settings where name like 'enable%';
index 004f9a70e00d87ce985782b4c10ca2c467715961..507e400ad4af9ad26ce5542ede080e4e24cc2af2 100644 (file)
@@ -76,6 +76,9 @@ select count(*) = 1 as ok from pg_stat_wal;
 -- We expect no walreceiver running in this test
 select count(*) = 0 as ok from pg_stat_wal_receiver;
 
+-- We expect no recovery state in this test (running on primary)
+select count(*) = 0 as ok from pg_stat_recovery;
+
 -- This is to record the prevailing planner enable_foo settings during
 -- a regression test run.
 select name, setting from pg_settings where name like 'enable%';