</listitem>
</varlistentry>
+ <varlistentry id="guc-wal-sender-shutdown-timeout" xreflabel="wal_sender_shutdown_timeout">
+ <term><varname>wal_sender_shutdown_timeout</varname> (<type>integer</type>)
+ <indexterm>
+ <primary><varname>wal_sender_shutdown_timeout</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ Specifies the maximum time the server waits during shutdown for all
+ WAL data to be replicated to the receiver. If this value is specified
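+ without units, it is taken as milliseconds. A value of
+ <literal>-1</literal> (the default) disables the timeout mechanism.
+ A value of <literal>0</literal> makes each WAL sender stop waiting
+ and exit as soon as it notices the shutdown request.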
+ </para>
+ <para>
+ When replication is in use, the sending server normally waits until
+ all WAL data has been transferred to the receiver before completing
+ shutdown. This helps keep sender and receiver in sync after shutdown,
+ which is especially important for physical replication switchovers,
+ but it can delay shutdown.
+ </para>
+ <para>
+ If this parameter is set, the server stops waiting and completes
+ shutdown when the timeout expires. This can shorten shutdown time,
+ for example, when replication is slow on high-latency networks or
+ when a logical replication apply worker is blocked waiting for locks.
+ However, in this case the sender and receiver may be out of sync after
+ shutdown.
+ </para>
+ <para>
+ This parameter can be set in <varname>primary_conninfo</varname> and
+ in the <literal>CONNECTION</literal> clause of
+ <command>CREATE SUBSCRIPTION</command> (for example, include
+ <literal>options=-cwal_sender_shutdown_timeout=10s</literal> in the
+ connection string), allowing different timeouts per replication
+ connection. For example, when both physical and logical replication
+ are used, it can be disabled for physical replication (e.g., for
+ switchovers) while enabled for logical replication to limit shutdown
+ time.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="guc-track-commit-timestamp" xreflabel="track_commit_timestamp">
<term><varname>track_commit_timestamp</varname> (<type>boolean</type>)
<indexterm>
</para>
<para>
- Users will stop waiting if a fast shutdown is requested. However, as
- when using asynchronous replication, the server will not fully
- shutdown until all outstanding WAL records are transferred to the currently
- connected standby servers.
+ Users will stop waiting if a fast shutdown is requested. However, when
+ using replication, the server will not fully shut down until all
+ outstanding WAL records are transferred to the currently connected
+ standby servers, or <xref linkend="guc-wal-sender-shutdown-timeout"/>
+ (if set) expires, regardless of whether replication is synchronous or
+ asynchronous.
</para>
</sect3>
* checkpoint finishes, the postmaster sends us SIGUSR2. This instructs
* walsender to send any outstanding WAL, including the shutdown checkpoint
* record, wait for it to be replicated to the standby, and then exit.
+ * This waiting time can be limited by the wal_sender_shutdown_timeout
+ * parameter.
*
*
* Portions Copyright (c) 2010-2026, PostgreSQL Global Development Group
* walsenders */
int wal_sender_timeout = 60 * 1000; /* maximum time to send one WAL
* data message */
+
+int wal_sender_shutdown_timeout = -1; /* maximum time to wait during
+ * shutdown for WAL
+ * replication */
+
bool log_replication_commands = false;
/*
/* Have we sent a heartbeat message asking for reply, since last reply? */
static bool waiting_for_ping_response = false;
+/* Time at which walsender first noticed the shutdown request; 0 if none */
+static TimestampTz shutdown_request_timestamp = 0;
+
/*
* While streaming WAL in Copy mode, streamingDoneSending is set to true
* after we have sent CopyDone. We should not send any more CopyData messages
pg_noreturn static void WalSndShutdown(void);
static void XLogSendPhysical(void);
static void XLogSendLogical(void);
+pg_noreturn static void WalSndDoneImmediate(void);
static void WalSndDone(WalSndSendDataCallback send_data);
static void IdentifySystem(void);
static void UploadManifest(void);
static void WalSndKeepalive(bool requestReply, XLogRecPtr writePtr);
static void WalSndKeepaliveIfNecessary(void);
static void WalSndCheckTimeOut(void);
+static void WalSndCheckShutdownTimeout(void);
static long WalSndComputeSleeptime(TimestampTz now);
static void WalSndWait(uint32 socket_events, long timeout, uint32 wait_event);
static void WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write);
/* die if timeout was reached */
WalSndCheckTimeOut();
+ /*
+ * During shutdown, die if the shutdown timeout expires. Call this
+ * before WalSndComputeSleeptime() so the timeout is considered when
+ * computing sleep time.
+ */
+ WalSndCheckShutdownTimeout();
+
/* Send keepalive if the time has come */
WalSndKeepaliveIfNecessary();
/* die if timeout was reached */
WalSndCheckTimeOut();
+ /*
+ * During shutdown, die if the shutdown timeout expires. Call this
+ * before WalSndComputeSleeptime() so the timeout is considered when
+ * computing sleep time.
+ */
+ WalSndCheckShutdownTimeout();
+
/* Send keepalive if the time has come */
WalSndKeepaliveIfNecessary();
* If wal_sender_timeout is enabled we want to wake up in time to send
* keepalives and to abort the connection if wal_sender_timeout has been
* reached.
+ *
+ * If wal_sender_shutdown_timeout is enabled, during shutdown, we want to
+ * wake up in time to exit when it expires.
*/
static long
WalSndComputeSleeptime(TimestampTz now)
{
+ TimestampTz wakeup_time;
long sleeptime = 10000; /* 10 s */
if (wal_sender_timeout > 0 && last_reply_timestamp > 0)
{
- TimestampTz wakeup_time;
-
/*
* At the latest stop sleeping once wal_sender_timeout has been
* reached.
sleeptime = TimestampDifferenceMilliseconds(now, wakeup_time);
}
+ if (shutdown_request_timestamp != 0 && wal_sender_shutdown_timeout > 0)
+ {
+ long shutdown_sleeptime;
+
+ wakeup_time = TimestampTzPlusMilliseconds(shutdown_request_timestamp,
+ wal_sender_shutdown_timeout);
+
+ shutdown_sleeptime = TimestampDifferenceMilliseconds(now, wakeup_time);
+
+ /* Choose the earliest wakeup. */
+ if (shutdown_sleeptime < sleeptime)
+ sleeptime = shutdown_sleeptime;
+ }
+
return sleeptime;
}
}
}
+/*
+ * Check whether the walsender process should terminate due to the expiration
+ * of wal_sender_shutdown_timeout after the receipt of a shutdown request.
+ *
+ * Nothing happens until got_STOPPING or got_SIGUSR2 is set. The first call
+ * after that only records the current time as the start of the timeout
+ * period; the elapsed time is checked on subsequent calls. When the timeout
+ * has expired, or is set to 0, WalSndDoneImmediate() is called to terminate
+ * the process. A setting of -1 disables the check.
+ */
+static void
+WalSndCheckShutdownTimeout(void)
+{
+ TimestampTz now;
+
+ /* Do nothing if shutdown has not been requested yet */
+ if (!(got_STOPPING || got_SIGUSR2))
+ return;
+
+ /* Terminate immediately if the timeout is set to 0 */
+ if (wal_sender_shutdown_timeout == 0)
+ WalSndDoneImmediate();
+
+ /*
+ * Record the shutdown request timestamp even if
+ * wal_sender_shutdown_timeout is disabled (-1), since the setting may
+ * change during shutdown and the timestamp will be needed in that case.
+ */
+ if (shutdown_request_timestamp == 0)
+ {
+ shutdown_request_timestamp = GetCurrentTimestamp();
+ return;
+ }
+
+ /* Do not check the timeout if it's disabled */
+ if (wal_sender_shutdown_timeout == -1)
+ return;
+
+ /* Terminate immediately if the timeout expires */
+ now = GetCurrentTimestamp();
+ if (TimestampDifferenceExceeds(shutdown_request_timestamp, now,
+ wal_sender_shutdown_timeout))
+ WalSndDoneImmediate();
+}
+
+
/* Main loop of walsender process that streams the WAL over Copy messages. */
static void
WalSndLoop(WalSndSendDataCallback send_data)
/* Check for replication timeout. */
WalSndCheckTimeOut();
+ /*
+ * During shutdown, die if the shutdown timeout expires. Call this
+ * before WalSndComputeSleeptime() so the timeout is considered when
+ * computing sleep time.
+ */
+ WalSndCheckShutdownTimeout();
+
/* Send keepalive if the time has come */
WalSndKeepaliveIfNecessary();
}
}
+/*
+ * Forced shutdown of walsender if wal_sender_shutdown_timeout has expired.
+ *
+ * Also used when wal_sender_shutdown_timeout is 0. If the walsender is in
+ * the CATCHUP, STREAMING, or STOPPING state, a non-blocking attempt is made
+ * to tell the receiver that streaming is done. A WARNING is then reported
+ * and the process exits via proc_exit(0).
+ */
+static void
+WalSndDoneImmediate(void)
+{
+ WalSndState state = MyWalSnd->state;
+
+ if (state == WALSNDSTATE_CATCHUP ||
+ state == WALSNDSTATE_STREAMING ||
+ state == WALSNDSTATE_STOPPING)
+ {
+ QueryCompletion qc;
+
+ /* Try to inform receiver that XLOG streaming is done */
+ SetQueryCompletion(&qc, CMDTAG_COPY, 0);
+ EndCommand(&qc, DestRemote, false);
+
+ /*
+ * Note that the output buffer may be full during the forced shutdown
+ * of walsender. If pq_flush() is called at that time, the walsender
+ * process will be stuck. Therefore, call pq_flush_if_writable()
+ * instead. Successful reception of the done message with the
+ * walsender forced into a shutdown is not guaranteed.
+ */
+ pq_flush_if_writable();
+ }
+
+ /*
+ * Prevent ereport from attempting to send any more messages to the
+ * standby. Otherwise, it can cause the process to get stuck if the output
+ * buffers are full.
+ */
+ if (whereToSendOutput == DestRemote)
+ whereToSendOutput = DestNone;
+
+ ereport(WARNING,
+ (errmsg("terminating walsender process due to replication shutdown timeout"),
+ errdetail("Walsender process might have been terminated before all WAL data was replicated to the receiver.")));
+
+ proc_exit(0);
+}
+
+
/*
* Shutdown if the sender is caught up.
*
check_hook => 'check_wal_segment_size',
},
+{ name => 'wal_sender_shutdown_timeout', type => 'int', context => 'PGC_USERSET', group => 'REPLICATION_SENDING',
+ short_desc => 'Sets the maximum time the server waits during shutdown for all WAL data to be replicated to the receiver.',
+ long_desc => '-1 disables the timeout',
+ flags => 'GUC_UNIT_MS',
+ variable => 'wal_sender_shutdown_timeout',
+ boot_val => '-1',
+ min => '-1',
+ max => 'INT_MAX',
+},
+
{ name => 'wal_sender_timeout', type => 'int', context => 'PGC_USERSET', group => 'REPLICATION_SENDING',
short_desc => 'Sets the maximum time to wait for WAL replication.',
flags => 'GUC_UNIT_MS',
#max_slot_wal_keep_size = -1 # in megabytes; -1 disables
#idle_replication_slot_timeout = 0 # in seconds; 0 disables
#wal_sender_timeout = 60s # in milliseconds; 0 disables
+#wal_sender_shutdown_timeout = -1 # in milliseconds; -1 disables
#track_commit_timestamp = off # collect timestamp of transaction commit
# (change requires restart)
/* user-settable parameters */
extern PGDLLIMPORT int max_wal_senders;
extern PGDLLIMPORT int wal_sender_timeout;
+extern PGDLLIMPORT int wal_sender_shutdown_timeout;
extern PGDLLIMPORT bool log_replication_commands;
extern void InitWalSender(void);
't/035_conflicts.pl',
't/036_sequences.pl',
't/037_except.pl',
+ 't/038_walsnd_shutdown_timeout.pl',
't/100_bugs.pl',
],
},
--- /dev/null
+
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Checks that the publisher is able to shut down without
+# waiting for sending of all pending data to the subscriber
+# with wal_sender_shutdown_timeout set
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use Time::HiRes qw(usleep);
+
+# Initialize publisher node
+my $node_publisher = PostgreSQL::Test::Cluster->new('publisher');
+$node_publisher->init(allows_streaming => 'logical');
+$node_publisher->append_conf(
+ 'postgresql.conf',
+ qq(wal_sender_timeout = 1h
+ wal_sender_shutdown_timeout = 10ms));
+$node_publisher->start;
+
+# Initialize subscriber node
+my $node_subscriber = PostgreSQL::Test::Cluster->new('subscriber');
+$node_subscriber->init;
+$node_subscriber->start;
+
+# Create publication for test table
+$node_publisher->safe_psql(
+ 'postgres', qq(
+ CREATE TABLE test_tab (id int PRIMARY KEY);
+ CREATE PUBLICATION test_pub FOR TABLE test_tab;
+));
+
+# Create matching table and subscription on subscriber
+my $publisher_connstr = $node_publisher->connstr . ' dbname=postgres';
+$node_subscriber->safe_psql(
+ 'postgres', qq(
+ CREATE TABLE test_tab (id int PRIMARY KEY);
+ CREATE SUBSCRIPTION test_sub CONNECTION '$publisher_connstr' PUBLICATION test_pub WITH (failover = true);
+));
+
+# Wait for initial table sync to finish
+$node_subscriber->wait_for_subscription_sync($node_publisher, 'test_sub');
+
+# Start a background session on the subscriber to run a transaction later
+# that will block the logical apply worker on a lock.
+my $sub_session = $node_subscriber->background_psql('postgres');
+
+# Test that when the logical apply worker is blocked on a lock and replication
+# is stalled, shutting down the publisher causes the logical walsender to exit
+# due to wal_sender_shutdown_timeout, allowing shutdown to complete.
+
+# Cause the logical apply worker to block on a lock by running conflicting
+# transactions on the publisher and subscriber.
+$sub_session->query_safe("BEGIN; INSERT INTO test_tab VALUES (0);");
+$node_publisher->safe_psql('postgres', "INSERT INTO test_tab VALUES (0);");
+
+my $log_offset = -s $node_publisher->logfile;
+
+# Verify that the walsender exits due to wal_sender_shutdown_timeout.
+$node_publisher->stop('fast');
+ok( $node_publisher->log_contains(
+ qr/WARNING: .* terminating walsender process due to replication shutdown timeout/,
+ $log_offset),
+ "walsender exits due to wal_sender_shutdown_timeout");
+
+$sub_session->query_safe("ABORT;");
+$node_publisher->start;
+$node_publisher->wait_for_catchup('test_sub');
+
+# Test that when the logical apply worker is blocked on a lock, replication
+# is stalled, and the logical walsender's output buffer is full, shutting down
+# the publisher causes the walsender to exit due to
+# wal_sender_shutdown_timeout, allowing shutdown to complete.
+#
+# This test differs from the previous one in that the walsender's output
+# buffer is full (because pending data cannot be transferred).
+
+# Run a transaction on the subscriber that blocks the logical apply worker
+# on a lock.
+$sub_session->query_safe("BEGIN; LOCK TABLE test_tab IN EXCLUSIVE MODE;");
+
+# Generate enough data to fill the logical walsender's output buffer.
+$node_publisher->safe_psql('postgres',
+ "INSERT INTO test_tab VALUES (generate_series(1, 20000));");
+
+# Wait for the logical walsender's output buffer to fill. If the WAL send
+# positions do not advance between checks, treat the buffer as full.
+my $last_sent_lsn = $node_publisher->safe_psql('postgres',
+ "SELECT sent_lsn FROM pg_stat_replication WHERE application_name = 'test_sub';"
+);
+
+my $max_attempts = $PostgreSQL::Test::Utils::timeout_default * 10;
+while ($max_attempts-- >= 0)
+{
+ usleep(100_000);
+
+ my $cur_sent_lsn = $node_publisher->safe_psql('postgres',
+ "SELECT sent_lsn FROM pg_stat_replication WHERE application_name = 'test_sub';"
+ );
+
+ my $diff = $node_publisher->safe_psql('postgres',
+ "SELECT pg_wal_lsn_diff('$cur_sent_lsn', '$last_sent_lsn');");
+ last if $diff == 0;
+
+ $last_sent_lsn = $cur_sent_lsn;
+}
+
+$log_offset = -s $node_publisher->logfile;
+
+# Verify that the walsender exits due to wal_sender_shutdown_timeout.
+$node_publisher->stop('fast');
+ok( $node_publisher->log_contains(
+ qr/WARNING: .* terminating walsender process due to replication shutdown timeout/,
+ $log_offset),
+ "walsender with full output buffer exits due to wal_sender_shutdown_timeout"
+);
+
+$sub_session->query_safe("ABORT;");
+
+# The next test depends on Perl's `kill`, which apparently is not
+# portable to Windows. (It would be nice to use Test::More's `subtest`,
+# but that's not in the ancient version we require.)
+if ($PostgreSQL::Test::Utils::windows_os)
+{
+ $node_subscriber->stop('fast');
+ done_testing();
+ exit;
+}
+
+$node_publisher->start;
+
+# Test that wal_sender_shutdown_timeout works correctly when both physical
+# and logical replication are active, and slot synchronization is running on
+# the standby.
+#
+# In this scenario, the logical apply worker is blocked on a lock and
+# the standby's walreceiver is stopped (via SIGSTOP signal), stalling both
+# replication streams. Verify that shutting down the publisher (primary)
+# causes both physical and logical walsenders to exit due to
+# wal_sender_shutdown_timeout, allowing shutdown to complete.
+#
+# Skip this test on Windows.
+
+# Create the standby with slot synchronization enabled.
+$node_publisher->backup(
+ 'publisher_backup',
+ backup_options => [
+ '--create-slot', '--slot',
+ 'test_slot', '-d',
+ 'dbname=postgres', '--write-recovery-conf'
+ ]);
+
+$node_publisher->append_conf('postgresql.conf',
+ "synchronized_standby_slots = 'test_slot'");
+$node_publisher->reload;
+
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_publisher, 'publisher_backup');
+$node_standby->append_conf(
+ 'postgresql.conf',
+ qq(sync_replication_slots = on
+ hot_standby_feedback = on));
+$node_standby->start;
+
+# Cause the logical apply worker to block on a lock by running conflicting
+# transactions on the publisher and subscriber, stalling logical replication.
+$node_publisher->wait_for_catchup('test_sub');
+$sub_session->query_safe("BEGIN; LOCK TABLE test_tab IN EXCLUSIVE MODE;");
+$node_publisher->safe_psql('postgres', "INSERT INTO test_tab VALUES (-1); ");
+
+# Cause the standby's walreceiver to be blocked with SIGSTOP signal,
+# stalling physical replication.
+$node_standby->poll_query_until('postgres',
+ "SELECT EXISTS(SELECT 1 FROM pg_stat_wal_receiver)");
+my $receiverpid = $node_standby->safe_psql('postgres',
+ "SELECT pid FROM pg_stat_wal_receiver");
+like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
+kill 'STOP', $receiverpid;
+
+$log_offset = -s $node_publisher->logfile;
+
+# Verify that the walsender exits due to wal_sender_shutdown_timeout
+# even when both physical and logical replication are stalled.
+$node_publisher->safe_psql('postgres', "INSERT INTO test_tab VALUES (-2);");
+$node_publisher->stop('fast');
+ok( $node_publisher->log_contains(
+ qr/WARNING: .* terminating walsender process due to replication shutdown timeout/,
+ $log_offset),
+ "walsender exits due to wal_sender_shutdown_timeout even when both physical and logical replication are stalled"
+);
+
+kill 'CONT', $receiverpid;
+$sub_session->quit;
+
+$node_subscriber->stop('fast');
+$node_standby->stop('fast');
+
+done_testing();