]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Switch to FATAL error for missing checkpoint record without backup_label
authorMichael Paquier <michael@paquier.xyz>
Tue, 10 Mar 2026 03:00:05 +0000 (12:00 +0900)
committerMichael Paquier <michael@paquier.xyz>
Tue, 10 Mar 2026 03:00:05 +0000 (12:00 +0900)
Crash recovery started without a backup_label previously crashed with a
PANIC if the checkpoint record could not be found.  This commit lowers
the report generated to be a FATAL instead.

With recovery methods being more imaginative these days, this should
provide more flexibility when handling PostgreSQL recovery processing in
the event of a driver error, similarly to 15f68cebdcec.  An extra
benefit of this change is that it becomes possible to add a test to
check that a FATAL is hit with an expected error message pattern.  With
the recovery code becoming more complicated over the last couple of
years, I suspect that this will be beneficial to cover in the long-term.

The original PANIC behavior has been introduced in the early days of
crash recovery, as of 4d14fe0048cf (PANIC did not exist yet, the code
used STOP).

Author: Nitin Jadhav <nitinjadhavpostgres@gmail.com>
Discussion: https://postgr.es/m/CAMm1aWZbQ-Acp_xAxC7mX9uZZMH8+NpfepY9w=AOxbBVT9E=uA@mail.gmail.com

src/backend/access/transam/xlogrecovery.c
src/test/recovery/meson.build
src/test/recovery/t/052_checkpoint_segment_missing.pl [new file with mode: 0644]

index d55a534b138836bb61524c9dd766c9779c32a722..6d2c4a86b96007506ecb89b74f58e1e4a1b97c89 100644 (file)
@@ -735,7 +735,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
                         * can't read the last checkpoint because this allows us to
                         * simplify processing around checkpoints.
                         */
-                       ereport(PANIC,
+                       ereport(FATAL,
                                        errmsg("could not locate a valid checkpoint record at %X/%08X",
                                                   LSN_FORMAT_ARGS(CheckPointLoc)));
                }
index 8d20488952e69a443d3184e71375c14400206b5a..36d789720a3c8ca3496de391a5a49822a191faca 100644 (file)
@@ -60,6 +60,7 @@ tests += {
       't/049_wait_for_lsn.pl',
       't/050_redo_segment_missing.pl',
       't/051_effective_wal_level.pl',
+      't/052_checkpoint_segment_missing.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/052_checkpoint_segment_missing.pl b/src/test/recovery/t/052_checkpoint_segment_missing.pl
new file mode 100644 (file)
index 0000000..da54d14
--- /dev/null
@@ -0,0 +1,59 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+#
+# Verify crash recovery behavior when the WAL segment containing the
+# checkpoint record referenced by pg_controldata is missing.  This
+# checks the code path where there is no backup_label file, where the
+# startup process should fail with FATAL and log a message about the
+# missing checkpoint record.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('testnode');
+$node->init;
+$node->append_conf('postgresql.conf', 'log_checkpoints = on');
+$node->start;
+
+# Force a checkpoint so that pg_controldata points to a checkpoint record
+# we can target.
+$node->safe_psql('postgres', 'CHECKPOINT;');
+
+# Retrieve the checkpoint LSN and derive the WAL segment name.
+my $checkpoint_walfile = $node->safe_psql('postgres',
+       "SELECT pg_walfile_name(checkpoint_lsn) FROM pg_control_checkpoint()");
+
+ok($checkpoint_walfile ne '',
+       "derived checkpoint WAL file name: $checkpoint_walfile");
+
+# Stop the node.
+$node->stop('immediate');
+
+# Remove the WAL segment containing the checkpoint record.
+my $walpath = $node->data_dir . "/pg_wal/$checkpoint_walfile";
+ok(-f $walpath, "checkpoint WAL file exists before deletion: $walpath");
+
+unlink $walpath
+  or die "could not remove WAL file $walpath: $!";
+
+ok(!-e $walpath, "checkpoint WAL file removed: $walpath");
+
+# Use run_log instead of node->start because this test expects the
+# server to fail with an error during recovery.
+run_log(
+       [
+               'pg_ctl',
+               '--pgdata' => $node->data_dir,
+               '--log' => $node->logfile,
+               'start',
+       ]);
+
+# Confirm that recovery has failed as expected.
+my $logfile = slurp_file($node->logfile());
+ok( $logfile =~
+         qr/FATAL: .* could not locate a valid checkpoint record at .*/,
+       "FATAL logged for missing checkpoint record (no backup_label path)");
+
+done_testing();