]> git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Report detailed errors from XLogFindNextRecord() failures.
author: Fujii Masao <fujii@postgresql.org>
Tue, 24 Mar 2026 13:33:09 +0000 (22:33 +0900)
committer: Fujii Masao <fujii@postgresql.org>
Tue, 24 Mar 2026 13:33:09 +0000 (22:33 +0900)
Previously, XLogFindNextRecord() did not return detailed error information
when it failed to find a valid WAL record. As a result, callers such as
the WAL summarizer, pg_waldump, and pg_walinspect could only report generic
errors (e.g., "could not find a valid record after ..."), making
troubleshooting difficult.

This commit fixes the issue by extending XLogFindNextRecord() to return
detailed error information on failure, and updating its callers to include
those details in their error messages.

For example, when pg_waldump is run on a WAL file with an invalid magic number,
it now reports not only the generic error but also the specific cause
(e.g., "invalid magic number").

Author: Anthonin Bonnefoy <anthonin.bonnefoy@datadoghq.com>
Reviewed-by: Mircea Cadariu <cadariu.mircea@gmail.com>
Reviewed-by: Japin Li <japinli@hotmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Fujii Masao <masao.fujii@gmail.com>
Discussion: https://postgr.es/m/CAO6_XqoxJXddcT4wkd9Xd+cD6Sz-fyspRGuV4Bq-wbXG4pVNzA@mail.gmail.com

contrib/pg_walinspect/pg_walinspect.c
src/backend/access/transam/xlogreader.c
src/backend/postmaster/walsummarizer.c
src/bin/pg_waldump/pg_waldump.c
src/bin/pg_waldump/t/001_basic.pl
src/include/access/xlogreader.h

index f6f5f0792d21b4f00341826cf3899ee82b8a8557..4cf6e41e2f5c8e5b3d1c656beffd539e19090058 100644 (file)
@@ -99,6 +99,7 @@ InitXLogReaderState(XLogRecPtr lsn)
        XLogReaderState *xlogreader;
        ReadLocalXLogPageNoWaitPrivate *private_data;
        XLogRecPtr      first_valid_record;
+       char       *errormsg;
 
        /*
         * Reading WAL below the first page of the first segments isn't allowed.
@@ -126,12 +127,19 @@ InitXLogReaderState(XLogRecPtr lsn)
                                 errdetail("Failed while allocating a WAL reading processor.")));
 
        /* first find a valid recptr to start from */
-       first_valid_record = XLogFindNextRecord(xlogreader, lsn);
+       first_valid_record = XLogFindNextRecord(xlogreader, lsn, &errormsg);
 
        if (!XLogRecPtrIsValid(first_valid_record))
-               ereport(ERROR,
-                               errmsg("could not find a valid record after %X/%08X",
-                                          LSN_FORMAT_ARGS(lsn)));
+       {
+               if (errormsg)
+                       ereport(ERROR,
+                                       errmsg("could not find a valid record after %X/%08X: %s",
+                                                  LSN_FORMAT_ARGS(lsn), errormsg));
+               else
+                       ereport(ERROR,
+                                       errmsg("could not find a valid record after %X/%08X",
+                                                  LSN_FORMAT_ARGS(lsn)));
+       }
 
        return xlogreader;
 }
index 8cb2110cb99c0f67c9ae4528c43106086ac36a97..8849610db005d6a4e4a4c5dde0a0445d7857bf42 100644 (file)
@@ -1390,14 +1390,21 @@ XLogReaderResetError(XLogReaderState *state)
  *
  * This positions the reader, like XLogBeginRead(), so that the next call to
  * XLogReadRecord() will read the next valid record.
+ *
+ * On failure, InvalidXLogRecPtr is returned, and *errormsg is set to a string
+ * with details of the failure.
+ *
+ * When set, *errormsg points to an internal buffer that's valid until the next
+ * call to XLogReadRecord.
  */
 XLogRecPtr
-XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
+XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
 {
        XLogRecPtr      tmpRecPtr;
        XLogRecPtr      found = InvalidXLogRecPtr;
        XLogPageHeader header;
-       char       *errormsg;
+
+       *errormsg = NULL;
 
        Assert(XLogRecPtrIsValid(RecPtr));
 
@@ -1482,7 +1489,7 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
         * or we just jumped over the remaining data of a continuation.
         */
        XLogBeginRead(state, tmpRecPtr);
-       while (XLogReadRecord(state, &errormsg) != NULL)
+       while (XLogReadRecord(state, errormsg) != NULL)
        {
                /* past the record we've found, break out */
                if (RecPtr <= state->ReadRecPtr)
@@ -1497,6 +1504,17 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
 err:
        XLogReaderInvalReadState(state);
 
+       /*
+        * We may have reported errors due to invalid WAL header, propagate the
+        * error message to the caller.
+        */
+       if (state->errormsg_deferred)
+       {
+               if (state->errormsg_buf[0] != '\0')
+                       *errormsg = state->errormsg_buf;
+               state->errormsg_deferred = false;
+       }
+
        return InvalidXLogRecPtr;
 }
 
index e1aa102f41dce6b6c459591a4ac94ab6e688c495..0c0670f7da9640a5524669d1f8ea50ace7fe99a9 100644 (file)
@@ -915,6 +915,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
        WalSummaryIO io;
        BlockRefTable *brtab = CreateEmptyBlockRefTable();
        bool            fast_forward = true;
+       char       *errormsg;
 
        /* Initialize private data for xlogreader. */
        private_data = palloc0_object(SummarizerReadLocalXLogPrivate);
@@ -966,7 +967,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
        }
        else
        {
-               summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn);
+               summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn, &errormsg);
                if (!XLogRecPtrIsValid(summary_start_lsn))
                {
                        /*
@@ -995,9 +996,16 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
                                switch_lsn = xlogreader->EndRecPtr;
                        }
                        else
-                               ereport(ERROR,
-                                               errmsg("could not find a valid record after %X/%08X",
-                                                          LSN_FORMAT_ARGS(start_lsn)));
+                       {
+                               if (errormsg)
+                                       ereport(ERROR,
+                                                       errmsg("could not find a valid record after %X/%08X: %s",
+                                                                  LSN_FORMAT_ARGS(start_lsn), errormsg));
+                               else
+                                       ereport(ERROR,
+                                                       errmsg("could not find a valid record after %X/%08X",
+                                                                  LSN_FORMAT_ARGS(start_lsn)));
+                       }
                }
 
                /* We shouldn't go backward. */
@@ -1010,7 +1018,6 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
        while (1)
        {
                int                     block_id;
-               char       *errormsg;
                XLogRecord *record;
                uint8           rmid;
 
index f82507ef6967c001bc492b6f1a09844c3a27c353..630d9859882adcdcbe8eca6240ac2008f07ec23a 100644 (file)
@@ -1384,11 +1384,17 @@ main(int argc, char **argv)
                pg_fatal("out of memory while allocating a WAL reading processor");
 
        /* first find a valid recptr to start from */
-       first_record = XLogFindNextRecord(xlogreader_state, private.startptr);
+       first_record = XLogFindNextRecord(xlogreader_state, private.startptr, &errormsg);
 
        if (!XLogRecPtrIsValid(first_record))
-               pg_fatal("could not find a valid record after %X/%08X",
-                                LSN_FORMAT_ARGS(private.startptr));
+       {
+               if (errormsg)
+                       pg_fatal("could not find a valid record after %X/%08X: %s",
+                                        LSN_FORMAT_ARGS(private.startptr), errormsg);
+               else
+                       pg_fatal("could not find a valid record after %X/%08X",
+                                        LSN_FORMAT_ARGS(private.startptr));
+       }
 
        /*
         * Display a message that we're skipping data if `from` wasn't a pointer
index 11df7e092bf09a80ab27ca5ca8dbf73a6c5405cb..8bb8fa225f6fceae755af4b8a525c9392e4cabbe 100644 (file)
@@ -4,6 +4,7 @@
 use strict;
 use warnings FATAL => 'all';
 use Cwd;
+use File::Copy;
 use PostgreSQL::Test::Cluster;
 use PostgreSQL::Test::Utils;
 use Test::More;
@@ -246,6 +247,33 @@ command_like(
        qr/^$/,
        'no output with --quiet option');
 
+# Test that pg_waldump reports a detailed error message when dumping
+# a WAL file with an invalid magic number (0000).
+#
+# The broken WAL file is created by copying a valid WAL file and
+# overwriting its magic number with 0000.
+my $broken_wal_dir = PostgreSQL::Test::Utils::tempdir_short();
+my $broken_wal = "$broken_wal_dir/$start_walfile";
+copy($node->data_dir . '/pg_wal/' . $start_walfile, $broken_wal)
+  || die "copying $start_walfile $!";
+
+my $fh;
+open($fh, '+<', $broken_wal)
+  or BAIL_OUT("open failed: $!");
+binmode $fh;
+
+sysseek($fh, 0, 0)
+  or BAIL_OUT("sysseek failed: $!");
+syswrite($fh, pack("S", 0))
+  or BAIL_OUT("syswrite failed: $!");
+close($fh)
+  or BAIL_OUT("close failed: $!");
+
+command_fails_like(
+       [ 'pg_waldump', $broken_wal ],
+       qr/invalid magic number 0000/i,
+       'detailed error message shown for invalid WAL page magic');
+
 # Test for: Display a message that we're skipping data if `from`
 # wasn't a pointer to the start of a record.
 sub test_pg_waldump_skip_bytes
index 80f1a548e0848ca07072fae9b3bc1a015892cf06..97eae2c1daba8c91c1f4ca94565e22fe5548feb9 100644 (file)
@@ -342,7 +342,8 @@ extern void XLogReaderSetDecodeBuffer(XLogReaderState *state,
 
 /* Position the XLogReader to given record */
 extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr);
-extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr);
+extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr,
+                                                                        char **errormsg);
 
 /* Return values from XLogPageReadCB. */
 typedef enum XLogPageReadResult