]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
journalctl: find boot ID more gracefully in corrupted journal
authorYu Watanabe <watanabe.yu+github@gmail.com>
Wed, 27 Sep 2023 21:04:53 +0000 (06:04 +0900)
committerYu Watanabe <watanabe.yu+github@gmail.com>
Mon, 2 Oct 2023 01:59:33 +0000 (10:59 +0900)
In discover_next_boot(), first we find a new boot ID based on the value
stored in the entry object. Then, find the tail (or head when we are going
upwards) entry of the boot based on the _BOOT_ID= field data.

If boot IDs of an entry in the entry object and _BOOT_ID field data
are inconsistent, which may happen on corrupted journal, then previously
discover_next_boot() failed with -ENODATA.

This makes the function check if the two boot IDs in each entry are
consistent, and skip the entry if not.

Fixes the failure of `journalctl -b -1` for 'truncated' journal:
https://github.com/systemd/systemd/pull/29334#issuecomment-1736567951

src/shared/logs-show.c
test/units/testsuite-04.journal.sh

index 77452a81bad92e17df08d832385423b6dcc4e6ac..c90400b81955c1cfe39d6e6211bccb0c9c44e788 100644 (file)
@@ -1836,7 +1836,7 @@ static int discover_next_boot(
                 bool advance_older,
                 BootId *ret) {
 
-        BootId boot;
+        _cleanup_set_free_ Set *broken_ids = NULL;
         int r;
 
         assert(j);
@@ -1853,7 +1853,10 @@ static int discover_next_boot(
          * we can actually advance to a *different* boot. */
         sd_journal_flush_matches(j);
 
-        do {
+        for (;;) {
+                sd_id128_t *id_dup;
+                BootId boot;
+
                 r = sd_journal_step_one(j, !advance_older);
                 if (r < 0)
                         return r;
@@ -1874,38 +1877,106 @@ static int discover_next_boot(
                  * speed things up, but let's not trust that it is complete, and hence, manually advance as
                  * necessary. */
 
-        } while (sd_id128_equal(boot.id, previous_boot_id));
+                if (!sd_id128_is_null(previous_boot_id) && sd_id128_equal(boot.id, previous_boot_id))
+                        continue;
 
-        r = sd_journal_get_realtime_usec(j, &boot.first_usec);
-        if (r < 0)
-                return r;
+                if (set_contains(broken_ids, &boot.id))
+                        continue;
 
-        /* Now seek to the last occurrence of this boot ID. */
-        r = add_match_boot_id(j, boot.id);
-        if (r < 0)
-                return r;
+                /* Yay, we found a new boot ID from the entry object. Let's check there exist corresponding
+                 * entries matching with the _BOOT_ID= data. */
 
-        if (advance_older)
-                r = sd_journal_seek_head(j);
-        else
-                r = sd_journal_seek_tail(j);
-        if (r < 0)
-                return r;
+                r = add_match_boot_id(j, boot.id);
+                if (r < 0)
+                        return r;
 
-        r = sd_journal_step_one(j, advance_older);
-        if (r < 0)
-                return r;
-        if (r == 0)
-                return log_debug_errno(SYNTHETIC_ERRNO(ENODATA),
-                                       "Whoopsie! We found a boot ID but can't read its last entry."); /* This shouldn't happen. We just came from this very boot ID. */
+                /* First, seek to the first (or the last when we are going upwards) occurrence of this boot ID.
+                 * You may think this is redundant. Yes, that's redundant unless the journal is corrupted.
+                 * But when the journal is corrupted, especially, badly 'truncated', then the below may fail.
+                 * See https://github.com/systemd/systemd/pull/29334#issuecomment-1736567951. */
+                if (advance_older)
+                        r = sd_journal_seek_tail(j);
+                else
+                        r = sd_journal_seek_head(j);
+                if (r < 0)
+                        return r;
 
-        r = sd_journal_get_realtime_usec(j, &boot.last_usec);
-        if (r < 0)
-                return r;
+                r = sd_journal_step_one(j, 0);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        log_debug("Whoopsie! We found a boot ID %s but can't read its first entry. "
+                                  "The journal seems to be corrupted. Ignoring the boot ID.",
+                                  SD_ID128_TO_STRING(boot.id));
+                        goto try_again;
+                }
 
-        sd_journal_flush_matches(j);
-        *ret = boot;
-        return 1;
+                r = sd_journal_get_realtime_usec(j, &boot.first_usec);
+                if (r < 0)
+                        return r;
+
+                /* Next, seek to the last occurrence of this boot ID. */
+                if (advance_older)
+                        r = sd_journal_seek_head(j);
+                else
+                        r = sd_journal_seek_tail(j);
+                if (r < 0)
+                        return r;
+
+                r = sd_journal_step_one(j, 0);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        log_debug("Whoopsie! We found a boot ID %s but can't read its last entry. "
+                                  "The journal seems to be corrupted. Ignoring the boot ID.",
+                                  SD_ID128_TO_STRING(boot.id));
+                        goto try_again;
+                }
+
+                r = sd_journal_get_realtime_usec(j, &boot.last_usec);
+                if (r < 0)
+                        return r;
+
+                sd_journal_flush_matches(j);
+                *ret = boot;
+                return 1;
+
+        try_again:
+                /* Save the bad boot ID. */
+                id_dup = newdup(sd_id128_t, &boot.id, 1);
+                if (!id_dup)
+                        return -ENOMEM;
+
+                r = set_ensure_consume(&broken_ids, &id128_hash_ops_free, id_dup);
+                if (r < 0)
+                        return r;
+
+                /* Move to the previous position again. */
+                sd_journal_flush_matches(j);
+
+                if (!sd_id128_is_null(previous_boot_id)) {
+                        r = add_match_boot_id(j, previous_boot_id);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (advance_older)
+                        r = sd_journal_seek_head(j);
+                else
+                        r = sd_journal_seek_tail(j);
+                if (r < 0)
+                        return r;
+
+                r = sd_journal_step_one(j, 0);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENODATA),
+                                               "Whoopsie! Cannot seek to the last entry of boot %s.",
+                                               SD_ID128_TO_STRING(previous_boot_id));
+
+                sd_journal_flush_matches(j);
+        }
 }
 
 int journal_find_boot_by_id(sd_journal *j, sd_id128_t boot_id) {
index 2b3929324cca3d1ba0e8e3d5c11aaad035cd61f7..1e105a05664c8081b1b3613cdefa632a99d7c9fe 100755 (executable)
@@ -233,6 +233,6 @@ done < <(find /test-journals/no-rtc -name "*.zst")
 
 journalctl --directory="$JOURNAL_DIR" --list-boots --output=json >/tmp/lb1
 diff -u /tmp/lb1 - <<'EOF'
-[{"index":-3,"boot_id":"5ea5fc4f82a14186b5332a788ef9435e","first_entry":1666569600994371,"last_entry":1666584266223608},{"index":-2,"boot_id":"bea6864f21ad4c9594c04a99d89948b0","first_entry":1666584266731785,"last_entry":1666584347230411},{"index":-1,"boot_id":"4c708e1fd0744336be16f3931aa861fb","first_entry":1666584348378271,"last_entry":1666584354649355},{"index":0,"boot_id":"35e8501129134edd9df5267c49f744a4","first_entry":1666584356661527,"last_entry":1666584438086856}]
+[{"index":-3,"boot_id":"5ea5fc4f82a14186b5332a788ef9435e","first_entry":1666569600994371,"last_entry":1666584266223608},{"index":-2,"boot_id":"bea6864f21ad4c9594c04a99d89948b0","first_entry":1666569601005945,"last_entry":1666584347230411},{"index":-1,"boot_id":"4c708e1fd0744336be16f3931aa861fb","first_entry":1666569601017222,"last_entry":1666584354649355},{"index":0,"boot_id":"35e8501129134edd9df5267c49f744a4","first_entry":1666569601009823,"last_entry":1666584438086856}]
 EOF
 rm -rf "$JOURNAL_DIR" /tmp/lb1