]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
journal-file: make strict order optional
authorLennart Poettering <lennart@poettering.net>
Thu, 19 Jan 2023 21:45:06 +0000 (22:45 +0100)
committerLennart Poettering <lennart@poettering.net>
Wed, 25 Jan 2023 21:12:29 +0000 (22:12 +0100)
This is a follow-up for 1d8d483f59ffa62974772fb58a8ef4abe88550ec and
makes the strict ordering by realtime clock within each journal file
optional, not mandatory. It then enables it for all journal files
written by journald, but leaves it off on others (for example those
written by journald-remote).

This relaxes the logic behind writing journal files to the status quo
ante for all cases where the journal files are not generated, but are
merged/processed/propagated. Typically, when processing journal records
from many files, the orderings by realtime clock and monotonic clock are
contradictory, and cannot be universally guaranteed as the records are
interleaved. By enforcing strict rules we would thus end up generating
myriads of separate journal files, each with just a few records in them.

Hence, let's loosen restrictions again, but continue to enforce them in
journald, i.e. when we originally create the journal files locally.

Note that generally there's nothing really wrong with having journal
files with non-monotonically ordered entries by realtime clock. Looking
for records will not be deterministic anymore, but that's inherent to a
realtime clock that jumps up and down. So you won't get the "only"
answer, but still *an* answer that is correct if you seek for a realtime
clock.

This also adds similar logic on the monotonic clock, which is also only
enabled when generating journal files locally. This should be harder to
trigger (as journald will generate the messages, and should run with a
stable boot id and monotonic clock), but let's better be safe than
sorry, and refuse on the lower layer what makes no sense, even if it's
unlikely the higher layer will ever generate records that aren't ordered
by their monotonic clock.

src/journal/journald-server.c
src/libsystemd/sd-journal/journal-file.c
src/libsystemd/sd-journal/journal-file.h

index d31da2d129519f0b20e57356e3fa65af58a6e2e9..f43d1b56095a04d8d5932b750fcac8796095f8a6 100644 (file)
@@ -278,7 +278,10 @@ static int open_journal(
         assert(fname);
         assert(ret);
 
-        file_flags = (s->compress.enabled ? JOURNAL_COMPRESS : 0) | (seal ? JOURNAL_SEAL : 0);
+        file_flags =
+                (s->compress.enabled ? JOURNAL_COMPRESS : 0) |
+                (seal ? JOURNAL_SEAL : 0) |
+                JOURNAL_STRICT_ORDER;
 
         if (reliably)
                 r = managed_journal_file_open_reliably(
@@ -511,7 +514,8 @@ static int do_rotate(
 
         file_flags =
                 (s->compress.enabled ? JOURNAL_COMPRESS : 0)|
-                (seal ? JOURNAL_SEAL : 0);
+                (seal ? JOURNAL_SEAL : 0) |
+                JOURNAL_STRICT_ORDER;
 
         r = managed_journal_file_rotate(f, s->mmap, file_flags, s->compress.threshold_bytes, s->deferred_closes);
         if (r < 0) {
@@ -624,7 +628,7 @@ static int server_archive_offline_user_journals(Server *s) {
                                 full,
                                 O_RDWR,
                                 (s->compress.enabled ? JOURNAL_COMPRESS : 0) |
-                                (s->seal ? JOURNAL_SEAL : 0),
+                                (s->seal ? JOURNAL_SEAL : 0), /* strict order does not matter here */
                                 0640,
                                 s->compress.threshold_bytes,
                                 &s->system_storage.metrics,
@@ -844,8 +848,16 @@ static bool shall_try_append_again(JournalFile *f, int r) {
                 log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: Journal file is from the future, rotating.", f->path);
                 return true;
 
-        case -EREMCHG:         /* Time jumped backwards relative to last journal entry */
-                log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: Time jumped backwards relative to last journal entry, rotating.", f->path);
+        case -EREMCHG:         /* Wallclock time (CLOCK_REALTIME) jumped backwards relative to last journal entry */
+                log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: Realtime clock jumped backwards relative to last journal entry, rotating.", f->path);
+                return true;
+
+        case -EREMOTE:         /* Boot ID different from the one of the last entry */
+                log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: Boot ID changed since last record, rotating.", f->path);
+                return true;
+
+        case -ENOTNAM:         /* Monotonic time (CLOCK_MONOTONIC) jumped backwards relative to last journal entry */
+                log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: Montonic clock jumped backwards relative to last journal entry, rotating.", f->path);
                 return true;
 
         case -EAFNOSUPPORT:
index c517e31cc42c243048e1723b52227e7a7265dd02..2bd5dd650df373c2ab10ccb867ca92eb0c75b6ad 100644 (file)
@@ -334,7 +334,11 @@ static bool compact_mode_requested(void) {
         return true;
 }
 
-static int journal_file_init_header(JournalFile *f, JournalFileFlags file_flags, JournalFile *template) {
+static int journal_file_init_header(
+                JournalFile *f,
+                JournalFileFlags file_flags,
+                JournalFile *template) {
+
         bool seal = false;
         ssize_t k;
         int r;
@@ -2092,11 +2096,37 @@ static int journal_file_append_entry_internal(
         assert(ts);
         assert(items || n_items == 0);
 
-        if (ts->realtime < le64toh(f->header->tail_entry_realtime))
-                return log_debug_errno(SYNTHETIC_ERRNO(EREMCHG),
-                                       "Realtime timestamp %" PRIu64 " smaller than previous realtime "
-                                       "timestamp %" PRIu64 ", refusing entry.",
-                                       ts->realtime, le64toh(f->header->tail_entry_realtime));
+        if (f->strict_order) {
+                /* If requested be stricter with ordering in this journal file, to make searching via
+                 * bisection fully deterministic. This is an optional feature, so that if desired journal
+                 * files can be written where the ordering is not strictly enforced (in which case bisection
+                 * will yield *a* result, but not the *only* result, when searching for points in
+                 * time). Strict ordering mode is enabled when journald originally writes the files, but
+                 * might not necessarily be if other tools (the remoting tools for example) write journal
+                 * files from combined sources.
+                 *
+                 * Typically, if any of the errors generated here are seen journald will just rotate the
+                 * journal files and start anew. */
+
+                if (ts->realtime < le64toh(f->header->tail_entry_realtime))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EREMCHG),
+                                               "Realtime timestamp %" PRIu64 " smaller than previous realtime "
+                                               "timestamp %" PRIu64 ", refusing entry.",
+                                               ts->realtime, le64toh(f->header->tail_entry_realtime));
+
+                if (!sd_id128_is_null(f->header->boot_id) && boot_id) {
+
+                        if (!sd_id128_equal(f->header->boot_id, *boot_id))
+                                return log_debug_errno(SYNTHETIC_ERRNO(EREMOTE),
+                                                       "Boot ID to write is different from previous boot id, refusing entry.");
+
+                        if (ts->monotonic < le64toh(f->header->tail_entry_monotonic))
+                                return log_debug_errno(SYNTHETIC_ERRNO(ENOTNAM),
+                                                       "Monotonic timestamp %" PRIu64 " smaller than previous monotonic "
+                                                       "timestamp %" PRIu64 ", refusing entry.",
+                                                       ts->monotonic, le64toh(f->header->tail_entry_monotonic));
+                }
+        }
 
         osize = offsetof(Object, entry.items) + (n_items * journal_file_entry_item_size(f));
 
@@ -3710,6 +3740,8 @@ int journal_file_open(
         int r;
 
         assert(fd >= 0 || fname);
+        assert(file_flags >= 0);
+        assert(file_flags <= _JOURNAL_FILE_FLAGS_MAX);
         assert(mmap_cache);
         assert(ret);
 
@@ -3733,6 +3765,7 @@ int journal_file_open(
                 .compress_threshold_bytes = compress_threshold_bytes == UINT64_MAX ?
                                             DEFAULT_COMPRESS_THRESHOLD :
                                             MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
+                .strict_order = FLAGS_SET(file_flags, JOURNAL_STRICT_ORDER),
         };
 
         if (fname) {
index a35aa5daef51bb25ddd7b5f4e5fce1d4e0663e28..8c809ed4b99a56cc53e5eee3756ddae3a304b205 100644 (file)
@@ -67,6 +67,7 @@ typedef struct JournalFile {
         int open_flags;
         bool close_fd:1;
         bool archive:1;
+        bool strict_order:1;
 
         direction_t last_direction;
         LocationType location_type;
@@ -123,8 +124,10 @@ typedef struct JournalFile {
 } JournalFile;
 
 typedef enum JournalFileFlags {
-        JOURNAL_COMPRESS = 1 << 0,
-        JOURNAL_SEAL     = 1 << 1,
+        JOURNAL_COMPRESS        = 1 << 0,
+        JOURNAL_SEAL            = 1 << 1,
+        JOURNAL_STRICT_ORDER    = 1 << 2,
+        _JOURNAL_FILE_FLAGS_MAX = JOURNAL_COMPRESS|JOURNAL_SEAL|JOURNAL_STRICT_ORDER,
 } JournalFileFlags;
 
 typedef struct {