]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
journald: maintain entry seqnum counter in mmap()ed file in /run/
authorLennart Poettering <lennart@poettering.net>
Mon, 23 Jan 2023 20:21:21 +0000 (21:21 +0100)
committerLennart Poettering <lennart@poettering.net>
Wed, 8 Feb 2023 12:42:29 +0000 (13:42 +0100)
Let's ensure that entry seqnums remain stable and monotonic across the
entire runtime of the system, even if local storage is turned off. Let's
do this by maintainer a counter file in /run/ which we mmap() and
wherein we maintain the counter from early-boot on till late shutdown.

This takes inspiration of the kernel-seqnum file we already maintain
like that that tracks which kmsg messages we already processed. In fact,
we reuse the same code for maintaining it.

This should allow the behaviour entry seqnums to be more predictable, in
particular when journal local storage is turned off. Previously, we'd
maintain the seqnum simply by always bumping it to the maximum of the
last written entry seqnum plus one, and the biggest seqnum so far
written to the journal file on disk. If we'd never write a file on disk,
or if no journal file was existing during the initrd→seqnum transition
we'd completely lose the current seqnum position during daemon restarts
(such as the one happening during the switch-root operation).

This also will cause a journal file rotation whenever we try to write to
a journal file with multiple sequence number IDs, so that we know that
from early boot trhough the entire runtime we'll have stable sequence
numbers that do not jump, and thus can be used to determine "lost"
messages.

src/journal-remote/journal-remote-write.c
src/journal/journald-kmsg.c
src/journal/journald-server.c
src/journal/journald-server.h
src/journal/test-journal-flush.c
src/journal/test-journal-interleaving.c
src/journal/test-journal-stream.c
src/journal/test-journal-verify.c
src/journal/test-journal.c
src/libsystemd/sd-journal/journal-file.c
src/libsystemd/sd-journal/journal-file.h

index f6c3d83732b7f8272e10cb2490519bf4fb960102..ace4a1cfadd91538d69ead2a6684b794c25a01ea 100644 (file)
@@ -99,9 +99,16 @@ int writer_write(Writer *w,
                         return r;
         }
 
-        r = journal_file_append_entry(w->journal->file, ts, boot_id,
-                                      iovw->iovec, iovw->count,
-                                      &w->seqnum, NULL, NULL);
+        r = journal_file_append_entry(
+                        w->journal->file,
+                        ts,
+                        boot_id,
+                        iovw->iovec,
+                        iovw->count,
+                        &w->seqnum,
+                        /* seqnum_id= */ NULL,
+                        /* ret_object= */ NULL,
+                        /* ret_offset= */ NULL);
         if (r >= 0) {
                 if (w->server)
                         w->server->event_count += 1;
@@ -120,9 +127,15 @@ int writer_write(Writer *w,
                 return r;
 
         log_debug("Retrying write.");
-        r = journal_file_append_entry(w->journal->file, ts, boot_id,
-                                      iovw->iovec, iovw->count,
-                                      &w->seqnum, NULL, NULL);
+        r = journal_file_append_entry(
+                        w->journal->file,
+                        ts,
+                        boot_id,
+                        iovw->iovec, iovw->count,
+                        &w->seqnum,
+                        /* seqnum_id= */ NULL,
+                        /* ret_object= */ NULL,
+                        /* ret_offset= */ NULL);
         if (r < 0)
                 return r;
 
index c64e626b2cdff9879b5f394e8df47c35d163377f..83082c4e94f537473c0828450dc4ed77dc51076d 100644 (file)
@@ -423,9 +423,6 @@ finish:
 }
 
 int server_open_kernel_seqnum(Server *s) {
-        _cleanup_close_ int fd = -EBADF;
-        const char *fn;
-        uint64_t *p;
         int r;
 
         assert(s);
@@ -436,26 +433,9 @@ int server_open_kernel_seqnum(Server *s) {
         if (!s->dev_kmsg_readable)
                 return 0;
 
-        fn = strjoina(s->runtime_directory, "/kernel-seqnum");
-        fd = open(fn, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
-        if (fd < 0) {
-                log_error_errno(errno, "Failed to open %s, ignoring: %m", fn);
-                return 0;
-        }
-
-        r = posix_fallocate_loop(fd, 0, sizeof(uint64_t));
-        if (r < 0) {
-                log_error_errno(r, "Failed to allocate sequential number file, ignoring: %m");
-                return 0;
-        }
-
-        p = mmap(NULL, sizeof(uint64_t), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
-        if (p == MAP_FAILED) {
-                log_error_errno(errno, "Failed to map sequential number file, ignoring: %m");
-                return 0;
-        }
-
-        s->kernel_seqnum = p;
+        r = server_map_seqnum_file(s, "kernel-seqnum", sizeof(uint64_t), (void**) &s->kernel_seqnum);
+        if (r < 0)
+                return log_error_errno(r, "Failed to map kernel seqnum file: %m");
 
         return 0;
 }
index 022e12d83dfd2b3b38cec0f54367ee488950b6a2..f88365e2684f940490cd6ac56cc0d58e39185a94 100644 (file)
@@ -889,10 +889,12 @@ static bool shall_try_append_again(JournalFile *f, int r) {
                 log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: Montonic clock jumped backwards relative to last journal entry, rotating.", f->path);
                 return true;
 
+        case -EILSEQ:          /* seqnum ID last used in the file doesn't match the one we'd passed when writing an entry to it */
+                log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: Journal file uses a different sequence number ID, rotating.", f->path);
+                return true;
+
         case -EAFNOSUPPORT:
-                log_ratelimit_warning(JOURNAL_LOG_RATELIMIT,
-                                      "%s: underlying file system does not support memory mapping or another required file system feature.",
-                                      f->path);
+                log_ratelimit_warning(JOURNAL_LOG_RATELIMIT, "%s: underlying file system does not support memory mapping or another required file system feature.", f->path);
                 return false;
 
         default:
@@ -900,7 +902,13 @@ static bool shall_try_append_again(JournalFile *f, int r) {
         }
 }
 
-static void server_write_to_journal(Server *s, uid_t uid, struct iovec *iovec, size_t n, int priority) {
+static void server_write_to_journal(
+                Server *s,
+                uid_t uid,
+                const struct iovec *iovec,
+                size_t n,
+                int priority) {
+
         bool vacuumed = false, rotate = false;
         struct dual_timestamp ts;
         ManagedJournalFile *f;
@@ -950,7 +958,15 @@ static void server_write_to_journal(Server *s, uid_t uid, struct iovec *iovec, s
 
         s->last_realtime_clock = ts.realtime;
 
-        r = journal_file_append_entry(f->file, &ts, NULL, iovec, n, &s->seqnum, NULL, NULL);
+        r = journal_file_append_entry(
+                        f->file,
+                        &ts,
+                        /* boot_id= */ NULL,
+                        iovec, n,
+                        &s->seqnum->seqnum,
+                        &s->seqnum->id,
+                        /* ret_object= */ NULL,
+                        /* ret_offset= */ NULL);
         if (r >= 0) {
                 server_schedule_sync(s, priority);
                 return;
@@ -978,7 +994,15 @@ static void server_write_to_journal(Server *s, uid_t uid, struct iovec *iovec, s
                 return;
 
         log_debug_errno(r, "Retrying write.");
-        r = journal_file_append_entry(f->file, &ts, NULL, iovec, n, &s->seqnum, NULL, NULL);
+        r = journal_file_append_entry(
+                        f->file,
+                        &ts,
+                        /* boot_id= */ NULL,
+                        iovec, n,
+                        &s->seqnum->seqnum,
+                        &s->seqnum->id,
+                        /* ret_object= */ NULL,
+                        /* ret_offset= */ NULL);
         if (r < 0)
                 log_ratelimit_error_errno(r, FAILED_TO_WRITE_ENTRY_RATELIMIT,
                                           "Failed to write entry to %s (%zu items, %zu bytes) despite vacuuming, ignoring: %m",
@@ -1290,7 +1314,13 @@ int server_flush_to_var(Server *s, bool require_flag_file) {
                         goto finish;
                 }
 
-                r = journal_file_copy_entry(f, s->system_journal->file, o, f->current_offset);
+                r = journal_file_copy_entry(
+                                f,
+                                s->system_journal->file,
+                                o,
+                                f->current_offset,
+                                &s->seqnum->seqnum,
+                                &s->seqnum->id);
                 if (r >= 0)
                         continue;
 
@@ -1312,7 +1342,13 @@ int server_flush_to_var(Server *s, bool require_flag_file) {
                 }
 
                 log_debug("Retrying write.");
-                r = journal_file_copy_entry(f, s->system_journal->file, o, f->current_offset);
+                r = journal_file_copy_entry(
+                                f,
+                                s->system_journal->file,
+                                o,
+                                f->current_offset,
+                                &s->seqnum->seqnum,
+                                &s->seqnum->id);
                 if (r < 0) {
                         log_ratelimit_error_errno(r, JOURNAL_LOG_RATELIMIT, "Can't write entry: %m");
                         goto finish;
@@ -2250,6 +2286,51 @@ static int server_open_varlink(Server *s, const char *socket, int fd) {
         return 0;
 }
 
+int server_map_seqnum_file(
+                Server *s,
+                const char *fname,
+                size_t size,
+                void **ret) {
+
+        _cleanup_free_ char *fn = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        uint64_t *p;
+        int r;
+
+        assert(s);
+        assert(fname);
+        assert(size > 0);
+        assert(ret);
+
+        fn = path_join(s->runtime_directory, fname);
+        if (!fn)
+                return -ENOMEM;
+
+        fd = open(fn, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
+        if (fd < 0)
+                return -errno;
+
+        r = posix_fallocate_loop(fd, 0, size);
+        if (r < 0)
+                return r;
+
+        p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+        if (p == MAP_FAILED)
+                return -errno;
+
+        *ret = p;
+        return 0;
+}
+
+void server_unmap_seqnum_file(void *p, size_t size) {
+        assert(size > 0);
+
+        if (!p)
+                return;
+
+        assert_se(munmap(p, size) >= 0);
+}
+
 static bool server_is_idle(Server *s) {
         assert(s);
 
@@ -2562,6 +2643,10 @@ int server_init(Server *s, const char *namespace) {
         if (r < 0)
                 return r;
 
+        r = server_map_seqnum_file(s, "seqnum", sizeof(SeqnumData), (void**) &s->seqnum);
+        if (r < 0)
+                return log_error_errno(r, "Failed to map main seqnum file: %m");
+
         r = server_open_kernel_seqnum(s);
         if (r < 0)
                 return r;
@@ -2678,8 +2763,8 @@ void server_done(Server *s) {
         if (s->ratelimit)
                 journal_ratelimit_free(s->ratelimit);
 
-        if (s->kernel_seqnum)
-                munmap(s->kernel_seqnum, sizeof(uint64_t));
+        server_unmap_seqnum_file(s->seqnum, sizeof(*s->seqnum));
+        server_unmap_seqnum_file(s->kernel_seqnum, sizeof(*s->kernel_seqnum));
 
         free(s->buffer);
         free(s->tty_path);
index ee8f37419088c2e7e9c99e29ba078ec28b4923d5..e7bf750a59c9f517d9d6499ad1fe2d483ad33188 100644 (file)
@@ -60,6 +60,13 @@ typedef struct JournalStorage {
         JournalStorageSpace space;
 } JournalStorage;
 
+/* This structure will be kept in $RUNTIME_DIRECTORY/seqnum and is mapped by journald, and is used to
+ * maintain the sequence number counter with its seqnum ID */
+typedef struct SeqnumData {
+        sd_id128_t id;
+        uint64_t seqnum;
+} SeqnumData;
+
 struct Server {
         char *namespace;
 
@@ -93,7 +100,7 @@ struct Server {
         ManagedJournalFile *system_journal;
         OrderedHashmap *user_journals;
 
-        uint64_t seqnum;
+        SeqnumData *seqnum;
 
         char *buffer;
 
@@ -227,3 +234,6 @@ void server_space_usage_message(Server *s, JournalStorage *storage);
 
 int server_start_or_stop_idle_timer(Server *s);
 int server_refresh_idle_timer(Server *s);
+
+int server_map_seqnum_file(Server *s, const char *fname, size_t size, void **ret);
+void server_unmap_seqnum_file(void *p, size_t size);
index 53e479909e78877f5ebeeb276e097b26b578b6c1..f2fe0e51c00db84581567e78bd5c40105bced880 100644 (file)
@@ -53,7 +53,7 @@ static void test_journal_flush(int argc, char *argv[]) {
                         log_error_errno(r, "journal_file_move_to_object failed: %m");
                 assert_se(r >= 0);
 
-                r = journal_file_copy_entry(f, new_journal->file, o, f->current_offset);
+                r = journal_file_copy_entry(f, new_journal->file, o, f->current_offset, NULL, NULL);
                 if (r < 0)
                         log_warning_errno(r, "journal_file_copy_entry failed: %m");
                 assert_se(r >= 0 ||
index 67552de8fe8344943c5264da9cec72a202be5510..55d717da3163ec45f428bd26b9a74de965d7e434 100644 (file)
@@ -66,7 +66,7 @@ static void append_number(ManagedJournalFile *f, int n, uint64_t *seqnum) {
 
         assert_se(asprintf(&p, "NUMBER=%d", n) >= 0);
         iovec[0] = IOVEC_MAKE_STRING(p);
-        assert_ret(journal_file_append_entry(f->file, &ts, NULL, iovec, 1, seqnum, NULL, NULL));
+        assert_ret(journal_file_append_entry(f->file, &ts, NULL, iovec, 1, seqnum, NULL, NULL, NULL));
         free(p);
 }
 
index 9f4494c6c0ab05ba4bf4485675ba3c0cdd20aa07..940625a0843d21dad4e8ef63390ccd41b010337f 100644 (file)
@@ -103,12 +103,12 @@ static void run_test(void) {
                 iovec[1] = IOVEC_MAKE(q, strlen(q));
 
                 if (i % 10 == 0)
-                        assert_se(journal_file_append_entry(three->file, &ts, NULL, iovec, 2, NULL, NULL, NULL) == 0);
+                        assert_se(journal_file_append_entry(three->file, &ts, NULL, iovec, 2, NULL, NULL, NULL, NULL) == 0);
                 else {
                         if (i % 3 == 0)
-                                assert_se(journal_file_append_entry(two->file, &ts, NULL, iovec, 2, NULL, NULL, NULL) == 0);
+                                assert_se(journal_file_append_entry(two->file, &ts, NULL, iovec, 2, NULL, NULL, NULL, NULL) == 0);
 
-                        assert_se(journal_file_append_entry(one->file, &ts, NULL, iovec, 2, NULL, NULL, NULL) == 0);
+                        assert_se(journal_file_append_entry(one->file, &ts, NULL, iovec, 2, NULL, NULL, NULL, NULL) == 0);
                 }
 
                 free(p);
index 0c58d05ced3e5d373e8f174084549ba89d156df0..7a90079fc8e6b1e65bff7edd4c3d187ee1a9cabd 100644 (file)
@@ -94,7 +94,7 @@ static int run_test(int argc, char *argv[]) {
 
                 iovec = IOVEC_MAKE_STRING(test);
 
-                assert_se(journal_file_append_entry(df->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0);
+                assert_se(journal_file_append_entry(df->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0);
 
                 free(test);
         }
index 889673cae776b35ac7c6f8a9a5480fb2f2563232..cd295b2a7e51bc751a6e37283788729e0b7588ab 100644 (file)
@@ -45,13 +45,13 @@ static void test_non_empty_one(void) {
         assert_se(sd_id128_randomize(&fake_boot_id) == 0);
 
         iovec = IOVEC_MAKE_STRING(test);
-        assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0);
+        assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0);
 
         iovec = IOVEC_MAKE_STRING(test2);
-        assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0);
+        assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0);
 
         iovec = IOVEC_MAKE_STRING(test);
-        assert_se(journal_file_append_entry(f->file, &ts, &fake_boot_id, &iovec, 1, NULL, NULL, NULL) == 0);
+        assert_se(journal_file_append_entry(f->file, &ts, &fake_boot_id, &iovec, 1, NULL, NULL, NULL, NULL) == 0);
 
 #if HAVE_GCRYPT
         journal_file_append_tag(f->file);
@@ -199,7 +199,7 @@ static bool check_compressed(uint64_t compress_threshold, uint64_t data_size) {
         dual_timestamp_get(&ts);
 
         iovec = IOVEC_MAKE(data, data_size);
-        assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL) == 0);
+        assert_se(journal_file_append_entry(f->file, &ts, NULL, &iovec, 1, NULL, NULL, NULL, NULL) == 0);
 
 #if HAVE_GCRYPT
         journal_file_append_tag(f->file);
index 2ead29548b334f574f76e843ddf6cd2c47f0e2f8..aab33dbfccab138f58d923ced36bea8b16c96a0e 100644 (file)
@@ -2094,6 +2094,7 @@ static int journal_file_append_entry_internal(
                 const EntryItem items[],
                 size_t n_items,
                 uint64_t *seqnum,
+                sd_id128_t *seqnum_id,
                 Object **ret_object,
                 uint64_t *ret_offset) {
 
@@ -2139,6 +2140,21 @@ static int journal_file_append_entry_internal(
                 }
         }
 
+        if (seqnum_id) {
+                /* Settle the passed in sequence number ID */
+
+                if (sd_id128_is_null(*seqnum_id))
+                        *seqnum_id = f->header->seqnum_id; /* Caller has none assigned, then copy the one from the file */
+                else if (!sd_id128_equal(*seqnum_id, f->header->seqnum_id)) {
+                        /* Different seqnum IDs? We can't allow entries from multiple IDs end up in the same journal.*/
+                        if (le64toh(f->header->n_entries) == 0)
+                                f->header->seqnum_id = *seqnum_id; /* Caller has one, and file so far has no entries, then copy the one from the caller */
+                        else
+                                return log_debug_errno(SYNTHETIC_ERRNO(EILSEQ),
+                                                       "Sequence number IDs don't match, refusing entry.");
+                }
+        }
+
         osize = offsetof(Object, entry.items) + (n_items * journal_file_entry_item_size(f));
 
         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
@@ -2290,6 +2306,7 @@ int journal_file_append_entry(
                 const struct iovec iovec[],
                 size_t n_iovec,
                 uint64_t *seqnum,
+                sd_id128_t *seqnum_id,
                 Object **ret_object,
                 uint64_t *ret_offset) {
 
@@ -2376,7 +2393,7 @@ int journal_file_append_entry(
         typesafe_qsort(items, n_iovec, entry_item_cmp);
         n_iovec = remove_duplicate_entry_items(items, n_iovec);
 
-        r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret_object, ret_offset);
+        r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, seqnum_id, ret_object, ret_offset);
 
         /* If the memory mapping triggered a SIGBUS then we return an
          * IO error and ignore the error code passed down to us, since
@@ -4093,7 +4110,14 @@ int journal_file_dispose(int dir_fd, const char *fname) {
         return 0;
 }
 
-int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
+int journal_file_copy_entry(
+                JournalFile *from,
+                JournalFile *to,
+                Object *o,
+                uint64_t p,
+                uint64_t *seqnum,
+                sd_id128_t *seqnum_id) {
+
         _cleanup_free_ EntryItem *items_alloc = NULL;
         EntryItem *items;
         uint64_t q, n, xor_hash = 0;
@@ -4168,7 +4192,7 @@ int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint6
                         return r;
         }
 
-        r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, NULL, NULL, NULL);
+        r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, seqnum, seqnum_id, NULL, NULL);
 
         if (mmap_cache_fd_got_sigbus(to->cache_fd))
                 return -EIO;
index 8c809ed4b99a56cc53e5eee3756ddae3a304b205..07f1f5d180651ab250132f0baf30c474da8ca62d 100644 (file)
@@ -259,7 +259,8 @@ int journal_file_append_entry(
                 const sd_id128_t *boot_id,
                 const struct iovec iovec[],
                 size_t n_iovec,
-                uint64_t *seqno,
+                uint64_t *seqnum,
+                sd_id128_t *seqnum_id,
                 Object **ret_object,
                 uint64_t *ret_offset);
 
@@ -286,7 +287,7 @@ int journal_file_move_to_entry_by_seqnum_for_data(JournalFile *f, Object *d, uin
 int journal_file_move_to_entry_by_realtime_for_data(JournalFile *f, Object *d, uint64_t realtime, direction_t direction, Object **ret_object, uint64_t *ret_offset);
 int journal_file_move_to_entry_by_monotonic_for_data(JournalFile *f, Object *d, sd_id128_t boot_id, uint64_t monotonic, direction_t direction, Object **ret_object, uint64_t *ret_offset);
 
-int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p);
+int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, sd_id128_t *seqnum_id);
 
 void journal_file_dump(JournalFile *f);
 void journal_file_print_header(JournalFile *f);