]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/libsystemd/sd-journal/journal-file.c
Merge pull request #21517 from yuwata/network-long-hw-addr
[thirdparty/systemd.git] / src / libsystemd / sd-journal / journal-file.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
cec736d2 2
cec736d2 3#include <errno.h>
cec736d2 4#include <fcntl.h>
11689d2a 5#include <linux/fs.h>
65ddc2c5 6#include <linux/magic.h>
ac2e41f5 7#include <pthread.h>
07630cea
LP
8#include <stddef.h>
9#include <sys/mman.h>
10#include <sys/statvfs.h>
11#include <sys/uio.h>
12#include <unistd.h>
fb0951b0 13
a03d4359
ZJS
14#include "sd-event.h"
15
b5efdb8a 16#include "alloc-util.h"
c8b3094d 17#include "chattr-util.h"
07630cea 18#include "compress.h"
4ce534f4 19#include "env-util.h"
3ffd4af2 20#include "fd-util.h"
aa892669 21#include "format-util.h"
11b29a96 22#include "fs-util.h"
0284adc6 23#include "journal-authenticate.h"
cec736d2
LP
24#include "journal-def.h"
25#include "journal-file.h"
26#include "lookup3.h"
0a970718 27#include "memory-util.h"
5d1ce257 28#include "path-util.h"
3df3e884 29#include "random-util.h"
b58c888f 30#include "set.h"
760877e9 31#include "sort-util.h"
3cc44114 32#include "stat-util.h"
363b2b9a 33#include "string-table.h"
07630cea 34#include "string-util.h"
4761fd0f 35#include "strv.h"
bf819d3a 36#include "sync-util.h"
89a5a90c 37#include "xattr-util.h"
cec736d2 38
4a92baf3
LP
/* Default hash table sizes, expressed in bytes as an item count multiplied by sizeof(HashItem) */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Default and minimum compression thresholds */
#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
#define MIN_COMPRESS_THRESHOLD (8ULL)

/* The minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL)              /* 512 KiB */

/* Lower and upper bounds applied when the max_use value is deduced from the file system size */
#define MAX_USE_LOWER (1 * 1024 * 1024ULL)                 /* 1 MiB */
#define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL)          /* 4 GiB */

/* Lower and upper bounds for the minimal use limit, i.e. how much we'll use even if keep_free suggests
 * otherwise. */
#define MIN_USE_LOW (1 * 1024 * 1024ULL)                   /* 1 MiB */
#define MIN_USE_HIGH (16 * 1024 * 1024ULL)                 /* 16 MiB */

/* Upper bound applied when max_size is deduced from max_use */
#define MAX_SIZE_UPPER (128 * 1024 * 1024ULL)              /* 128 MiB */

/* Upper bound applied when the keep_free value is deduced from the file system size */
#define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL)        /* 4 GiB */

/* The keep_free value used when the file system size cannot be determined */
#define DEFAULT_KEEP_FREE (1024 * 1024ULL)                 /* 1 MiB */

/* The default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES 100

/* n_data was the first entry we added after the initial file format design, hence everything up to it is
 * the smallest header we accept. */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL)            /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX

/* Longest hash chain to rotate after */
#define HASH_CHAIN_DEPTH_MAX 100

#ifdef __clang__
#  pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
93
ac2e41f5
VC
94/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
95 * As a result we use atomic operations on f->offline_state for inter-thread communications with
96 * journal_file_set_offline() and journal_file_set_online(). */
97static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 98 assert(f);
ac2e41f5
VC
99 assert(f->fd >= 0);
100 assert(f->header);
101
102 for (;;) {
103 switch (f->offline_state) {
104 case OFFLINE_CANCEL:
105 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
106 continue;
107 return;
108
109 case OFFLINE_AGAIN_FROM_SYNCING:
110 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
111 continue;
112 break;
113
114 case OFFLINE_AGAIN_FROM_OFFLINING:
115 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
116 continue;
117 break;
118
119 case OFFLINE_SYNCING:
120 (void) fsync(f->fd);
26687bf8 121
ac2e41f5
VC
122 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
123 continue;
26687bf8 124
8eb85171 125 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
126 (void) fsync(f->fd);
127 break;
128
129 case OFFLINE_OFFLINING:
130 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
131 continue;
4831981d 132 _fallthrough_;
ac2e41f5
VC
133 case OFFLINE_DONE:
134 return;
135
136 case OFFLINE_JOINED:
137 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
138 return;
139 }
140 }
141}
142
143static void * journal_file_set_offline_thread(void *arg) {
144 JournalFile *f = arg;
145
fa7ff4cf
LP
146 (void) pthread_setname_np(pthread_self(), "journal-offline");
147
ac2e41f5
VC
148 journal_file_set_offline_internal(f);
149
150 return NULL;
151}
152
153static int journal_file_set_offline_thread_join(JournalFile *f) {
154 int r;
155
156 assert(f);
157
158 if (f->offline_state == OFFLINE_JOINED)
159 return 0;
160
161 r = pthread_join(f->offline_thread, NULL);
162 if (r)
163 return -r;
164
165 f->offline_state = OFFLINE_JOINED;
26687bf8 166
c3bd54bf 167 if (mmap_cache_fd_got_sigbus(f->cache_fd))
fa6ac760
LP
168 return -EIO;
169
ac2e41f5
VC
170 return 0;
171}
26687bf8 172
ac2e41f5
VC
173/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
174static bool journal_file_set_offline_try_restart(JournalFile *f) {
175 for (;;) {
176 switch (f->offline_state) {
177 case OFFLINE_AGAIN_FROM_SYNCING:
178 case OFFLINE_AGAIN_FROM_OFFLINING:
179 return true;
180
181 case OFFLINE_CANCEL:
182 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
183 continue;
184 return true;
185
186 case OFFLINE_SYNCING:
187 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
188 continue;
189 return true;
190
191 case OFFLINE_OFFLINING:
192 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
193 continue;
194 return true;
26687bf8
OS
195
196 default:
ac2e41f5
VC
197 return false;
198 }
26687bf8
OS
199 }
200}
201
ac2e41f5
VC
202/* Sets a journal offline.
203 *
204 * If wait is false then an offline is dispatched in a separate thread for a
205 * subsequent journal_file_set_offline() or journal_file_set_online() of the
206 * same journal to synchronize with.
207 *
208 * If wait is true, then either an existing offline thread will be restarted
209 * and joined, or if none exists the offline is simply performed in this
210 * context without involving another thread.
211 */
212int journal_file_set_offline(JournalFile *f, bool wait) {
bb1296b5 213 int target_state;
ac2e41f5
VC
214 bool restarted;
215 int r;
216
26687bf8
OS
217 assert(f);
218
219 if (!f->writable)
220 return -EPERM;
221
846e5418 222 if (f->fd < 0 || !f->header)
26687bf8
OS
223 return -EINVAL;
224
bb1296b5
VC
225 target_state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
226
b8f99e27 227 /* An offlining journal is implicitly online and may modify f->header->state,
bb1296b5
VC
228 * we must also join any potentially lingering offline thread when already in
229 * the desired offline state.
230 */
231 if (!journal_file_is_offlining(f) && f->header->state == target_state)
b8f99e27 232 return journal_file_set_offline_thread_join(f);
26687bf8 233
ac2e41f5
VC
234 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
235 restarted = journal_file_set_offline_try_restart(f);
236 if ((restarted && wait) || !restarted) {
237 r = journal_file_set_offline_thread_join(f);
238 if (r < 0)
239 return r;
240 }
26687bf8 241
ac2e41f5
VC
242 if (restarted)
243 return 0;
244
245 /* Initiate a new offline. */
246 f->offline_state = OFFLINE_SYNCING;
fa6ac760 247
ac2e41f5
VC
248 if (wait) /* Without using a thread if waiting. */
249 journal_file_set_offline_internal(f);
250 else {
5e9f01e8
LP
251 sigset_t ss, saved_ss;
252 int k;
253
cd2a429e 254 assert_se(sigfillset(&ss) >= 0);
08f9e80b
CM
255 /* Don't block SIGBUS since the offlining thread accesses a memory mapped file.
256 * Asynchronous SIGBUS signals can safely be handled by either thread. */
257 assert_se(sigdelset(&ss, SIGBUS) >= 0);
5e9f01e8
LP
258
259 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
260 if (r > 0)
261 return -r;
262
ac2e41f5 263 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
5e9f01e8
LP
264
265 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
ec9ffa2c
VC
266 if (r > 0) {
267 f->offline_state = OFFLINE_JOINED;
ac2e41f5 268 return -r;
ec9ffa2c 269 }
5e9f01e8
LP
270 if (k > 0)
271 return -k;
ac2e41f5
VC
272 }
273
274 return 0;
275}
276
277static int journal_file_set_online(JournalFile *f) {
83bf6b67 278 bool wait = true;
ac2e41f5
VC
279
280 assert(f);
281
282 if (!f->writable)
283 return -EPERM;
284
846e5418 285 if (f->fd < 0 || !f->header)
ac2e41f5
VC
286 return -EINVAL;
287
83bf6b67 288 while (wait) {
ac2e41f5
VC
289 switch (f->offline_state) {
290 case OFFLINE_JOINED:
291 /* No offline thread, no need to wait. */
83bf6b67 292 wait = false;
ac2e41f5
VC
293 break;
294
295 case OFFLINE_SYNCING:
296 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
297 continue;
298 /* Canceled syncing prior to offlining, no need to wait. */
83bf6b67 299 wait = false;
ac2e41f5
VC
300 break;
301
302 case OFFLINE_AGAIN_FROM_SYNCING:
303 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
304 continue;
305 /* Canceled restart from syncing, no need to wait. */
83bf6b67 306 wait = false;
ac2e41f5
VC
307 break;
308
309 case OFFLINE_AGAIN_FROM_OFFLINING:
310 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
311 continue;
312 /* Canceled restart from offlining, must wait for offlining to complete however. */
4831981d 313 _fallthrough_;
ac2e41f5
VC
314 default: {
315 int r;
316
317 r = journal_file_set_offline_thread_join(f);
318 if (r < 0)
319 return r;
320
83bf6b67 321 wait = false;
ac2e41f5
VC
322 break;
323 }
324 }
325 }
26687bf8 326
c3bd54bf 327 if (mmap_cache_fd_got_sigbus(f->cache_fd))
fa6ac760
LP
328 return -EIO;
329
ac2e41f5
VC
330 switch (f->header->state) {
331 case STATE_ONLINE:
332 return 0;
26687bf8 333
ac2e41f5
VC
334 case STATE_OFFLINE:
335 f->header->state = STATE_ONLINE;
336 (void) fsync(f->fd);
337 return 0;
338
339 default:
340 return -EINVAL;
341 }
26687bf8
OS
342}
343
b58c888f
VC
344bool journal_file_is_offlining(JournalFile *f) {
345 assert(f);
346
347 __sync_synchronize();
348
3742095b 349 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
350 return false;
351
352 return true;
353}
354
804ae586 355JournalFile* journal_file_close(JournalFile *f) {
c377a6f3
YW
356 if (!f)
357 return NULL;
cec736d2 358
349cc4a5 359#if HAVE_GCRYPT
b0af6f41 360 /* Write the final tag */
43cd8794
FB
361 if (f->seal && f->writable) {
362 int r;
363
364 r = journal_file_append_tag(f);
365 if (r < 0)
366 log_error_errno(r, "Failed to append tag when closing journal: %m");
367 }
feb12d3e 368#endif
b0af6f41 369
7a24f3bf 370 if (f->post_change_timer) {
b6cdfbe5
ZJS
371 if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
372 journal_file_post_change(f);
7a24f3bf 373
1d3fe304 374 sd_event_source_disable_unref(f->post_change_timer);
7a24f3bf
VC
375 }
376
ac2e41f5 377 journal_file_set_offline(f, true);
cec736d2 378
be7cdd8e 379 if (f->mmap && f->cache_fd)
c3bd54bf 380 mmap_cache_fd_free(f->cache_fd);
cec736d2 381
11689d2a
LP
382 if (f->fd >= 0 && f->defrag_on_close) {
383
384 /* Be friendly to btrfs: turn COW back on again now,
385 * and defragment the file. We won't write to the file
386 * ever again, hence remove all fragmentation, and
387 * reenable all the good bits COW usually provides
388 * (such as data checksumming). */
389
db9a4254 390 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
11689d2a
LP
391 (void) btrfs_defrag_fd(f->fd);
392 }
f27a3864 393
5d1ce257
LP
394 if (f->close_fd)
395 safe_close(f->fd);
cec736d2 396 free(f->path);
807e17f0 397
f649045c 398 mmap_cache_unref(f->mmap);
16e9f408 399
4743015d 400 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 401
d80b051c 402#if HAVE_COMPRESSION
807e17f0
LP
403 free(f->compress_buffer);
404#endif
405
349cc4a5 406#if HAVE_GCRYPT
baed47c3
LP
407 if (f->fss_file)
408 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 409 else
b7c9ae91
LP
410 free(f->fsprg_state);
411
412 free(f->fsprg_seed);
7560fffc
LP
413
414 if (f->hmac)
415 gcry_md_close(f->hmac);
416#endif
417
6b430fdb 418 return mfree(f);
cec736d2
LP
419}
420
0ac38b70 421static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 422 Header h = {};
cec736d2
LP
423 ssize_t k;
424 int r;
425
426 assert(f);
427
7560fffc 428 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 429 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 430
d89c8fdf
ZJS
431 h.incompatible_flags |= htole32(
432 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
4ce534f4 433 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4 |
8653185a 434 f->compress_zstd * HEADER_INCOMPATIBLE_COMPRESSED_ZSTD |
4ce534f4 435 f->keyed_hash * HEADER_INCOMPATIBLE_KEYED_HASH);
7560fffc 436
d89c8fdf
ZJS
437 h.compatible_flags = htole32(
438 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 439
cec736d2
LP
440 r = sd_id128_randomize(&h.file_id);
441 if (r < 0)
442 return r;
443
0ac38b70
LP
444 if (template) {
445 h.seqnum_id = template->header->seqnum_id;
beec0085 446 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
447 } else
448 h.seqnum_id = h.file_id;
cec736d2
LP
449
450 k = pwrite(f->fd, &h, sizeof(h), 0);
451 if (k < 0)
452 return -errno;
453
454 if (k != sizeof(h))
455 return -EIO;
456
457 return 0;
458}
459
460static int journal_file_refresh_header(JournalFile *f) {
fa6ac760 461 int r;
cec736d2
LP
462
463 assert(f);
c88cc6af 464 assert(f->header);
cec736d2
LP
465
466 r = sd_id128_get_machine(&f->header->machine_id);
fd4885df
ZJS
467 if (IN_SET(r, -ENOENT, -ENOMEDIUM))
468 /* We don't have a machine-id, let's continue without */
469 zero(f->header->machine_id);
470 else if (r < 0)
cec736d2
LP
471 return r;
472
e958c057 473 r = sd_id128_get_boot(&f->header->boot_id);
cec736d2
LP
474 if (r < 0)
475 return r;
476
fa6ac760 477 r = journal_file_set_online(f);
b788cc23 478
bf819d3a
LP
479 /* Sync the online state to disk; likely just created a new file, also sync the directory this file
480 * is located in. */
481 (void) fsync_full(f->fd);
a0fe2a2d 482
fa6ac760 483 return r;
cec736d2
LP
484}
485
4214009f
ZJS
486static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
487 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
488 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
489 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
490 uint32_t flags;
491
4214009f
ZJS
492 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
493
494 if (flags & ~supported) {
495 if (flags & ~any)
4761fd0f 496 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
497 f->path, type, flags & ~any);
498 flags = (flags & any) & ~supported;
4761fd0f 499 if (flags) {
8653185a 500 const char* strv[5];
86e68f38 501 size_t n = 0;
4761fd0f
ZJS
502 _cleanup_free_ char *t = NULL;
503
4ce534f4
LP
504 if (compatible) {
505 if (flags & HEADER_COMPATIBLE_SEALED)
506 strv[n++] = "sealed";
507 } else {
508 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ)
509 strv[n++] = "xz-compressed";
510 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4)
511 strv[n++] = "lz4-compressed";
8653185a
LP
512 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_ZSTD)
513 strv[n++] = "zstd-compressed";
4ce534f4
LP
514 if (flags & HEADER_INCOMPATIBLE_KEYED_HASH)
515 strv[n++] = "keyed-hash";
516 }
4761fd0f
ZJS
517 strv[n] = NULL;
518 assert(n < ELEMENTSOF(strv));
519
520 t = strv_join((char**) strv, ", ");
521 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
522 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
523 }
4214009f
ZJS
524 return true;
525 }
526
527 return false;
528}
529
530static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
531 uint64_t arena_size, header_size;
532
cec736d2 533 assert(f);
c88cc6af 534 assert(f->header);
cec736d2 535
7560fffc 536 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
537 return -EBADMSG;
538
4214009f
ZJS
539 /* In both read and write mode we refuse to open files with incompatible
540 * flags we don't know. */
541 if (warn_wrong_flags(f, false))
cec736d2
LP
542 return -EPROTONOSUPPORT;
543
4214009f
ZJS
544 /* When open for writing we refuse to open files with compatible flags, too. */
545 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 546 return -EPROTONOSUPPORT;
7560fffc 547
db11ac1a
LP
548 if (f->header->state >= _STATE_MAX)
549 return -EBADMSG;
550
893e0f8f 551 header_size = le64toh(READ_NOW(f->header->header_size));
6f94e420 552
dca6219e 553 /* The first addition was n_data, so check that we are at least this large */
6f94e420 554 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
555 return -EBADMSG;
556
8088cbd3 557 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
558 return -EBADMSG;
559
893e0f8f 560 arena_size = le64toh(READ_NOW(f->header->arena_size));
6f94e420
TS
561
562 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
563 return -ENODATA;
564
6f94e420 565 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
566 return -ENODATA;
567
7762e02b
LP
568 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
569 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
570 !VALID64(le64toh(f->header->tail_object_offset)) ||
571 !VALID64(le64toh(f->header->entry_array_offset)))
572 return -ENODATA;
573
cec736d2 574 if (f->writable) {
cec736d2 575 sd_id128_t machine_id;
ae739cc1 576 uint8_t state;
cec736d2
LP
577 int r;
578
579 r = sd_id128_get_machine(&machine_id);
580 if (r < 0)
581 return r;
582
583 if (!sd_id128_equal(machine_id, f->header->machine_id))
584 return -EHOSTDOWN;
585
de190aef 586 state = f->header->state;
cec736d2 587
b288cdeb
ZJS
588 if (state == STATE_ARCHIVED)
589 return -ESHUTDOWN; /* Already archived */
baaa35ad
ZJS
590 else if (state == STATE_ONLINE)
591 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
592 "Journal file %s is already online. Assuming unclean closing.",
593 f->path);
594 else if (state != STATE_OFFLINE)
595 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
596 "Journal file %s has unknown state %i.",
597 f->path, state);
ae739cc1 598
5b3cc0c8
YN
599 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
600 return -EBADMSG;
601
ae739cc1
LP
602 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
603 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
604 * bisection. */
baaa35ad
ZJS
605 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
606 return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
607 "Journal file %s is from the future, refusing to append new data to it that'd be older.",
608 f->path);
cec736d2
LP
609 }
610
d89c8fdf
ZJS
611 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
612 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
8653185a 613 f->compress_zstd = JOURNAL_HEADER_COMPRESSED_ZSTD(f->header);
c586dbf1 614
f1889c91 615 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 616
4ce534f4
LP
617 f->keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header);
618
cec736d2
LP
619 return 0;
620}
621
28ca867a 622int journal_file_fstat(JournalFile *f) {
3cc44114
LP
623 int r;
624
2678031a
LP
625 assert(f);
626 assert(f->fd >= 0);
627
628 if (fstat(f->fd, &f->last_stat) < 0)
629 return -errno;
630
631 f->last_stat_usec = now(CLOCK_MONOTONIC);
632
e9dd6984 633 /* Refuse dealing with files that aren't regular */
3cc44114
LP
634 r = stat_verify_regular(&f->last_stat);
635 if (r < 0)
636 return r;
8d6a4d33 637
2678031a
LP
638 /* Refuse appending to files that are already deleted */
639 if (f->last_stat.st_nlink <= 0)
640 return -EIDRM;
641
642 return 0;
643}
644
cec736d2 645static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
893e0f8f 646 uint64_t old_size, new_size, old_header_size, old_arena_size;
fec2aa2f 647 int r;
cec736d2
LP
648
649 assert(f);
c88cc6af 650 assert(f->header);
cec736d2 651
893e0f8f
LP
652 /* We assume that this file is not sparse, and we know that for sure, since we always call
653 * posix_fallocate() ourselves */
654
655 if (size > PAGE_ALIGN_DOWN(UINT64_MAX) - offset)
656 return -EINVAL;
cec736d2 657
c3bd54bf 658 if (mmap_cache_fd_got_sigbus(f->cache_fd))
fa6ac760
LP
659 return -EIO;
660
893e0f8f
LP
661 old_header_size = le64toh(READ_NOW(f->header->header_size));
662 old_arena_size = le64toh(READ_NOW(f->header->arena_size));
663 if (old_arena_size > PAGE_ALIGN_DOWN(UINT64_MAX) - old_header_size)
664 return -EBADMSG;
665
666 old_size = old_header_size + old_arena_size;
cec736d2 667
893e0f8f 668 new_size = MAX(PAGE_ALIGN(offset + size), old_header_size);
bc85bfee 669
2678031a
LP
670 if (new_size <= old_size) {
671
672 /* We already pre-allocated enough space, but before
673 * we write to it, let's check with fstat() if the
674 * file got deleted, in order make sure we don't throw
675 * away the data immediately. Don't check fstat() for
676 * all writes though, but only once ever 10s. */
677
678 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
679 return 0;
680
681 return journal_file_fstat(f);
682 }
683
684 /* Allocate more space. */
cec736d2 685
a676e665 686 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 687 return -E2BIG;
cec736d2 688
a676e665 689 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
690 struct statvfs svfs;
691
692 if (fstatvfs(f->fd, &svfs) >= 0) {
693 uint64_t available;
694
070052ab 695 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
696
697 if (new_size - old_size > available)
698 return -E2BIG;
699 }
700 }
701
eda4b58b 702 /* Increase by larger blocks at once */
be6b0c21 703 new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
eda4b58b
LP
704 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
705 new_size = f->metrics.max_size;
706
bc85bfee
LP
707 /* Note that the glibc fallocate() fallback is very
708 inefficient, hence we try to minimize the allocation area
709 as we can. */
4c54768c
IZ
710 r = posix_fallocate_loop(f->fd, old_size, new_size - old_size);
711 if (r < 0)
712 return r;
cec736d2 713
893e0f8f 714 f->header->arena_size = htole64(new_size - old_header_size);
cec736d2 715
2678031a 716 return journal_file_fstat(f);
cec736d2
LP
717}
718
78519831 719static unsigned type_to_context(ObjectType type) {
d3d3208f 720 /* One context for each type, plus one catch-all for the rest */
69adae51 721 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 722 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 723 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
724}
725
71139898
LP
726static int journal_file_move_to(
727 JournalFile *f,
728 ObjectType type,
729 bool keep_always,
730 uint64_t offset,
731 uint64_t size,
258190a0 732 void **ret) {
71139898 733
2678031a
LP
734 int r;
735
cec736d2 736 assert(f);
cec736d2
LP
737 assert(ret);
738
7762e02b
LP
739 if (size <= 0)
740 return -EINVAL;
741
893e0f8f
LP
742 if (size > UINT64_MAX - offset)
743 return -EBADMSG;
744
2a59ea54 745 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
746 if (offset + size > (uint64_t) f->last_stat.st_size) {
747 /* Hmm, out of range? Let's refresh the fstat() data
748 * first, before we trust that check. */
749
2678031a
LP
750 r = journal_file_fstat(f);
751 if (r < 0)
752 return r;
753
754 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
755 return -EADDRNOTAVAIL;
756 }
757
c3bd54bf 758 return mmap_cache_fd_get(f->cache_fd, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
759}
760
16e9f408
LP
761static uint64_t minimum_header_size(Object *o) {
762
b8e891e6 763 static const uint64_t table[] = {
16e9f408
LP
764 [OBJECT_DATA] = sizeof(DataObject),
765 [OBJECT_FIELD] = sizeof(FieldObject),
766 [OBJECT_ENTRY] = sizeof(EntryObject),
767 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
768 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
769 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
770 [OBJECT_TAG] = sizeof(TagObject),
771 };
772
773 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
774 return sizeof(ObjectHeader);
775
776 return table[o->object.type];
777}
778
24754f36
TR
779/* Lightweight object checks. We want this to be fast, so that we won't
780 * slowdown every journal_file_move_to_object() call too much. */
781static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
782 assert(f);
783 assert(o);
784
785 switch (o->object.type) {
786
a602d93e 787 case OBJECT_DATA:
baaa35ad
ZJS
788 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
789 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
790 "Bad n_entries: %" PRIu64 ": %" PRIu64,
791 le64toh(o->data.n_entries),
792 offset);
793
20ee282b 794 if (le64toh(o->object.size) <= offsetof(DataObject, payload))
baaa35ad
ZJS
795 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
796 "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
797 offsetof(DataObject, payload),
798 le64toh(o->object.size),
799 offset);
24754f36 800
10e8445b
TR
801 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
802 !VALID64(le64toh(o->data.next_field_offset)) ||
803 !VALID64(le64toh(o->data.entry_offset)) ||
baaa35ad
ZJS
804 !VALID64(le64toh(o->data.entry_array_offset)))
805 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
806 "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
807 le64toh(o->data.next_hash_offset),
808 le64toh(o->data.next_field_offset),
809 le64toh(o->data.entry_offset),
810 le64toh(o->data.entry_array_offset),
811 offset);
24754f36
TR
812
813 break;
24754f36
TR
814
815 case OBJECT_FIELD:
20ee282b 816 if (le64toh(o->object.size) <= offsetof(FieldObject, payload))
baaa35ad
ZJS
817 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
818 "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
819 offsetof(FieldObject, payload),
820 le64toh(o->object.size),
821 offset);
24754f36 822
10e8445b 823 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
baaa35ad
ZJS
824 !VALID64(le64toh(o->field.head_data_offset)))
825 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
826 "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
827 le64toh(o->field.next_hash_offset),
828 le64toh(o->field.head_data_offset),
829 offset);
24754f36
TR
830 break;
831
893e0f8f
LP
832 case OBJECT_ENTRY: {
833 uint64_t sz;
834
835 sz = le64toh(READ_NOW(o->object.size));
836 if (sz < offsetof(EntryObject, items) ||
837 (sz - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
baaa35ad
ZJS
838 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
839 "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
840 offsetof(EntryObject, items),
893e0f8f 841 sz,
baaa35ad
ZJS
842 offset);
843
893e0f8f 844 if ((sz - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
baaa35ad
ZJS
845 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
846 "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
893e0f8f 847 (sz - offsetof(EntryObject, items)) / sizeof(EntryItem),
baaa35ad
ZJS
848 offset);
849
850 if (le64toh(o->entry.seqnum) <= 0)
851 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
852 "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
853 le64toh(o->entry.seqnum),
854 offset);
855
856 if (!VALID_REALTIME(le64toh(o->entry.realtime)))
857 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
858 "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
859 le64toh(o->entry.realtime),
860 offset);
861
862 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
863 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
864 "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
865 le64toh(o->entry.monotonic),
866 offset);
24754f36
TR
867
868 break;
893e0f8f 869 }
24754f36
TR
870
871 case OBJECT_DATA_HASH_TABLE:
893e0f8f
LP
872 case OBJECT_FIELD_HASH_TABLE: {
873 uint64_t sz;
874
875 sz = le64toh(READ_NOW(o->object.size));
876 if (sz < offsetof(HashTableObject, items) ||
877 (sz - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
878 (sz - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0)
baaa35ad
ZJS
879 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
880 "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
881 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
893e0f8f 882 sz,
baaa35ad 883 offset);
24754f36
TR
884
885 break;
893e0f8f 886 }
24754f36 887
893e0f8f
LP
888 case OBJECT_ENTRY_ARRAY: {
889 uint64_t sz;
890
891 sz = le64toh(READ_NOW(o->object.size));
892 if (sz < offsetof(EntryArrayObject, items) ||
893 (sz - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
894 (sz - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0)
baaa35ad
ZJS
895 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
896 "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
893e0f8f 897 sz,
baaa35ad
ZJS
898 offset);
899
900 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
901 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
902 "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
903 le64toh(o->entry_array.next_entry_array_offset),
904 offset);
24754f36
TR
905
906 break;
893e0f8f 907 }
24754f36
TR
908
909 case OBJECT_TAG:
baaa35ad
ZJS
910 if (le64toh(o->object.size) != sizeof(TagObject))
911 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
912 "Invalid object tag size: %" PRIu64 ": %" PRIu64,
913 le64toh(o->object.size),
914 offset);
24754f36 915
baaa35ad
ZJS
916 if (!VALID_EPOCH(le64toh(o->tag.epoch)))
917 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
918 "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
919 le64toh(o->tag.epoch), offset);
24754f36
TR
920
921 break;
922 }
923
924 return 0;
925}
926
78519831 927int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
928 int r;
929 void *t;
930 Object *o;
931 uint64_t s;
932
933 assert(f);
934 assert(ret);
935
db11ac1a 936 /* Objects may only be located at multiple of 64 bit */
baaa35ad
ZJS
937 if (!VALID64(offset))
938 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
939 "Attempt to move to object at non-64bit boundary: %" PRIu64,
940 offset);
db11ac1a 941
50809d7a 942 /* Object may not be located in the file header */
baaa35ad
ZJS
943 if (offset < le64toh(f->header->header_size))
944 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
945 "Attempt to move to object located in file header: %" PRIu64,
946 offset);
50809d7a 947
258190a0 948 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
949 if (r < 0)
950 return r;
951
952 o = (Object*) t;
893e0f8f 953 s = le64toh(READ_NOW(o->object.size));
cec736d2 954
baaa35ad
ZJS
955 if (s == 0)
956 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
957 "Attempt to move to uninitialized object: %" PRIu64,
958 offset);
959 if (s < sizeof(ObjectHeader))
960 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
961 "Attempt to move to overly short object: %" PRIu64,
962 offset);
963
964 if (o->object.type <= OBJECT_UNUSED)
965 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
966 "Attempt to move to object with invalid type: %" PRIu64,
967 offset);
968
969 if (s < minimum_header_size(o))
970 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
971 "Attempt to move to truncated object: %" PRIu64,
972 offset);
973
974 if (type > OBJECT_UNUSED && o->object.type != type)
975 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
976 "Attempt to move to object of unexpected type: %" PRIu64,
977 offset);
cec736d2 978
258190a0
VC
979 r = journal_file_move_to(f, type, false, offset, s, &t);
980 if (r < 0)
981 return r;
cec736d2 982
258190a0 983 o = (Object*) t;
cec736d2 984
24754f36
TR
985 r = journal_file_check_object(f, offset, o);
986 if (r < 0)
987 return r;
988
cec736d2
LP
989 *ret = o;
990 return 0;
991}
992
0eaee828
LP
993static uint64_t journal_file_entry_seqnum(
994 JournalFile *f,
995 uint64_t *seqnum) {
996
997 uint64_t ret;
cec736d2
LP
998
999 assert(f);
c88cc6af 1000 assert(f->header);
cec736d2 1001
0eaee828
LP
1002 /* Picks a new sequence number for the entry we are about to add and returns it. */
1003
1004 ret = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
1005
1006 if (seqnum) {
0eaee828
LP
1007 /* If an external seqnum counter was passed, we update both the local and the external one,
1008 * and set it to the maximum of both */
c2373f84 1009
0eaee828
LP
1010 if (*seqnum + 1 > ret)
1011 ret = *seqnum + 1;
c2373f84 1012
0eaee828 1013 *seqnum = ret;
c2373f84
LP
1014 }
1015
0eaee828 1016 f->header->tail_entry_seqnum = htole64(ret);
cec736d2 1017
beec0085 1018 if (f->header->head_entry_seqnum == 0)
0eaee828 1019 f->header->head_entry_seqnum = htole64(ret);
de190aef 1020
0eaee828
LP
1021 return ret;
1022}
1023
f4474e00
LP
1024int journal_file_append_object(
1025 JournalFile *f,
1026 ObjectType type,
1027 uint64_t size,
1028 Object **ret,
1029 uint64_t *ret_offset) {
1030
cec736d2
LP
1031 int r;
1032 uint64_t p;
0aa649b1 1033 Object *tail, *o;
cec736d2
LP
1034 void *t;
1035
1036 assert(f);
c88cc6af 1037 assert(f->header);
d05089d8 1038 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2 1039 assert(size >= sizeof(ObjectHeader));
cec736d2 1040
26687bf8
OS
1041 r = journal_file_set_online(f);
1042 if (r < 0)
1043 return r;
1044
0aa649b1
YW
1045 p = le64toh(f->header->tail_object_offset);
1046 if (p == 0)
1047 p = le64toh(f->header->header_size);
1048 else {
1049 uint64_t sz;
1050
1051 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
1052 if (r < 0)
1053 return r;
1054
1055 sz = le64toh(READ_NOW(tail->object.size));
1056 if (sz > UINT64_MAX - sizeof(uint64_t) + 1)
1057 return -EBADMSG;
1058
1059 sz = ALIGN64(sz);
1060 if (p > UINT64_MAX - sz)
1061 return -EBADMSG;
1062
1063 p += sz;
1064 }
cec736d2
LP
1065
1066 r = journal_file_allocate(f, p, size);
1067 if (r < 0)
1068 return r;
1069
258190a0 1070 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
1071 if (r < 0)
1072 return r;
1073
1074 o = (Object*) t;
71139898
LP
1075 o->object = (ObjectHeader) {
1076 .type = type,
1077 .size = htole64(size),
1078 };
cec736d2
LP
1079
1080 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1081 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1082
f4474e00
LP
1083 if (ret)
1084 *ret = o;
1085
1086 if (ret_offset)
1087 *ret_offset = p;
cec736d2
LP
1088
1089 return 0;
1090}
1091
de190aef 1092static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1093 uint64_t s, p;
1094 Object *o;
1095 int r;
1096
1097 assert(f);
c88cc6af 1098 assert(f->header);
cec736d2 1099
070052ab
LP
1100 /* We estimate that we need 1 hash table entry per 768 bytes
1101 of journal file and we want to make sure we never get
1102 beyond 75% fill level. Calculate the hash table size for
1103 the maximum file size based on these metrics. */
4a92baf3 1104
dfabe643 1105 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1106 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1107 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1108
5030c85a 1109 log_debug("Reserving %"PRIu64" entries in data hash table.", s / sizeof(HashItem));
4a92baf3 1110
de190aef
LP
1111 r = journal_file_append_object(f,
1112 OBJECT_DATA_HASH_TABLE,
1113 offsetof(Object, hash_table.items) + s,
1114 &o, &p);
cec736d2
LP
1115 if (r < 0)
1116 return r;
1117
29804cc1 1118 memzero(o->hash_table.items, s);
cec736d2 1119
de190aef
LP
1120 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1121 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1122
1123 return 0;
1124}
1125
de190aef 1126static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1127 uint64_t s, p;
1128 Object *o;
1129 int r;
1130
1131 assert(f);
c88cc6af 1132 assert(f->header);
cec736d2 1133
3c1668da
LP
1134 /* We use a fixed size hash table for the fields as this
1135 * number should grow very slowly only */
1136
de190aef 1137 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
5030c85a
LP
1138 log_debug("Reserving %"PRIu64" entries in field hash table.", s / sizeof(HashItem));
1139
de190aef
LP
1140 r = journal_file_append_object(f,
1141 OBJECT_FIELD_HASH_TABLE,
1142 offsetof(Object, hash_table.items) + s,
1143 &o, &p);
cec736d2
LP
1144 if (r < 0)
1145 return r;
1146
29804cc1 1147 memzero(o->hash_table.items, s);
cec736d2 1148
de190aef
LP
1149 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1150 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1151
1152 return 0;
1153}
1154
dade37d4 1155int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1156 uint64_t s, p;
1157 void *t;
1158 int r;
1159
1160 assert(f);
c88cc6af 1161 assert(f->header);
cec736d2 1162
dade37d4
LP
1163 if (f->data_hash_table)
1164 return 0;
1165
de190aef
LP
1166 p = le64toh(f->header->data_hash_table_offset);
1167 s = le64toh(f->header->data_hash_table_size);
cec736d2 1168
de190aef 1169 r = journal_file_move_to(f,
16e9f408 1170 OBJECT_DATA_HASH_TABLE,
fcde2389 1171 true,
de190aef 1172 p, s,
258190a0 1173 &t);
cec736d2
LP
1174 if (r < 0)
1175 return r;
1176
de190aef 1177 f->data_hash_table = t;
cec736d2
LP
1178 return 0;
1179}
1180
dade37d4 1181int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1182 uint64_t s, p;
1183 void *t;
1184 int r;
1185
1186 assert(f);
c88cc6af 1187 assert(f->header);
cec736d2 1188
dade37d4
LP
1189 if (f->field_hash_table)
1190 return 0;
1191
de190aef
LP
1192 p = le64toh(f->header->field_hash_table_offset);
1193 s = le64toh(f->header->field_hash_table_size);
cec736d2 1194
de190aef 1195 r = journal_file_move_to(f,
16e9f408 1196 OBJECT_FIELD_HASH_TABLE,
fcde2389 1197 true,
de190aef 1198 p, s,
258190a0 1199 &t);
cec736d2
LP
1200 if (r < 0)
1201 return r;
1202
de190aef 1203 f->field_hash_table = t;
cec736d2
LP
1204 return 0;
1205}
1206
3c1668da
LP
1207static int journal_file_link_field(
1208 JournalFile *f,
1209 Object *o,
1210 uint64_t offset,
1211 uint64_t hash) {
1212
805d1486 1213 uint64_t p, h, m;
3c1668da
LP
1214 int r;
1215
1216 assert(f);
c88cc6af 1217 assert(f->header);
90d222c1 1218 assert(f->field_hash_table);
3c1668da
LP
1219 assert(o);
1220 assert(offset > 0);
1221
1222 if (o->object.type != OBJECT_FIELD)
1223 return -EINVAL;
1224
893e0f8f 1225 m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
805d1486
LP
1226 if (m <= 0)
1227 return -EBADMSG;
3c1668da 1228
805d1486 1229 /* This might alter the window we are looking at */
3c1668da
LP
1230 o->field.next_hash_offset = o->field.head_data_offset = 0;
1231
805d1486 1232 h = hash % m;
3c1668da
LP
1233 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1234 if (p == 0)
1235 f->field_hash_table[h].head_hash_offset = htole64(offset);
1236 else {
1237 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1238 if (r < 0)
1239 return r;
1240
1241 o->field.next_hash_offset = htole64(offset);
1242 }
1243
1244 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1245
1246 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1247 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1248
1249 return 0;
1250}
1251
1252static int journal_file_link_data(
1253 JournalFile *f,
1254 Object *o,
1255 uint64_t offset,
1256 uint64_t hash) {
1257
805d1486 1258 uint64_t p, h, m;
cec736d2
LP
1259 int r;
1260
1261 assert(f);
c88cc6af 1262 assert(f->header);
90d222c1 1263 assert(f->data_hash_table);
cec736d2
LP
1264 assert(o);
1265 assert(offset > 0);
b588975f
LP
1266
1267 if (o->object.type != OBJECT_DATA)
1268 return -EINVAL;
cec736d2 1269
893e0f8f 1270 m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
805d1486
LP
1271 if (m <= 0)
1272 return -EBADMSG;
48496df6 1273
805d1486 1274 /* This might alter the window we are looking at */
de190aef
LP
1275 o->data.next_hash_offset = o->data.next_field_offset = 0;
1276 o->data.entry_offset = o->data.entry_array_offset = 0;
1277 o->data.n_entries = 0;
cec736d2 1278
805d1486 1279 h = hash % m;
8db4213e 1280 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1281 if (p == 0)
cec736d2 1282 /* Only entry in the hash table is easy */
de190aef 1283 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1284 else {
48496df6
LP
1285 /* Move back to the previous data object, to patch in
1286 * pointer */
cec736d2 1287
de190aef 1288 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1289 if (r < 0)
1290 return r;
1291
de190aef 1292 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1293 }
1294
de190aef 1295 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1296
dca6219e
LP
1297 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1298 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1299
cec736d2
LP
1300 return 0;
1301}
1302
0dbe57ee
LP
1303static int next_hash_offset(
1304 JournalFile *f,
1305 uint64_t *p,
1306 le64_t *next_hash_offset,
1307 uint64_t *depth,
1308 le64_t *header_max_depth) {
1309
1310 uint64_t nextp;
1311
1312 nextp = le64toh(READ_NOW(*next_hash_offset));
1313 if (nextp > 0) {
1314 if (nextp <= *p) /* Refuse going in loops */
1315 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1316 "Detected hash item loop in %s, refusing.", f->path);
1317
1318 (*depth)++;
1319
1320 /* If the depth of this hash chain is larger than all others we have seen so far, record it */
1321 if (header_max_depth && f->writable)
1322 *header_max_depth = htole64(MAX(*depth, le64toh(*header_max_depth)));
1323 }
1324
1325 *p = nextp;
1326 return 0;
1327}
1328
3c1668da
LP
1329int journal_file_find_field_object_with_hash(
1330 JournalFile *f,
1331 const void *field, uint64_t size, uint64_t hash,
f4474e00 1332 Object **ret, uint64_t *ret_offset) {
3c1668da 1333
0dbe57ee 1334 uint64_t p, osize, h, m, depth = 0;
3c1668da
LP
1335 int r;
1336
1337 assert(f);
c88cc6af 1338 assert(f->header);
3c1668da
LP
1339 assert(field && size > 0);
1340
dade37d4
LP
1341 /* If the field hash table is empty, we can't find anything */
1342 if (le64toh(f->header->field_hash_table_size) <= 0)
1343 return 0;
1344
1345 /* Map the field hash table, if it isn't mapped yet. */
1346 r = journal_file_map_field_hash_table(f);
1347 if (r < 0)
1348 return r;
1349
3c1668da
LP
1350 osize = offsetof(Object, field.payload) + size;
1351
893e0f8f 1352 m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
805d1486 1353 if (m <= 0)
3c1668da
LP
1354 return -EBADMSG;
1355
805d1486 1356 h = hash % m;
3c1668da 1357 p = le64toh(f->field_hash_table[h].head_hash_offset);
3c1668da
LP
1358 while (p > 0) {
1359 Object *o;
1360
1361 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1362 if (r < 0)
1363 return r;
1364
1365 if (le64toh(o->field.hash) == hash &&
1366 le64toh(o->object.size) == osize &&
1367 memcmp(o->field.payload, field, size) == 0) {
1368
1369 if (ret)
1370 *ret = o;
f4474e00
LP
1371 if (ret_offset)
1372 *ret_offset = p;
3c1668da
LP
1373
1374 return 1;
1375 }
1376
0dbe57ee
LP
1377 r = next_hash_offset(
1378 f,
1379 &p,
1380 &o->field.next_hash_offset,
1381 &depth,
1382 JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) ? &f->header->field_hash_chain_depth : NULL);
1383 if (r < 0)
1384 return r;
3c1668da
LP
1385 }
1386
1387 return 0;
1388}
1389
4ce534f4
LP
1390uint64_t journal_file_hash_data(
1391 JournalFile *f,
1392 const void *data,
1393 size_t sz) {
1394
1395 assert(f);
1396 assert(data || sz == 0);
1397
1398 /* We try to unify our codebase on siphash, hence new-styled journal files utilizing the keyed hash
1399 * function use siphash. Old journal files use the Jenkins hash. */
1400
1401 if (JOURNAL_HEADER_KEYED_HASH(f->header))
1402 return siphash24(data, sz, f->header->file_id.bytes);
1403
1404 return jenkins_hash64(data, sz);
1405}
1406
3c1668da
LP
1407int journal_file_find_field_object(
1408 JournalFile *f,
1409 const void *field, uint64_t size,
f4474e00 1410 Object **ret, uint64_t *ret_offset) {
3c1668da 1411
3c1668da
LP
1412 assert(f);
1413 assert(field && size > 0);
1414
f4474e00
LP
1415 return journal_file_find_field_object_with_hash(
1416 f,
4ce534f4
LP
1417 field, size,
1418 journal_file_hash_data(f, field, size),
f4474e00 1419 ret, ret_offset);
3c1668da
LP
1420}
1421
de190aef
LP
1422int journal_file_find_data_object_with_hash(
1423 JournalFile *f,
1424 const void *data, uint64_t size, uint64_t hash,
f4474e00 1425 Object **ret, uint64_t *ret_offset) {
48496df6 1426
0dbe57ee 1427 uint64_t p, osize, h, m, depth = 0;
cec736d2
LP
1428 int r;
1429
1430 assert(f);
c88cc6af 1431 assert(f->header);
cec736d2
LP
1432 assert(data || size == 0);
1433
dade37d4
LP
1434 /* If there's no data hash table, then there's no entry. */
1435 if (le64toh(f->header->data_hash_table_size) <= 0)
1436 return 0;
1437
1438 /* Map the data hash table, if it isn't mapped yet. */
1439 r = journal_file_map_data_hash_table(f);
1440 if (r < 0)
1441 return r;
1442
cec736d2
LP
1443 osize = offsetof(Object, data.payload) + size;
1444
893e0f8f 1445 m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
805d1486 1446 if (m <= 0)
bc85bfee
LP
1447 return -EBADMSG;
1448
805d1486 1449 h = hash % m;
de190aef 1450 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1451
de190aef
LP
1452 while (p > 0) {
1453 Object *o;
cec736d2 1454
de190aef 1455 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1456 if (r < 0)
1457 return r;
1458
807e17f0 1459 if (le64toh(o->data.hash) != hash)
85a131e8 1460 goto next;
807e17f0 1461
d89c8fdf 1462 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
d80b051c 1463#if HAVE_COMPRESSION
fa1c4b51 1464 uint64_t l;
a7f7d1bd 1465 size_t rsize = 0;
cec736d2 1466
893e0f8f 1467 l = le64toh(READ_NOW(o->object.size));
807e17f0 1468 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1469 return -EBADMSG;
1470
807e17f0
LP
1471 l -= offsetof(Object, data.payload);
1472
d89c8fdf 1473 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
319a4f4b 1474 o->data.payload, l, &f->compress_buffer, &rsize, 0);
d89c8fdf
ZJS
1475 if (r < 0)
1476 return r;
807e17f0 1477
b785c858 1478 if (rsize == size &&
807e17f0
LP
1479 memcmp(f->compress_buffer, data, size) == 0) {
1480
1481 if (ret)
1482 *ret = o;
1483
f4474e00
LP
1484 if (ret_offset)
1485 *ret_offset = p;
807e17f0
LP
1486
1487 return 1;
1488 }
3b1a55e1
ZJS
1489#else
1490 return -EPROTONOSUPPORT;
1491#endif
807e17f0
LP
1492 } else if (le64toh(o->object.size) == osize &&
1493 memcmp(o->data.payload, data, size) == 0) {
1494
cec736d2
LP
1495 if (ret)
1496 *ret = o;
1497
f4474e00
LP
1498 if (ret_offset)
1499 *ret_offset = p;
cec736d2 1500
de190aef 1501 return 1;
cec736d2
LP
1502 }
1503
85a131e8 1504 next:
0dbe57ee
LP
1505 r = next_hash_offset(
1506 f,
1507 &p,
1508 &o->data.next_hash_offset,
1509 &depth,
1510 JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) ? &f->header->data_hash_chain_depth : NULL);
1511 if (r < 0)
1512 return r;
cec736d2
LP
1513 }
1514
de190aef
LP
1515 return 0;
1516}
1517
1518int journal_file_find_data_object(
1519 JournalFile *f,
1520 const void *data, uint64_t size,
f4474e00 1521 Object **ret, uint64_t *ret_offset) {
de190aef 1522
de190aef
LP
1523 assert(f);
1524 assert(data || size == 0);
1525
f4474e00
LP
1526 return journal_file_find_data_object_with_hash(
1527 f,
4ce534f4
LP
1528 data, size,
1529 journal_file_hash_data(f, data, size),
f4474e00 1530 ret, ret_offset);
de190aef
LP
1531}
1532
/* Checks whether the first 'l' bytes of 'p' form a valid journal field name. If 'l' is SIZE_MAX
 * the length is taken from strlen(p).
 *
 * This more or less enforces the POSIX syntax recommendations for environment variables, with a
 * couple of additional requirements:
 *
 * http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html
 *
 * A valid name is 1 to 64 characters long, consists only of A-Z, 0-9 and '_', and does not start
 * with a digit. Names starting with '_' are only accepted if 'allow_protected' is true. */
bool journal_field_valid(const char *p, size_t l, bool allow_protected) {
        char first;

        if (l == SIZE_MAX)
                l = strlen(p);

        /* Neither empty names nor names longer than 64 characters are allowed */
        if (l == 0 || l > 64)
                return false;

        first = p[0];

        /* Fields starting with an underscore are protected */
        if (first == '_' && !allow_protected)
                return false;

        /* The first character may not be a digit */
        if (first >= '0' && first <= '9')
                return false;

        /* Everything must be an upper-case letter, a digit or an underscore */
        for (size_t i = 0; i < l; i++) {
                char c = p[i];
                bool upper = c >= 'A' && c <= 'Z';
                bool digit = c >= '0' && c <= '9';

                if (!upper && !digit && c != '_')
                        return false;
        }

        return true;
}
1568
3c1668da
LP
1569static int journal_file_append_field(
1570 JournalFile *f,
1571 const void *field, uint64_t size,
f4474e00 1572 Object **ret, uint64_t *ret_offset) {
3c1668da
LP
1573
1574 uint64_t hash, p;
1575 uint64_t osize;
1576 Object *o;
1577 int r;
1578
1579 assert(f);
1580 assert(field && size > 0);
1581
f2bd0320
YW
1582 if (!journal_field_valid(field, size, true))
1583 return -EBADMSG;
1584
4ce534f4 1585 hash = journal_file_hash_data(f, field, size);
3c1668da
LP
1586
1587 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1588 if (r < 0)
1589 return r;
8d5a1082 1590 if (r > 0) {
3c1668da
LP
1591
1592 if (ret)
1593 *ret = o;
1594
f4474e00
LP
1595 if (ret_offset)
1596 *ret_offset = p;
3c1668da
LP
1597
1598 return 0;
1599 }
1600
1601 osize = offsetof(Object, field.payload) + size;
1602 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1603 if (r < 0)
1604 return r;
3c1668da
LP
1605
1606 o->field.hash = htole64(hash);
1607 memcpy(o->field.payload, field, size);
1608
1609 r = journal_file_link_field(f, o, p, hash);
1610 if (r < 0)
1611 return r;
1612
1613 /* The linking might have altered the window, so let's
1614 * refresh our pointer */
1615 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1616 if (r < 0)
1617 return r;
1618
349cc4a5 1619#if HAVE_GCRYPT
3c1668da
LP
1620 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1621 if (r < 0)
1622 return r;
1623#endif
1624
1625 if (ret)
1626 *ret = o;
1627
f4474e00
LP
1628 if (ret_offset)
1629 *ret_offset = p;
3c1668da
LP
1630
1631 return 0;
1632}
1633
48496df6
LP
1634static int journal_file_append_data(
1635 JournalFile *f,
1636 const void *data, uint64_t size,
f4474e00 1637 Object **ret, uint64_t *ret_offset) {
48496df6 1638
bc6b326d
DDM
1639 uint64_t hash, p, fp, osize;
1640 Object *o, *fo;
d89c8fdf 1641 int r, compression = 0;
3c1668da 1642 const void *eq;
de190aef
LP
1643
1644 assert(f);
bc6b326d
DDM
1645
1646 if (!data || size == 0)
1647 return -EINVAL;
de190aef 1648
4ce534f4 1649 hash = journal_file_hash_data(f, data, size);
de190aef
LP
1650
1651 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1652 if (r < 0)
1653 return r;
0240c603 1654 if (r > 0) {
de190aef
LP
1655
1656 if (ret)
1657 *ret = o;
1658
f4474e00
LP
1659 if (ret_offset)
1660 *ret_offset = p;
de190aef
LP
1661
1662 return 0;
1663 }
1664
bc6b326d
DDM
1665 eq = memchr(data, '=', size);
1666 if (!eq)
1667 return -EINVAL;
1668
de190aef
LP
1669 osize = offsetof(Object, data.payload) + size;
1670 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1671 if (r < 0)
1672 return r;
1673
cec736d2 1674 o->data.hash = htole64(hash);
807e17f0 1675
d80b051c 1676#if HAVE_COMPRESSION
57850536 1677 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
a7f7d1bd 1678 size_t rsize = 0;
807e17f0 1679
5d6f46b6 1680 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1681
d1afbcd2 1682 if (compression >= 0) {
807e17f0 1683 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1684 o->object.flags |= compression;
807e17f0 1685
fa1c4b51 1686 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1687 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1688 } else
1689 /* Compression didn't work, we don't really care why, let's continue without compression */
1690 compression = 0;
807e17f0
LP
1691 }
1692#endif
1693
75f32f04
ZJS
1694 if (compression == 0)
1695 memcpy_safe(o->data.payload, data, size);
cec736d2 1696
de190aef 1697 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1698 if (r < 0)
1699 return r;
1700
349cc4a5 1701#if HAVE_GCRYPT
33685a5a
FB
1702 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1703 if (r < 0)
1704 return r;
1705#endif
1706
48496df6
LP
1707 /* The linking might have altered the window, so let's
1708 * refresh our pointer */
1709 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1710 if (r < 0)
1711 return r;
1712
bc6b326d
DDM
1713 /* Create field object ... */
1714 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1715 if (r < 0)
1716 return r;
3c1668da 1717
bc6b326d
DDM
1718 /* ... and link it in. */
1719 o->data.next_field_offset = fo->field.head_data_offset;
1720 fo->field.head_data_offset = le64toh(p);
3c1668da 1721
cec736d2
LP
1722 if (ret)
1723 *ret = o;
1724
f4474e00
LP
1725 if (ret_offset)
1726 *ret_offset = p;
cec736d2
LP
1727
1728 return 0;
1729}
1730
1731uint64_t journal_file_entry_n_items(Object *o) {
893e0f8f 1732 uint64_t sz;
cec736d2 1733 assert(o);
b588975f
LP
1734
1735 if (o->object.type != OBJECT_ENTRY)
1736 return 0;
cec736d2 1737
893e0f8f
LP
1738 sz = le64toh(READ_NOW(o->object.size));
1739 if (sz < offsetof(Object, entry.items))
1740 return 0;
1741
1742 return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem);
cec736d2
LP
1743}
1744
0284adc6 1745uint64_t journal_file_entry_array_n_items(Object *o) {
893e0f8f
LP
1746 uint64_t sz;
1747
de190aef 1748 assert(o);
b588975f
LP
1749
1750 if (o->object.type != OBJECT_ENTRY_ARRAY)
1751 return 0;
de190aef 1752
893e0f8f
LP
1753 sz = le64toh(READ_NOW(o->object.size));
1754 if (sz < offsetof(Object, entry_array.items))
1755 return 0;
1756
1757 return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
de190aef
LP
1758}
1759
fb9a24b6 1760uint64_t journal_file_hash_table_n_items(Object *o) {
893e0f8f
LP
1761 uint64_t sz;
1762
fb9a24b6 1763 assert(o);
b588975f 1764
ec2ce0c5 1765 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1766 return 0;
fb9a24b6 1767
893e0f8f
LP
1768 sz = le64toh(READ_NOW(o->object.size));
1769 if (sz < offsetof(Object, hash_table.items))
1770 return 0;
1771
1772 return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem);
fb9a24b6
LP
1773}
1774
de190aef 1775static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1776 le64_t *first,
1777 le64_t *idx,
de190aef 1778 uint64_t p) {
cec736d2 1779 int r;
de190aef
LP
1780 uint64_t n = 0, ap = 0, q, i, a, hidx;
1781 Object *o;
1782
cec736d2 1783 assert(f);
c88cc6af 1784 assert(f->header);
de190aef
LP
1785 assert(first);
1786 assert(idx);
1787 assert(p > 0);
cec736d2 1788
de190aef 1789 a = le64toh(*first);
893e0f8f 1790 i = hidx = le64toh(READ_NOW(*idx));
de190aef
LP
1791 while (a > 0) {
1792
1793 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1794 if (r < 0)
1795 return r;
cec736d2 1796
de190aef
LP
1797 n = journal_file_entry_array_n_items(o);
1798 if (i < n) {
1799 o->entry_array.items[i] = htole64(p);
1800 *idx = htole64(hidx + 1);
1801 return 0;
1802 }
cec736d2 1803
de190aef
LP
1804 i -= n;
1805 ap = a;
1806 a = le64toh(o->entry_array.next_entry_array_offset);
1807 }
1808
1809 if (hidx > n)
1810 n = (hidx+1) * 2;
1811 else
1812 n = n * 2;
1813
1814 if (n < 4)
1815 n = 4;
1816
1817 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1818 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1819 &o, &q);
cec736d2
LP
1820 if (r < 0)
1821 return r;
1822
349cc4a5 1823#if HAVE_GCRYPT
5996c7c2 1824 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1825 if (r < 0)
1826 return r;
feb12d3e 1827#endif
b0af6f41 1828
de190aef 1829 o->entry_array.items[i] = htole64(p);
cec736d2 1830
de190aef 1831 if (ap == 0)
7be3aa17 1832 *first = htole64(q);
cec736d2 1833 else {
de190aef 1834 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1835 if (r < 0)
1836 return r;
1837
de190aef
LP
1838 o->entry_array.next_entry_array_offset = htole64(q);
1839 }
cec736d2 1840
2dee23eb
LP
1841 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1842 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1843
de190aef
LP
1844 *idx = htole64(hidx + 1);
1845
1846 return 0;
1847}
cec736d2 1848
de190aef 1849static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1850 le64_t *extra,
1851 le64_t *first,
1852 le64_t *idx,
de190aef
LP
1853 uint64_t p) {
1854
893e0f8f 1855 uint64_t hidx;
de190aef
LP
1856 int r;
1857
1858 assert(f);
1859 assert(extra);
1860 assert(first);
1861 assert(idx);
1862 assert(p > 0);
1863
893e0f8f
LP
1864 hidx = le64toh(READ_NOW(*idx));
1865 if (hidx == UINT64_MAX)
1866 return -EBADMSG;
1867 if (hidx == 0)
de190aef
LP
1868 *extra = htole64(p);
1869 else {
4fd052ae 1870 le64_t i;
de190aef 1871
893e0f8f 1872 i = htole64(hidx - 1);
de190aef
LP
1873 r = link_entry_into_array(f, first, &i, p);
1874 if (r < 0)
1875 return r;
cec736d2
LP
1876 }
1877
893e0f8f 1878 *idx = htole64(hidx + 1);
de190aef
LP
1879 return 0;
1880}
1881
1882static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1883 uint64_t p;
1884 int r;
bfbd5be0 1885
de190aef
LP
1886 assert(f);
1887 assert(o);
1888 assert(offset > 0);
1889
1890 p = le64toh(o->entry.items[i].object_offset);
de190aef 1891 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1892 if (r < 0)
1893 return r;
1894
de190aef
LP
1895 return link_entry_into_array_plus_one(f,
1896 &o->data.entry_offset,
1897 &o->data.entry_array_offset,
1898 &o->data.n_entries,
1899 offset);
cec736d2
LP
1900}
1901
1902static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
f6a0cfa5 1903 uint64_t n;
cec736d2
LP
1904 int r;
1905
1906 assert(f);
c88cc6af 1907 assert(f->header);
cec736d2
LP
1908 assert(o);
1909 assert(offset > 0);
b588975f
LP
1910
1911 if (o->object.type != OBJECT_ENTRY)
1912 return -EINVAL;
cec736d2 1913
b788cc23
LP
1914 __sync_synchronize();
1915
cec736d2 1916 /* Link up the entry itself */
de190aef
LP
1917 r = link_entry_into_array(f,
1918 &f->header->entry_array_offset,
1919 &f->header->n_entries,
1920 offset);
1921 if (r < 0)
1922 return r;
cec736d2 1923
507f22bd 1924 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1925
de190aef 1926 if (f->header->head_entry_realtime == 0)
0ac38b70 1927 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1928
0ac38b70 1929 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1930 f->header->tail_entry_monotonic = o->entry.monotonic;
1931
cec736d2
LP
1932 /* Link up the items */
1933 n = journal_file_entry_n_items(o);
f6a0cfa5 1934 for (uint64_t i = 0; i < n; i++) {
cec736d2
LP
1935 r = journal_file_link_entry_item(f, o, offset, i);
1936 if (r < 0)
1937 return r;
1938 }
1939
cec736d2
LP
1940 return 0;
1941}
1942
1943static int journal_file_append_entry_internal(
1944 JournalFile *f,
1945 const dual_timestamp *ts,
d180c349 1946 const sd_id128_t *boot_id,
cec736d2
LP
1947 uint64_t xor_hash,
1948 const EntryItem items[], unsigned n_items,
de190aef 1949 uint64_t *seqnum,
f4474e00 1950 Object **ret, uint64_t *ret_offset) {
cec736d2
LP
1951 uint64_t np;
1952 uint64_t osize;
1953 Object *o;
1954 int r;
1955
1956 assert(f);
c88cc6af 1957 assert(f->header);
cec736d2 1958 assert(items || n_items == 0);
de190aef 1959 assert(ts);
cec736d2
LP
1960
1961 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1962
de190aef 1963 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1964 if (r < 0)
1965 return r;
1966
d98cc1f2 1967 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1968 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1969 o->entry.realtime = htole64(ts->realtime);
1970 o->entry.monotonic = htole64(ts->monotonic);
cec736d2 1971 o->entry.xor_hash = htole64(xor_hash);
924426a7
CM
1972 if (boot_id)
1973 f->header->boot_id = *boot_id;
1974 o->entry.boot_id = f->header->boot_id;
cec736d2 1975
349cc4a5 1976#if HAVE_GCRYPT
5996c7c2 1977 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41 1978 if (r < 0)
b41b682b 1979 return r;
feb12d3e 1980#endif
b0af6f41 1981
cec736d2
LP
1982 r = journal_file_link_entry(f, o, np);
1983 if (r < 0)
b41b682b 1984 return r;
cec736d2
LP
1985
1986 if (ret)
1987 *ret = o;
1988
f4474e00
LP
1989 if (ret_offset)
1990 *ret_offset = np;
cec736d2 1991
0eaee828 1992 return r;
cec736d2
LP
1993}
1994
cf244689 1995void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1996 assert(f);
1997
c5236850
DT
1998 if (f->fd < 0)
1999 return;
2000
50f20cfd
LP
2001 /* inotify() does not receive IN_MODIFY events from file
2002 * accesses done via mmap(). After each access we hence
2003 * trigger IN_MODIFY by truncating the journal file to its
2004 * current size which triggers IN_MODIFY. */
2005
bc85bfee
LP
2006 __sync_synchronize();
2007
50f20cfd 2008 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 2009 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
2010}
2011
7a24f3bf
VC
2012static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
2013 assert(userdata);
2014
2015 journal_file_post_change(userdata);
2016
2017 return 1;
2018}
2019
2020static void schedule_post_change(JournalFile *f) {
b6cdfbe5 2021 int r;
7a24f3bf
VC
2022
2023 assert(f);
2024 assert(f->post_change_timer);
2025
b6cdfbe5 2026 r = sd_event_source_get_enabled(f->post_change_timer, NULL);
7a24f3bf 2027 if (r < 0) {
e167d7fd
LP
2028 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
2029 goto fail;
7a24f3bf 2030 }
b6cdfbe5 2031 if (r > 0)
7a24f3bf
VC
2032 return;
2033
39cf0351 2034 r = sd_event_source_set_time_relative(f->post_change_timer, f->post_change_timer_period);
7a24f3bf 2035 if (r < 0) {
e167d7fd
LP
2036 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
2037 goto fail;
7a24f3bf
VC
2038 }
2039
ca5d90d4 2040 r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
7a24f3bf 2041 if (r < 0) {
e167d7fd
LP
2042 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
2043 goto fail;
7a24f3bf 2044 }
e167d7fd
LP
2045
2046 return;
2047
2048fail:
2049 /* On failure, let's simply post the change immediately. */
2050 journal_file_post_change(f);
7a24f3bf
VC
2051}
2052
2053/* Enable coalesced change posting in a timer on the provided sd_event instance */
2054int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
2055 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
2056 int r;
2057
2058 assert(f);
2059 assert_return(!f->post_change_timer, -EINVAL);
2060 assert(e);
2061 assert(t);
2062
2063 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
2064 if (r < 0)
2065 return r;
2066
2067 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
2068 if (r < 0)
2069 return r;
2070
1cc6c93a 2071 f->post_change_timer = TAKE_PTR(timer);
7a24f3bf
VC
2072 f->post_change_timer_period = t;
2073
2074 return r;
2075}
2076
93bab288
YW
2077static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
2078 return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
1f2da9ec
LP
2079}
2080
d180c349
ZJS
2081int journal_file_append_entry(
2082 JournalFile *f,
2083 const dual_timestamp *ts,
2084 const sd_id128_t *boot_id,
2085 const struct iovec iovec[], unsigned n_iovec,
2086 uint64_t *seqnum,
f4474e00 2087 Object **ret, uint64_t *ret_offset) {
d180c349 2088
cec736d2
LP
2089 EntryItem *items;
2090 int r;
2091 uint64_t xor_hash = 0;
de190aef 2092 struct dual_timestamp _ts;
cec736d2
LP
2093
2094 assert(f);
c88cc6af 2095 assert(f->header);
bc6b326d 2096 assert(iovec && n_iovec > 0);
cec736d2 2097
c6273953 2098 if (ts) {
baaa35ad
ZJS
2099 if (!VALID_REALTIME(ts->realtime))
2100 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2101 "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
2102 ts->realtime);
2103 if (!VALID_MONOTONIC(ts->monotonic))
2104 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2105 "Invalid monotomic timestamp %" PRIu64 ", refusing entry.",
2106 ts->monotonic);
c6273953 2107 } else {
de190aef
LP
2108 dual_timestamp_get(&_ts);
2109 ts = &_ts;
2110 }
2111
349cc4a5 2112#if HAVE_GCRYPT
7560fffc
LP
2113 r = journal_file_maybe_append_tag(f, ts->realtime);
2114 if (r < 0)
2115 return r;
feb12d3e 2116#endif
7560fffc 2117
5222651e 2118 items = newa(EntryItem, n_iovec);
cec736d2 2119
86e68f38 2120 for (size_t i = 0; i < n_iovec; i++) {
cec736d2
LP
2121 uint64_t p;
2122 Object *o;
2123
2124 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2125 if (r < 0)
cf244689 2126 return r;
cec736d2 2127
4ce534f4
LP
2128 /* When calculating the XOR hash field, we need to take special care if the "keyed-hash"
2129 * journal file flag is on. We use the XOR hash field to quickly determine the identity of a
2130 * specific record, and give records with otherwise identical position (i.e. match in seqno,
2131 * timestamp, …) a stable ordering. But for that we can't have it that the hash of the
2132 * objects in each file is different since they are keyed. Hence let's calculate the Jenkins
2133 * hash here for that. This also has the benefit that cursors for old and new journal files
2134 * are completely identical (they include the XOR hash after all). For classic Jenkins-hash
2135 * files things are easier, we can just take the value from the stored record directly. */
2136
2137 if (JOURNAL_HEADER_KEYED_HASH(f->header))
2138 xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len);
2139 else
2140 xor_hash ^= le64toh(o->data.hash);
2141
d164ac77
DDM
2142 items[i] = (EntryItem) {
2143 .object_offset = htole64(p),
2144 .hash = o->data.hash,
2145 };
cec736d2
LP
2146 }
2147
1f2da9ec
LP
2148 /* Order by the position on disk, in order to improve seek
2149 * times for rotating media. */
93bab288 2150 typesafe_qsort(items, n_iovec, entry_item_cmp);
1f2da9ec 2151
f4474e00 2152 r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, ret_offset);
cec736d2 2153
fa6ac760
LP
2154 /* If the memory mapping triggered a SIGBUS then we return an
2155 * IO error and ignore the error code passed down to us, since
2156 * it is very likely just an effect of a nullified replacement
2157 * mapping page */
2158
c3bd54bf 2159 if (mmap_cache_fd_got_sigbus(f->cache_fd))
fa6ac760
LP
2160 r = -EIO;
2161
7a24f3bf
VC
2162 if (f->post_change_timer)
2163 schedule_post_change(f);
2164 else
2165 journal_file_post_change(f);
50f20cfd 2166
cec736d2
LP
2167 return r;
2168}
2169
/* One cached position inside a chain of entry arrays; chain_cache_put() keys these by 'first'. */
typedef struct ChainCacheItem {
        /* Offset of the array that starts the chain. */
        uint64_t first;
        /* Offset of the array this item remembers. */
        uint64_t array;
        /* First item stored in the remembered array. */
        uint64_t begin;
        /* Number of items in all arrays of the chain that precede the remembered one. */
        uint64_t total;
        /* Index looked at most recently, used to keep bisection local. */
        uint64_t last_index;
} ChainCacheItem;
2177
2178static void chain_cache_put(
4743015d 2179 OrderedHashmap *h,
a4bcff5b
LP
2180 ChainCacheItem *ci,
2181 uint64_t first,
2182 uint64_t array,
2183 uint64_t begin,
f268980d
LP
2184 uint64_t total,
2185 uint64_t last_index) {
a4bcff5b
LP
2186
2187 if (!ci) {
34741aa3
LP
2188 /* If the chain item to cache for this chain is the
2189 * first one it's not worth caching anything */
2190 if (array == first)
2191 return;
2192
29433089 2193 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2194 ci = ordered_hashmap_steal_first(h);
29433089
LP
2195 assert(ci);
2196 } else {
a4bcff5b
LP
2197 ci = new(ChainCacheItem, 1);
2198 if (!ci)
2199 return;
2200 }
2201
2202 ci->first = first;
2203
4743015d 2204 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2205 free(ci);
2206 return;
2207 }
2208 } else
2209 assert(ci->first == first);
2210
2211 ci->array = array;
2212 ci->begin = begin;
2213 ci->total = total;
f268980d 2214 ci->last_index = last_index;
a4bcff5b
LP
2215}
2216
f268980d
LP
2217static int generic_array_get(
2218 JournalFile *f,
2219 uint64_t first,
2220 uint64_t i,
f4474e00 2221 Object **ret, uint64_t *ret_offset) {
de190aef 2222
cec736d2 2223 Object *o;
a4bcff5b 2224 uint64_t p = 0, a, t = 0;
cec736d2 2225 int r;
a4bcff5b 2226 ChainCacheItem *ci;
cec736d2
LP
2227
2228 assert(f);
2229
de190aef 2230 a = first;
a4bcff5b
LP
2231
2232 /* Try the chain cache first */
4743015d 2233 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2234 if (ci && i > ci->total) {
2235 a = ci->array;
2236 i -= ci->total;
2237 t = ci->total;
2238 }
2239
de190aef 2240 while (a > 0) {
a4bcff5b 2241 uint64_t k;
cec736d2 2242
de190aef
LP
2243 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2244 if (r < 0)
2245 return r;
cec736d2 2246
a4bcff5b
LP
2247 k = journal_file_entry_array_n_items(o);
2248 if (i < k) {
de190aef 2249 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2250 goto found;
cec736d2
LP
2251 }
2252
a4bcff5b
LP
2253 i -= k;
2254 t += k;
de190aef
LP
2255 a = le64toh(o->entry_array.next_entry_array_offset);
2256 }
2257
a4bcff5b
LP
2258 return 0;
2259
2260found:
2261 /* Let's cache this item for the next invocation */
af13a6b0 2262 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2263
2264 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2265 if (r < 0)
2266 return r;
2267
2268 if (ret)
2269 *ret = o;
2270
f4474e00
LP
2271 if (ret_offset)
2272 *ret_offset = p;
de190aef
LP
2273
2274 return 1;
2275}
2276
f268980d
LP
2277static int generic_array_get_plus_one(
2278 JournalFile *f,
2279 uint64_t extra,
2280 uint64_t first,
2281 uint64_t i,
f4474e00 2282 Object **ret, uint64_t *ret_offset) {
de190aef
LP
2283
2284 Object *o;
2285
2286 assert(f);
2287
2288 if (i == 0) {
2289 int r;
2290
2291 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2292 if (r < 0)
2293 return r;
2294
de190aef
LP
2295 if (ret)
2296 *ret = o;
cec736d2 2297
f4474e00
LP
2298 if (ret_offset)
2299 *ret_offset = extra;
cec736d2 2300
de190aef 2301 return 1;
cec736d2
LP
2302 }
2303
f4474e00 2304 return generic_array_get(f, first, i-1, ret, ret_offset);
de190aef 2305}
cec736d2 2306
de190aef
LP
/* Results of the test_object callbacks used while bisecting. */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object is smaller than the needle */
        TEST_RIGHT = 2, /* the tested object is larger than the needle */
};
cec736d2 2312
f268980d
LP
2313static int generic_array_bisect(
2314 JournalFile *f,
2315 uint64_t first,
2316 uint64_t n,
2317 uint64_t needle,
2318 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2319 direction_t direction,
2320 Object **ret,
f4474e00
LP
2321 uint64_t *ret_offset,
2322 uint64_t *ret_idx) {
f268980d 2323
f5fbe71d 2324 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = UINT64_MAX;
de190aef
LP
2325 bool subtract_one = false;
2326 Object *o, *array = NULL;
2327 int r;
a4bcff5b 2328 ChainCacheItem *ci;
cec736d2 2329
de190aef
LP
2330 assert(f);
2331 assert(test_object);
cec736d2 2332
a4bcff5b 2333 /* Start with the first array in the chain */
de190aef 2334 a = first;
a4bcff5b 2335
4743015d 2336 ci = ordered_hashmap_get(f->chain_cache, &first);
96d4d024 2337 if (ci && n > ci->total && ci->begin != 0) {
a4bcff5b
LP
2338 /* Ah, we have iterated this bisection array chain
2339 * previously! Let's see if we can skip ahead in the
2340 * chain, as far as the last time. But we can't jump
2341 * backwards in the chain, so let's check that
2342 * first. */
2343
2344 r = test_object(f, ci->begin, needle);
2345 if (r < 0)
2346 return r;
2347
2348 if (r == TEST_LEFT) {
f268980d 2349 /* OK, what we are looking for is right of the
a4bcff5b
LP
2350 * begin of this EntryArray, so let's jump
2351 * straight to previously cached array in the
2352 * chain */
2353
2354 a = ci->array;
2355 n -= ci->total;
2356 t = ci->total;
f268980d 2357 last_index = ci->last_index;
a4bcff5b
LP
2358 }
2359 }
2360
de190aef
LP
2361 while (a > 0) {
2362 uint64_t left, right, k, lp;
2363
2364 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
2365 if (r < 0)
2366 return r;
2367
de190aef
LP
2368 k = journal_file_entry_array_n_items(array);
2369 right = MIN(k, n);
2370 if (right <= 0)
2371 return 0;
cec736d2 2372
de190aef
LP
2373 i = right - 1;
2374 lp = p = le64toh(array->entry_array.items[i]);
2375 if (p <= 0)
bee6a291
LP
2376 r = -EBADMSG;
2377 else
2378 r = test_object(f, p, needle);
2379 if (r == -EBADMSG) {
2380 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2381 n = i;
2382 continue;
2383 }
de190aef
LP
2384 if (r < 0)
2385 return r;
cec736d2 2386
de190aef
LP
2387 if (r == TEST_FOUND)
2388 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2389
2390 if (r == TEST_RIGHT) {
2391 left = 0;
2392 right -= 1;
f268980d 2393
f5fbe71d 2394 if (last_index != UINT64_MAX) {
f268980d
LP
2395 assert(last_index <= right);
2396
2397 /* If we cached the last index we
2398 * looked at, let's try to not to jump
2399 * too wildly around and see if we can
2400 * limit the range to look at early to
2401 * the immediate neighbors of the last
2402 * index we looked at. */
2403
2404 if (last_index > 0) {
2405 uint64_t x = last_index - 1;
2406
2407 p = le64toh(array->entry_array.items[x]);
2408 if (p <= 0)
2409 return -EBADMSG;
2410
2411 r = test_object(f, p, needle);
2412 if (r < 0)
2413 return r;
2414
2415 if (r == TEST_FOUND)
2416 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2417
2418 if (r == TEST_RIGHT)
2419 right = x;
2420 else
2421 left = x + 1;
2422 }
2423
2424 if (last_index < right) {
2425 uint64_t y = last_index + 1;
2426
2427 p = le64toh(array->entry_array.items[y]);
2428 if (p <= 0)
2429 return -EBADMSG;
2430
2431 r = test_object(f, p, needle);
2432 if (r < 0)
2433 return r;
2434
2435 if (r == TEST_FOUND)
2436 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2437
2438 if (r == TEST_RIGHT)
2439 right = y;
2440 else
2441 left = y + 1;
2442 }
f268980d
LP
2443 }
2444
de190aef
LP
2445 for (;;) {
2446 if (left == right) {
2447 if (direction == DIRECTION_UP)
2448 subtract_one = true;
2449
2450 i = left;
2451 goto found;
2452 }
2453
2454 assert(left < right);
de190aef 2455 i = (left + right) / 2;
f268980d 2456
de190aef
LP
2457 p = le64toh(array->entry_array.items[i]);
2458 if (p <= 0)
bee6a291
LP
2459 r = -EBADMSG;
2460 else
2461 r = test_object(f, p, needle);
2462 if (r == -EBADMSG) {
2463 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2464 right = n = i;
2465 continue;
2466 }
de190aef
LP
2467 if (r < 0)
2468 return r;
cec736d2 2469
de190aef
LP
2470 if (r == TEST_FOUND)
2471 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2472
2473 if (r == TEST_RIGHT)
2474 right = i;
2475 else
2476 left = i + 1;
2477 }
2478 }
2479
2173cbf8 2480 if (k >= n) {
cbdca852
LP
2481 if (direction == DIRECTION_UP) {
2482 i = n;
2483 subtract_one = true;
2484 goto found;
2485 }
2486
cec736d2 2487 return 0;
cbdca852 2488 }
cec736d2 2489
de190aef
LP
2490 last_p = lp;
2491
2492 n -= k;
2493 t += k;
f5fbe71d 2494 last_index = UINT64_MAX;
de190aef 2495 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2496 }
2497
2498 return 0;
de190aef
LP
2499
2500found:
2501 if (subtract_one && t == 0 && i == 0)
2502 return 0;
2503
a4bcff5b 2504 /* Let's cache this item for the next invocation */
f5fbe71d 2505 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);
a4bcff5b 2506
de190aef
LP
2507 if (subtract_one && i == 0)
2508 p = last_p;
2509 else if (subtract_one)
2510 p = le64toh(array->entry_array.items[i-1]);
2511 else
2512 p = le64toh(array->entry_array.items[i]);
2513
2514 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2515 if (r < 0)
2516 return r;
2517
2518 if (ret)
2519 *ret = o;
2520
f4474e00
LP
2521 if (ret_offset)
2522 *ret_offset = p;
de190aef 2523
f4474e00
LP
2524 if (ret_idx)
2525 *ret_idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2526
2527 return 1;
cec736d2
LP
2528}
2529
f268980d
LP
2530static int generic_array_bisect_plus_one(
2531 JournalFile *f,
2532 uint64_t extra,
2533 uint64_t first,
2534 uint64_t n,
2535 uint64_t needle,
2536 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2537 direction_t direction,
2538 Object **ret,
f4474e00
LP
2539 uint64_t *ret_offset,
2540 uint64_t *ret_idx) {
de190aef 2541
cec736d2 2542 int r;
cbdca852
LP
2543 bool step_back = false;
2544 Object *o;
cec736d2
LP
2545
2546 assert(f);
de190aef 2547 assert(test_object);
cec736d2 2548
de190aef
LP
2549 if (n <= 0)
2550 return 0;
cec736d2 2551
de190aef
LP
2552 /* This bisects the array in object 'first', but first checks
2553 * an extra */
de190aef
LP
2554 r = test_object(f, extra, needle);
2555 if (r < 0)
2556 return r;
a536e261
LP
2557
2558 if (r == TEST_FOUND)
2559 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2560
cbdca852
LP
2561 /* if we are looking with DIRECTION_UP then we need to first
2562 see if in the actual array there is a matching entry, and
2563 return the last one of that. But if there isn't any we need
2564 to return this one. Hence remember this, and return it
2565 below. */
2566 if (r == TEST_LEFT)
2567 step_back = direction == DIRECTION_UP;
de190aef 2568
cbdca852
LP
2569 if (r == TEST_RIGHT) {
2570 if (direction == DIRECTION_DOWN)
2571 goto found;
2572 else
2573 return 0;
a536e261 2574 }
cec736d2 2575
f4474e00 2576 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, ret_offset, ret_idx);
de190aef 2577
cbdca852
LP
2578 if (r == 0 && step_back)
2579 goto found;
2580
f4474e00
LP
2581 if (r > 0 && ret_idx)
2582 (*ret_idx)++;
de190aef
LP
2583
2584 return r;
cbdca852
LP
2585
2586found:
2587 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2588 if (r < 0)
2589 return r;
2590
2591 if (ret)
2592 *ret = o;
2593
f4474e00
LP
2594 if (ret_offset)
2595 *ret_offset = extra;
cbdca852 2596
f4474e00
LP
2597 if (ret_idx)
2598 *ret_idx = 0;
cbdca852
LP
2599
2600 return 1;
2601}
2602
44a6b1b6 2603_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2604 assert(f);
2605 assert(p > 0);
2606
2607 if (p == needle)
2608 return TEST_FOUND;
2609 else if (p < needle)
2610 return TEST_LEFT;
2611 else
2612 return TEST_RIGHT;
2613}
2614
de190aef 2615static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
893e0f8f 2616 uint64_t sq;
de190aef
LP
2617 Object *o;
2618 int r;
2619
2620 assert(f);
2621 assert(p > 0);
2622
2623 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2624 if (r < 0)
2625 return r;
2626
893e0f8f
LP
2627 sq = le64toh(READ_NOW(o->entry.seqnum));
2628 if (sq == needle)
de190aef 2629 return TEST_FOUND;
893e0f8f 2630 else if (sq < needle)
de190aef
LP
2631 return TEST_LEFT;
2632 else
2633 return TEST_RIGHT;
2634}
cec736d2 2635
de190aef
LP
2636int journal_file_move_to_entry_by_seqnum(
2637 JournalFile *f,
2638 uint64_t seqnum,
2639 direction_t direction,
2640 Object **ret,
f4474e00 2641 uint64_t *ret_offset) {
c88cc6af
VC
2642 assert(f);
2643 assert(f->header);
de190aef 2644
f4474e00
LP
2645 return generic_array_bisect(
2646 f,
2647 le64toh(f->header->entry_array_offset),
2648 le64toh(f->header->n_entries),
2649 seqnum,
2650 test_object_seqnum,
2651 direction,
2652 ret, ret_offset, NULL);
de190aef 2653}
cec736d2 2654
de190aef
LP
2655static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2656 Object *o;
893e0f8f 2657 uint64_t rt;
de190aef
LP
2658 int r;
2659
2660 assert(f);
2661 assert(p > 0);
2662
2663 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2664 if (r < 0)
2665 return r;
2666
893e0f8f
LP
2667 rt = le64toh(READ_NOW(o->entry.realtime));
2668 if (rt == needle)
de190aef 2669 return TEST_FOUND;
893e0f8f 2670 else if (rt < needle)
de190aef
LP
2671 return TEST_LEFT;
2672 else
2673 return TEST_RIGHT;
cec736d2
LP
2674}
2675
de190aef
LP
2676int journal_file_move_to_entry_by_realtime(
2677 JournalFile *f,
2678 uint64_t realtime,
2679 direction_t direction,
2680 Object **ret,
f4474e00 2681 uint64_t *ret_offset) {
c88cc6af
VC
2682 assert(f);
2683 assert(f->header);
de190aef 2684
f4474e00
LP
2685 return generic_array_bisect(
2686 f,
2687 le64toh(f->header->entry_array_offset),
2688 le64toh(f->header->n_entries),
2689 realtime,
2690 test_object_realtime,
2691 direction,
2692 ret, ret_offset, NULL);
de190aef
LP
2693}
2694
2695static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2696 Object *o;
893e0f8f 2697 uint64_t m;
de190aef
LP
2698 int r;
2699
2700 assert(f);
2701 assert(p > 0);
2702
2703 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2704 if (r < 0)
2705 return r;
2706
893e0f8f
LP
2707 m = le64toh(READ_NOW(o->entry.monotonic));
2708 if (m == needle)
de190aef 2709 return TEST_FOUND;
893e0f8f 2710 else if (m < needle)
de190aef
LP
2711 return TEST_LEFT;
2712 else
2713 return TEST_RIGHT;
2714}
2715
2a560338 2716static int find_data_object_by_boot_id(
47838ab3
ZJS
2717 JournalFile *f,
2718 sd_id128_t boot_id,
2719 Object **o,
2720 uint64_t *b) {
2a560338 2721
fbd0b64f 2722 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
47838ab3
ZJS
2723
2724 sd_id128_to_string(boot_id, t + 9);
2725 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2726}
2727
de190aef
LP
2728int journal_file_move_to_entry_by_monotonic(
2729 JournalFile *f,
2730 sd_id128_t boot_id,
2731 uint64_t monotonic,
2732 direction_t direction,
2733 Object **ret,
f4474e00 2734 uint64_t *ret_offset) {
de190aef 2735
de190aef
LP
2736 Object *o;
2737 int r;
2738
cbdca852 2739 assert(f);
de190aef 2740
47838ab3 2741 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2742 if (r < 0)
2743 return r;
cbdca852 2744 if (r == 0)
de190aef
LP
2745 return -ENOENT;
2746
f4474e00
LP
2747 return generic_array_bisect_plus_one(
2748 f,
2749 le64toh(o->data.entry_offset),
2750 le64toh(o->data.entry_array_offset),
2751 le64toh(o->data.n_entries),
2752 monotonic,
2753 test_object_monotonic,
2754 direction,
2755 ret, ret_offset, NULL);
de190aef
LP
2756}
2757
1fc605b0 2758void journal_file_reset_location(JournalFile *f) {
6573ef05 2759 f->location_type = LOCATION_HEAD;
1fc605b0 2760 f->current_offset = 0;
6573ef05
MS
2761 f->current_seqnum = 0;
2762 f->current_realtime = 0;
2763 f->current_monotonic = 0;
2764 zero(f->current_boot_id);
2765 f->current_xor_hash = 0;
2766}
2767
950c07d4 2768void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2769 f->location_type = LOCATION_SEEK;
2770 f->current_offset = offset;
2771 f->current_seqnum = le64toh(o->entry.seqnum);
2772 f->current_realtime = le64toh(o->entry.realtime);
2773 f->current_monotonic = le64toh(o->entry.monotonic);
2774 f->current_boot_id = o->entry.boot_id;
2775 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2776}
2777
d8ae66d7 2778int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
90c88092
YW
2779 int r;
2780
d8ae66d7 2781 assert(af);
c88cc6af 2782 assert(af->header);
d8ae66d7 2783 assert(bf);
c88cc6af 2784 assert(bf->header);
d8ae66d7
MS
2785 assert(af->location_type == LOCATION_SEEK);
2786 assert(bf->location_type == LOCATION_SEEK);
2787
b17f651a 2788 /* If contents, timestamps and seqnum match, these entries are
7802194a 2789 * identical. */
d8ae66d7
MS
2790 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2791 af->current_monotonic == bf->current_monotonic &&
2792 af->current_realtime == bf->current_realtime &&
b17f651a 2793 af->current_xor_hash == bf->current_xor_hash &&
2794 sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id) &&
2795 af->current_seqnum == bf->current_seqnum)
d8ae66d7
MS
2796 return 0;
2797
2798 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2799
2800 /* If this is from the same seqnum source, compare
2801 * seqnums */
90c88092
YW
2802 r = CMP(af->current_seqnum, bf->current_seqnum);
2803 if (r != 0)
2804 return r;
d8ae66d7
MS
2805
2806 /* Wow! This is weird, different data but the same
2807 * seqnums? Something is borked, but let's make the
2808 * best of it and compare by time. */
2809 }
2810
2811 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2812
2813 /* If the boot id matches, compare monotonic time */
90c88092
YW
2814 r = CMP(af->current_monotonic, bf->current_monotonic);
2815 if (r != 0)
2816 return r;
d8ae66d7
MS
2817 }
2818
2819 /* Otherwise, compare UTC time */
90c88092
YW
2820 r = CMP(af->current_realtime, bf->current_realtime);
2821 if (r != 0)
2822 return r;
d8ae66d7
MS
2823
2824 /* Finally, compare by contents */
6dd91b36 2825 return CMP(af->current_xor_hash, bf->current_xor_hash);
d8ae66d7
MS
2826}
2827
aa598ba5
LP
2828static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2829
2830 /* Increase or decrease the specified index, in the right direction. */
2831
2832 if (direction == DIRECTION_DOWN) {
2833 if (*i >= n - 1)
2834 return 0;
2835
2836 (*i) ++;
2837 } else {
2838 if (*i <= 0)
2839 return 0;
2840
2841 (*i) --;
2842 }
2843
2844 return 1;
2845}
2846
b6da4ed0
LP
2847static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2848
2849 /* Consider it an error if any of the two offsets is uninitialized */
2850 if (old_offset == 0 || new_offset == 0)
2851 return false;
2852
2853 /* If we go down, the new offset must be larger than the old one. */
2854 return direction == DIRECTION_DOWN ?
2855 new_offset > old_offset :
2856 new_offset < old_offset;
2857}
2858
de190aef
LP
2859int journal_file_next_entry(
2860 JournalFile *f,
f534928a 2861 uint64_t p,
de190aef 2862 direction_t direction,
f4474e00 2863 Object **ret, uint64_t *ret_offset) {
de190aef 2864
fb099c8d 2865 uint64_t i, n, ofs;
cec736d2
LP
2866 int r;
2867
2868 assert(f);
c88cc6af 2869 assert(f->header);
de190aef 2870
893e0f8f 2871 n = le64toh(READ_NOW(f->header->n_entries));
de190aef
LP
2872 if (n <= 0)
2873 return 0;
cec736d2 2874
f534928a 2875 if (p == 0)
de190aef 2876 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2877 else {
de190aef
LP
2878 r = generic_array_bisect(f,
2879 le64toh(f->header->entry_array_offset),
2880 le64toh(f->header->n_entries),
2881 p,
2882 test_object_offset,
2883 DIRECTION_DOWN,
2884 NULL, NULL,
2885 &i);
2886 if (r <= 0)
2887 return r;
2888
aa598ba5
LP
2889 r = bump_array_index(&i, direction, n);
2890 if (r <= 0)
2891 return r;
cec736d2
LP
2892 }
2893
de190aef 2894 /* And jump to it */
989793d3
LP
2895 for (;;) {
2896 r = generic_array_get(f,
2897 le64toh(f->header->entry_array_offset),
2898 i,
2899 ret, &ofs);
2900 if (r > 0)
2901 break;
2902 if (r != -EBADMSG)
2903 return r;
2904
2905 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2906 * the next one might work for us instead. */
2907 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2908
2909 r = bump_array_index(&i, direction, n);
2910 if (r <= 0)
2911 return r;
caeab8f6 2912 }
fb099c8d 2913
b6da4ed0 2914 /* Ensure our array is properly ordered. */
baaa35ad
ZJS
2915 if (p > 0 && !check_properly_ordered(ofs, p, direction))
2916 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2917 "%s: entry array not properly ordered at entry %" PRIu64,
2918 f->path, i);
fb099c8d 2919
f4474e00
LP
2920 if (ret_offset)
2921 *ret_offset = ofs;
fb099c8d
ZJS
2922
2923 return 1;
de190aef 2924}
cec736d2 2925
de190aef
LP
2926int journal_file_next_entry_for_data(
2927 JournalFile *f,
2928 Object *o, uint64_t p,
2929 uint64_t data_offset,
2930 direction_t direction,
f4474e00 2931 Object **ret, uint64_t *ret_offset) {
de190aef 2932
ded5034e 2933 uint64_t i, n, ofs;
de190aef 2934 Object *d;
989793d3 2935 int r;
cec736d2
LP
2936
2937 assert(f);
de190aef 2938 assert(p > 0 || !o);
cec736d2 2939
de190aef 2940 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2941 if (r < 0)
de190aef 2942 return r;
cec736d2 2943
893e0f8f 2944 n = le64toh(READ_NOW(d->data.n_entries));
de190aef
LP
2945 if (n <= 0)
2946 return n;
cec736d2 2947
de190aef
LP
2948 if (!o)
2949 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2950 else {
2951 if (o->object.type != OBJECT_ENTRY)
2952 return -EINVAL;
cec736d2 2953
de190aef
LP
2954 r = generic_array_bisect_plus_one(f,
2955 le64toh(d->data.entry_offset),
2956 le64toh(d->data.entry_array_offset),
2957 le64toh(d->data.n_entries),
2958 p,
2959 test_object_offset,
2960 DIRECTION_DOWN,
2961 NULL, NULL,
2962 &i);
2963
2964 if (r <= 0)
cec736d2
LP
2965 return r;
2966
aa598ba5
LP
2967 r = bump_array_index(&i, direction, n);
2968 if (r <= 0)
2969 return r;
de190aef 2970 }
cec736d2 2971
989793d3
LP
2972 for (;;) {
2973 r = generic_array_get_plus_one(f,
2974 le64toh(d->data.entry_offset),
2975 le64toh(d->data.entry_array_offset),
2976 i,
2977 ret, &ofs);
2978 if (r > 0)
2979 break;
2980 if (r != -EBADMSG)
2981 return r;
2982
2983 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2984
2985 r = bump_array_index(&i, direction, n);
2986 if (r <= 0)
2987 return r;
2988 }
ded5034e
LP
2989
2990 /* Ensure our array is properly ordered. */
baaa35ad
ZJS
2991 if (p > 0 && check_properly_ordered(ofs, p, direction))
2992 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2993 "%s data entry array not properly ordered at entry %" PRIu64,
2994 f->path, i);
ded5034e 2995
f4474e00
LP
2996 if (ret_offset)
2997 *ret_offset = ofs;
ded5034e
LP
2998
2999 return 1;
de190aef 3000}
cec736d2 3001
cbdca852
LP
3002int journal_file_move_to_entry_by_offset_for_data(
3003 JournalFile *f,
3004 uint64_t data_offset,
3005 uint64_t p,
3006 direction_t direction,
f4474e00 3007 Object **ret, uint64_t *ret_offset) {
cbdca852
LP
3008
3009 int r;
3010 Object *d;
3011
3012 assert(f);
3013
3014 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3015 if (r < 0)
3016 return r;
3017
f4474e00
LP
3018 return generic_array_bisect_plus_one(
3019 f,
3020 le64toh(d->data.entry_offset),
3021 le64toh(d->data.entry_array_offset),
3022 le64toh(d->data.n_entries),
3023 p,
3024 test_object_offset,
3025 direction,
3026 ret, ret_offset, NULL);
cbdca852
LP
3027}
3028
3029int journal_file_move_to_entry_by_monotonic_for_data(
3030 JournalFile *f,
3031 uint64_t data_offset,
3032 sd_id128_t boot_id,
3033 uint64_t monotonic,
3034 direction_t direction,
f4474e00 3035 Object **ret, uint64_t *ret_offset) {
cbdca852 3036
cbdca852
LP
3037 Object *o, *d;
3038 int r;
3039 uint64_t b, z;
3040
3041 assert(f);
3042
3043 /* First, seek by time */
47838ab3 3044 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
3045 if (r < 0)
3046 return r;
3047 if (r == 0)
3048 return -ENOENT;
3049
3050 r = generic_array_bisect_plus_one(f,
3051 le64toh(o->data.entry_offset),
3052 le64toh(o->data.entry_array_offset),
3053 le64toh(o->data.n_entries),
3054 monotonic,
3055 test_object_monotonic,
3056 direction,
3057 NULL, &z, NULL);
3058 if (r <= 0)
3059 return r;
3060
3061 /* And now, continue seeking until we find an entry that
3062 * exists in both bisection arrays */
3063
3064 for (;;) {
3065 Object *qo;
3066 uint64_t p, q;
3067
3068 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3069 if (r < 0)
3070 return r;
3071
3072 r = generic_array_bisect_plus_one(f,
3073 le64toh(d->data.entry_offset),
3074 le64toh(d->data.entry_array_offset),
3075 le64toh(d->data.n_entries),
3076 z,
3077 test_object_offset,
3078 direction,
3079 NULL, &p, NULL);
3080 if (r <= 0)
3081 return r;
3082
3083 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
3084 if (r < 0)
3085 return r;
3086
3087 r = generic_array_bisect_plus_one(f,
3088 le64toh(o->data.entry_offset),
3089 le64toh(o->data.entry_array_offset),
3090 le64toh(o->data.n_entries),
3091 p,
3092 test_object_offset,
3093 direction,
3094 &qo, &q, NULL);
3095
3096 if (r <= 0)
3097 return r;
3098
3099 if (p == q) {
3100 if (ret)
3101 *ret = qo;
f4474e00
LP
3102 if (ret_offset)
3103 *ret_offset = q;
cbdca852
LP
3104
3105 return 1;
3106 }
3107
3108 z = q;
3109 }
cbdca852
LP
3110}
3111
de190aef
LP
3112int journal_file_move_to_entry_by_seqnum_for_data(
3113 JournalFile *f,
3114 uint64_t data_offset,
3115 uint64_t seqnum,
3116 direction_t direction,
f4474e00 3117 Object **ret, uint64_t *ret_offset) {
cec736d2 3118
de190aef
LP
3119 Object *d;
3120 int r;
cec736d2 3121
91a31dde
LP
3122 assert(f);
3123
de190aef 3124 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 3125 if (r < 0)
de190aef 3126 return r;
cec736d2 3127
f4474e00
LP
3128 return generic_array_bisect_plus_one(
3129 f,
3130 le64toh(d->data.entry_offset),
3131 le64toh(d->data.entry_array_offset),
3132 le64toh(d->data.n_entries),
3133 seqnum,
3134 test_object_seqnum,
3135 direction,
3136 ret, ret_offset, NULL);
de190aef 3137}
cec736d2 3138
de190aef
LP
3139int journal_file_move_to_entry_by_realtime_for_data(
3140 JournalFile *f,
3141 uint64_t data_offset,
3142 uint64_t realtime,
3143 direction_t direction,
f4474e00 3144 Object **ret, uint64_t *ret_offset) {
de190aef
LP
3145
3146 Object *d;
3147 int r;
3148
91a31dde
LP
3149 assert(f);
3150
de190aef 3151 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 3152 if (r < 0)
de190aef
LP
3153 return r;
3154
f4474e00
LP
3155 return generic_array_bisect_plus_one(
3156 f,
3157 le64toh(d->data.entry_offset),
3158 le64toh(d->data.entry_array_offset),
3159 le64toh(d->data.n_entries),
3160 realtime,
3161 test_object_realtime,
3162 direction,
3163 ret, ret_offset, NULL);
cec736d2
LP
3164}
3165
0284adc6 3166void journal_file_dump(JournalFile *f) {
7560fffc 3167 Object *o;
7560fffc 3168 int r;
0284adc6 3169 uint64_t p;
7560fffc
LP
3170
3171 assert(f);
c88cc6af 3172 assert(f->header);
7560fffc 3173
0284adc6 3174 journal_file_print_header(f);
7560fffc 3175
893e0f8f 3176 p = le64toh(READ_NOW(f->header->header_size));
0284adc6 3177 while (p != 0) {
363b2b9a
DDM
3178 const char *s;
3179
d05089d8 3180 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
3181 if (r < 0)
3182 goto fail;
7560fffc 3183
363b2b9a 3184 s = journal_object_type_to_string(o->object.type);
7560fffc 3185
363b2b9a 3186 switch (o->object.type) {
3c1668da 3187
0284adc6 3188 case OBJECT_ENTRY:
363b2b9a
DDM
3189 assert(s);
3190
3191 printf("Type: %s seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3192 s,
507f22bd
ZJS
3193 le64toh(o->entry.seqnum),
3194 le64toh(o->entry.monotonic),
3195 le64toh(o->entry.realtime));
0284adc6 3196 break;
7560fffc 3197
0284adc6 3198 case OBJECT_TAG:
363b2b9a
DDM
3199 assert(s);
3200
3201 printf("Type: %s seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3202 s,
507f22bd
ZJS
3203 le64toh(o->tag.seqnum),
3204 le64toh(o->tag.epoch));
0284adc6 3205 break;
3c1668da
LP
3206
3207 default:
363b2b9a
DDM
3208 if (s)
3209 printf("Type: %s \n", s);
3210 else
3211 printf("Type: unknown (%i)", o->object.type);
3212
3c1668da 3213 break;
0284adc6 3214 }
7560fffc 3215
d89c8fdf
ZJS
3216 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3217 printf("Flags: %s\n",
3218 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3219
0284adc6
LP
3220 if (p == le64toh(f->header->tail_object_offset))
3221 p = 0;
3222 else
71139898 3223 p += ALIGN64(le64toh(o->object.size));
0284adc6 3224 }
7560fffc 3225
0284adc6
LP
3226 return;
3227fail:
3228 log_error("File corrupt");
7560fffc
LP
3229}
3230
5e62ac8b
ZJS
3231/* Note: the lifetime of the compound literal is the immediately surrounding block. */
3232#define FORMAT_TIMESTAMP_SAFE(t) (FORMAT_TIMESTAMP(t) ?: " --- ")
718fe4b1 3233
0284adc6 3234void journal_file_print_header(JournalFile *f) {
a1a03e30 3235 struct stat st;
7560fffc
LP
3236
3237 assert(f);
c88cc6af 3238 assert(f->header);
7560fffc 3239
2c54acb1 3240 printf("File path: %s\n"
0284adc6
LP
3241 "File ID: %s\n"
3242 "Machine ID: %s\n"
3243 "Boot ID: %s\n"
2c54acb1 3244 "Sequential number ID: %s\n"
0284adc6 3245 "State: %s\n"
2c54acb1 3246 "Compatible flags:%s%s\n"
8653185a 3247 "Incompatible flags:%s%s%s%s%s\n"
507f22bd
ZJS
3248 "Header size: %"PRIu64"\n"
3249 "Arena size: %"PRIu64"\n"
2c54acb1
TN
3250 "Data hash table size: %"PRIu64"\n"
3251 "Field hash table size: %"PRIu64"\n"
3252 "Rotate suggested: %s\n"
3253 "Head sequential number: %"PRIu64" (%"PRIx64")\n"
3254 "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
3255 "Head realtime timestamp: %s (%"PRIx64")\n"
3256 "Tail realtime timestamp: %s (%"PRIx64")\n"
3257 "Tail monotonic timestamp: %s (%"PRIx64")\n"
507f22bd 3258 "Objects: %"PRIu64"\n"
2c54acb1 3259 "Entry objects: %"PRIu64"\n",
0284adc6 3260 f->path,
85b55869
LP
3261 SD_ID128_TO_STRING(f->header->file_id),
3262 SD_ID128_TO_STRING(f->header->machine_id),
3263 SD_ID128_TO_STRING(f->header->boot_id),
3264 SD_ID128_TO_STRING(f->header->seqnum_id),
3223f44f
LP
3265 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3266 f->header->state == STATE_ONLINE ? "ONLINE" :
3267 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 3268 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
3269 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3270 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3271 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
8653185a 3272 JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "",
4ce534f4 3273 JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "",
d89c8fdf 3274 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
3275 le64toh(f->header->header_size),
3276 le64toh(f->header->arena_size),
3277 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3278 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
c8e6e1f1 3279 yes_no(journal_file_rotate_suggested(f, 0, LOG_DEBUG)),
0808b92f
LP
3280 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3281 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
5e62ac8b
ZJS
3282 FORMAT_TIMESTAMP_SAFE(le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3283 FORMAT_TIMESTAMP_SAFE(le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
5291f26d 3284 FORMAT_TIMESPAN(le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
507f22bd
ZJS
3285 le64toh(f->header->n_objects),
3286 le64toh(f->header->n_entries));
7560fffc 3287
0284adc6 3288 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2c54acb1
TN
3289 printf("Data objects: %"PRIu64"\n"
3290 "Data hash table fill: %.1f%%\n",
507f22bd 3291 le64toh(f->header->n_data),
0284adc6 3292 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 3293
0284adc6 3294 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2c54acb1
TN
3295 printf("Field objects: %"PRIu64"\n"
3296 "Field hash table fill: %.1f%%\n",
507f22bd 3297 le64toh(f->header->n_fields),
0284adc6 3298 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
3299
3300 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2c54acb1 3301 printf("Tag objects: %"PRIu64"\n",
507f22bd 3302 le64toh(f->header->n_tags));
3223f44f 3303 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2c54acb1 3304 printf("Entry array objects: %"PRIu64"\n",
507f22bd 3305 le64toh(f->header->n_entry_arrays));
a1a03e30 3306
0dbe57ee
LP
3307 if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth))
3308 printf("Deepest field hash chain: %" PRIu64"\n",
3309 f->header->field_hash_chain_depth);
3310
3311 if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth))
3312 printf("Deepest data hash chain: %" PRIu64"\n",
3313 f->header->data_hash_chain_depth);
3314
a1a03e30 3315 if (fstat(f->fd, &st) >= 0)
2b59bf51 3316 printf("Disk usage: %s\n", FORMAT_BYTES((uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
3317}
3318
fc68c929
LP
3319static int journal_file_warn_btrfs(JournalFile *f) {
3320 unsigned attrs;
3321 int r;
3322
3323 assert(f);
3324
3325 /* Before we write anything, check if the COW logic is turned
3326 * off on btrfs. Given our write pattern that is quite
3327 * unfriendly to COW file systems this should greatly improve
3328 * performance on COW file systems, such as btrfs, at the
3329 * expense of data integrity features (which shouldn't be too
3330 * bad, given that we do our own checksumming). */
3331
65ddc2c5 3332 r = fd_is_fs_type(f->fd, BTRFS_SUPER_MAGIC);
fc68c929
LP
3333 if (r < 0)
3334 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3335 if (!r)
3336 return 0;
3337
3338 r = read_attr_fd(f->fd, &attrs);
3339 if (r < 0)
3340 return log_warning_errno(r, "Failed to read file attributes: %m");
3341
3342 if (attrs & FS_NOCOW_FL) {
3343 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3344 return 0;
3345 }
3346
3347 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3348 "This is likely to slow down journal access substantially, please consider turning "
3349 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3350
3351 return 1;
3352}
3353
0284adc6 3354int journal_file_open(
5d1ce257 3355 int fd,
0284adc6
LP
3356 const char *fname,
3357 int flags,
3358 mode_t mode,
3359 bool compress,
57850536 3360 uint64_t compress_threshold_bytes,
baed47c3 3361 bool seal,
0284adc6
LP
3362 JournalMetrics *metrics,
3363 MMapCache *mmap_cache,
b58c888f 3364 Set *deferred_closes,
0284adc6
LP
3365 JournalFile *template,
3366 JournalFile **ret) {
7560fffc 3367
fa6ac760 3368 bool newly_created = false;
0284adc6 3369 JournalFile *f;
fa6ac760 3370 void *h;
0284adc6 3371 int r;
7560fffc 3372
0559d3a5 3373 assert(ret);
5d1ce257 3374 assert(fd >= 0 || fname);
7560fffc 3375
ec2ce0c5 3376 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3377 return -EINVAL;
7560fffc 3378
6eda13d3
LP
3379 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3380 return -EINVAL;
7560fffc 3381
971b52c4 3382 f = new(JournalFile, 1);
0284adc6
LP
3383 if (!f)
3384 return -ENOMEM;
7560fffc 3385
971b52c4
LP
3386 *f = (JournalFile) {
3387 .fd = fd,
3388 .mode = mode,
3389
3390 .flags = flags,
971b52c4 3391 .writable = (flags & O_ACCMODE) != O_RDONLY,
7560fffc 3392
8653185a
LP
3393#if HAVE_ZSTD
3394 .compress_zstd = compress,
3395#elif HAVE_LZ4
971b52c4 3396 .compress_lz4 = compress,
349cc4a5 3397#elif HAVE_XZ
971b52c4 3398 .compress_xz = compress,
48b61739 3399#endif
f5fbe71d 3400 .compress_threshold_bytes = compress_threshold_bytes == UINT64_MAX ?
971b52c4
LP
3401 DEFAULT_COMPRESS_THRESHOLD :
3402 MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
349cc4a5 3403#if HAVE_GCRYPT
971b52c4 3404 .seal = seal,
49a32d43 3405#endif
971b52c4 3406 };
7560fffc 3407
4ce534f4
LP
3408 /* We turn on keyed hashes by default, but provide an environment variable to turn them off, if
3409 * people really want that */
3410 r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH");
3411 if (r < 0) {
3412 if (r != -ENXIO)
3413 log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring.");
3414 f->keyed_hash = true;
3415 } else
3416 f->keyed_hash = r;
3417
170a434c 3418 if (DEBUG_LOGGING) {
4ce534f4 3419 static int last_seal = -1, last_compress = -1, last_keyed_hash = -1;
170a434c 3420 static uint64_t last_bytes = UINT64_MAX;
170a434c
ZJS
3421
3422 if (last_seal != f->seal ||
4ce534f4 3423 last_keyed_hash != f->keyed_hash ||
170a434c
ZJS
3424 last_compress != JOURNAL_FILE_COMPRESS(f) ||
3425 last_bytes != f->compress_threshold_bytes) {
3426
4ce534f4
LP
3427 log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s",
3428 yes_no(f->seal), yes_no(f->keyed_hash), yes_no(JOURNAL_FILE_COMPRESS(f)),
2b59bf51 3429 FORMAT_BYTES(f->compress_threshold_bytes));
170a434c 3430 last_seal = f->seal;
4ce534f4 3431 last_keyed_hash = f->keyed_hash;
170a434c
ZJS
3432 last_compress = JOURNAL_FILE_COMPRESS(f);
3433 last_bytes = f->compress_threshold_bytes;
3434 }
3435 }
57850536 3436
0284adc6
LP
3437 if (mmap_cache)
3438 f->mmap = mmap_cache_ref(mmap_cache);
3439 else {
84168d80 3440 f->mmap = mmap_cache_new();
0284adc6
LP
3441 if (!f->mmap) {
3442 r = -ENOMEM;
3443 goto fail;
3444 }
3445 }
7560fffc 3446
7645c77b 3447 if (fname) {
5d1ce257 3448 f->path = strdup(fname);
7645c77b
ZJS
3449 if (!f->path) {
3450 r = -ENOMEM;
3451 goto fail;
3452 }
3453 } else {
817b1c5b
LP
3454 assert(fd >= 0);
3455
7645c77b
ZJS
3456 /* If we don't know the path, fill in something explanatory and vaguely useful */
3457 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3458 r = -ENOMEM;
3459 goto fail;
3460 }
0284adc6 3461 }
7560fffc 3462
4743015d 3463 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3464 if (!f->chain_cache) {
3465 r = -ENOMEM;
3466 goto fail;
3467 }
3468
0284adc6 3469 if (f->fd < 0) {
817b1c5b
LP
3470 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3471 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3472 * it doesn't hurt in that case. */
3473
3474 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
5d1ce257
LP
3475 if (f->fd < 0) {
3476 r = -errno;
3477 goto fail;
3478 }
3479
3480 /* fds we opened here by us should also be closed by us. */
3481 f->close_fd = true;
817b1c5b
LP
3482
3483 r = fd_nonblock(f->fd, false);
3484 if (r < 0)
3485 goto fail;
7560fffc 3486 }
7560fffc 3487
104fc4be 3488 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd, prot_from_flags(flags));
be7cdd8e
VC
3489 if (!f->cache_fd) {
3490 r = -ENOMEM;
3491 goto fail;
3492 }
3493
2678031a
LP
3494 r = journal_file_fstat(f);
3495 if (r < 0)
0284adc6 3496 goto fail;
7560fffc 3497
0284adc6 3498 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3499
fc68c929 3500 (void) journal_file_warn_btrfs(f);
11689d2a 3501
4c2e1b39
LP
3502 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3503 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3504 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3505 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3506 * solely on mtime/atime/ctime of the file. */
3507 (void) fd_setcrtime(f->fd, 0);
7560fffc 3508
349cc4a5 3509#if HAVE_GCRYPT
0284adc6 3510 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3511 * just don't do sealing */
49a32d43
LP
3512 if (f->seal) {
3513 r = journal_file_fss_load(f);
3514 if (r < 0)
3515 f->seal = false;
3516 }
feb12d3e 3517#endif
7560fffc 3518
0284adc6
LP
3519 r = journal_file_init_header(f, template);
3520 if (r < 0)
3521 goto fail;
7560fffc 3522
2678031a
LP
3523 r = journal_file_fstat(f);
3524 if (r < 0)
0284adc6 3525 goto fail;
fb0951b0
LP
3526
3527 newly_created = true;
0284adc6 3528 }
7560fffc 3529
0284adc6 3530 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3531 r = -ENODATA;
0284adc6
LP
3532 goto fail;
3533 }
7560fffc 3534
c3bd54bf 3535 r = mmap_cache_fd_get(f->cache_fd, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
5087825e
LP
3536 if (r == -EINVAL) {
3537 /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
3538 * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
3539 * code. */
3540 r = -EAFNOSUPPORT;
3541 goto fail;
3542 }
977eaa1e 3543 if (r < 0)
0284adc6 3544 goto fail;
7560fffc 3545
fa6ac760
LP
3546 f->header = h;
3547
0284adc6 3548 if (!newly_created) {
f9168190 3549 set_clear_with_destructor(deferred_closes, journal_file_close);
b58c888f 3550
0284adc6
LP
3551 r = journal_file_verify_header(f);
3552 if (r < 0)
3553 goto fail;
3554 }
7560fffc 3555
349cc4a5 3556#if HAVE_GCRYPT
0284adc6 3557 if (!newly_created && f->writable) {
baed47c3 3558 r = journal_file_fss_load(f);
0284adc6
LP
3559 if (r < 0)
3560 goto fail;
3561 }
feb12d3e 3562#endif
cec736d2
LP
3563
3564 if (f->writable) {
4a92baf3
LP
3565 if (metrics) {
3566 journal_default_metrics(metrics, f->fd);
3567 f->metrics = *metrics;
3568 } else if (template)
3569 f->metrics = template->metrics;
3570
cec736d2
LP
3571 r = journal_file_refresh_header(f);
3572 if (r < 0)
3573 goto fail;
3574 }
3575
349cc4a5 3576#if HAVE_GCRYPT
baed47c3 3577 r = journal_file_hmac_setup(f);
14d10188
LP
3578 if (r < 0)
3579 goto fail;
feb12d3e 3580#endif
14d10188 3581
cec736d2 3582 if (newly_created) {
de190aef 3583 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3584 if (r < 0)
3585 goto fail;
3586
de190aef 3587 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3588 if (r < 0)
3589 goto fail;
7560fffc 3590
349cc4a5 3591#if HAVE_GCRYPT
7560fffc
LP
3592 r = journal_file_append_first_tag(f);
3593 if (r < 0)
3594 goto fail;
feb12d3e 3595#endif
cec736d2
LP
3596 }
3597
c3bd54bf 3598 if (mmap_cache_fd_got_sigbus(f->cache_fd)) {
fa6ac760
LP
3599 r = -EIO;
3600 goto fail;
3601 }
3602
7a24f3bf 3603 if (template && template->post_change_timer) {
e167d7fd
LP
3604 r = journal_file_enable_post_change_timer(
3605 f,
3606 sd_event_source_get_event(template->post_change_timer),
3607 template->post_change_timer_period);
7a24f3bf 3608
7a24f3bf
VC
3609 if (r < 0)
3610 goto fail;
3611 }
3612
f8e2f4d6 3613 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3614 f->close_fd = true;
3615
0559d3a5 3616 *ret = f;
cec736d2
LP
3617 return 0;
3618
3619fail:
c3bd54bf 3620 if (f->cache_fd && mmap_cache_fd_got_sigbus(f->cache_fd))
fa6ac760
LP
3621 r = -EIO;
3622
69a3a6fd 3623 (void) journal_file_close(f);
cec736d2
LP
3624
3625 return r;
3626}
0ac38b70 3627
7a4d21ad 3628int journal_file_archive(JournalFile *f) {
57535f47 3629 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3630
3631 assert(f);
0ac38b70 3632
7a4d21ad 3633 if (!f->writable)
0ac38b70
LP
3634 return -EINVAL;
3635
5d1ce257 3636 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3637 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
7a4d21ad 3638 if (path_startswith(f->path, "/proc/self/fd"))
5d1ce257
LP
3639 return -EINVAL;
3640
7a4d21ad 3641 if (!endswith(f->path, ".journal"))
0ac38b70
LP
3642 return -EINVAL;
3643
7a4d21ad
LP
3644 if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3645 (int) strlen(f->path) - 8, f->path,
3646 SD_ID128_FORMAT_VAL(f->header->seqnum_id),
3647 le64toh(f->header->head_entry_seqnum),
3648 le64toh(f->header->head_entry_realtime)) < 0)
0ac38b70
LP
3649 return -ENOMEM;
3650
7a4d21ad
LP
3651 /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
3652 * ignore that case. */
3653 if (rename(f->path, p) < 0 && errno != ENOENT)
0ac38b70
LP
3654 return -errno;
3655
1fcefd88 3656 /* Sync the rename to disk */
7a4d21ad
LP
3657 (void) fsync_directory_of_file(f->fd);
3658
3659 /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
3660 * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
3661 * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
3662 * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
3663 * occurs. */
3664 f->archive = true;
3665
3666 /* Currently, btrfs is not very good with out write patterns and fragments heavily. Let's defrag our journal
3667 * files when we archive them */
3668 f->defrag_on_close = true;
3669
3670 return 0;
3671}
3672
3673JournalFile* journal_initiate_close(
3674 JournalFile *f,
3675 Set *deferred_closes) {
3676
3677 int r;
3678
3679 assert(f);
3680
3681 if (deferred_closes) {
0ac38b70 3682
7a4d21ad
LP
3683 r = set_put(deferred_closes, f);
3684 if (r < 0)
3685 log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
3686 else {
3687 (void) journal_file_set_offline(f, false);
3688 return NULL;
3689 }
3690 }
3691
3692 return journal_file_close(f);
3693}
3694
3695int journal_file_rotate(
3696 JournalFile **f,
3697 bool compress,
3698 uint64_t compress_threshold_bytes,
3699 bool seal,
3700 Set *deferred_closes) {
3701
3702 JournalFile *new_file = NULL;
3703 int r;
3704
3705 assert(f);
3706 assert(*f);
3707
3708 r = journal_file_archive(*f);
3709 if (r < 0)
3710 return r;
3711
3712 r = journal_file_open(
3713 -1,
3714 (*f)->path,
3715 (*f)->flags,
3716 (*f)->mode,
3717 compress,
3718 compress_threshold_bytes,
3719 seal,
3720 NULL, /* metrics */
3721 (*f)->mmap,
3722 deferred_closes,
3723 *f, /* template */
3724 &new_file);
3725
3726 journal_initiate_close(*f, deferred_closes);
0ac38b70 3727 *f = new_file;
7a4d21ad 3728
0ac38b70
LP
3729 return r;
3730}
3731
68127658
LP
3732int journal_file_dispose(int dir_fd, const char *fname) {
3733 _cleanup_free_ char *p = NULL;
3734 _cleanup_close_ int fd = -1;
3735
3736 assert(fname);
3737
24ee0f9d 3738 /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note that
68127658
LP
3739 * this is done without looking into the file or changing any of its contents. The idea is that this is called
3740 * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
3741 * for writing anymore. */
3742
3743 if (!endswith(fname, ".journal"))
3744 return -EINVAL;
3745
3746 if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
3747 (int) strlen(fname) - 8, fname,
3748 now(CLOCK_REALTIME),
3749 random_u64()) < 0)
3750 return -ENOMEM;
3751
3752 if (renameat(dir_fd, fname, dir_fd, p) < 0)
3753 return -errno;
3754
3755 /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
3756 fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
3757 if (fd < 0)
3758 log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
3759 else {
3760 (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
3761 (void) btrfs_defrag_fd(fd);
3762 }
3763
3764 return 0;
3765}
3766
9447a7f1
LP
3767int journal_file_open_reliably(
3768 const char *fname,
3769 int flags,
3770 mode_t mode,
7560fffc 3771 bool compress,
57850536 3772 uint64_t compress_threshold_bytes,
baed47c3 3773 bool seal,
4a92baf3 3774 JournalMetrics *metrics,
27370278 3775 MMapCache *mmap_cache,
b58c888f 3776 Set *deferred_closes,
9447a7f1
LP
3777 JournalFile *template,
3778 JournalFile **ret) {
3779
68127658 3780 int r;
9447a7f1 3781
57850536
AG
3782 r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3783 deferred_closes, template, ret);
288359db 3784 if (!IN_SET(r,
b288cdeb
ZJS
3785 -EBADMSG, /* Corrupted */
3786 -ENODATA, /* Truncated */
3787 -EHOSTDOWN, /* Other machine */
3788 -EPROTONOSUPPORT, /* Incompatible feature */
3789 -EBUSY, /* Unclean shutdown */
3790 -ESHUTDOWN, /* Already archived */
288359db 3791 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3792 -EIDRM, /* File has been deleted */
3793 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3794 return r;
3795
3796 if ((flags & O_ACCMODE) == O_RDONLY)
3797 return r;
3798
3799 if (!(flags & O_CREAT))
3800 return r;
3801
7560fffc
LP
3802 if (!endswith(fname, ".journal"))
3803 return r;
3804
5c70eab4 3805 /* The file is corrupted. Rotate it away and try it again (but only once) */
65089b82 3806 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3807
68127658
LP
3808 r = journal_file_dispose(AT_FDCWD, fname);
3809 if (r < 0)
3810 return r;
3811
57850536
AG
3812 return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3813 deferred_closes, template, ret);
9447a7f1
LP
3814}
3815
5a271b08 3816int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
f6a0cfa5 3817 uint64_t q, n, xor_hash = 0;
d180c349 3818 const sd_id128_t *boot_id;
f6a0cfa5
SS
3819 dual_timestamp ts;
3820 EntryItem *items;
3821 int r;
cf244689
LP
3822
3823 assert(from);
3824 assert(to);
3825 assert(o);
3826 assert(p);
3827
3828 if (!to->writable)
3829 return -EPERM;
3830
d164ac77
DDM
3831 ts = (dual_timestamp) {
3832 .monotonic = le64toh(o->entry.monotonic),
3833 .realtime = le64toh(o->entry.realtime),
3834 };
d180c349 3835 boot_id = &o->entry.boot_id;
cf244689 3836
cf244689 3837 n = journal_file_entry_n_items(o);
5222651e 3838 items = newa(EntryItem, n);
cf244689 3839
f6a0cfa5 3840 for (uint64_t i = 0; i < n; i++) {
4fd052ae
FC
3841 uint64_t l, h;
3842 le64_t le_hash;
cf244689
LP
3843 size_t t;
3844 void *data;
3845 Object *u;
3846
3847 q = le64toh(o->entry.items[i].object_offset);
3848 le_hash = o->entry.items[i].hash;
3849
3850 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3851 if (r < 0)
3852 return r;
3853
3854 if (le_hash != o->data.hash)
3855 return -EBADMSG;
3856
893e0f8f
LP
3857 l = le64toh(READ_NOW(o->object.size));
3858 if (l < offsetof(Object, data.payload))
3859 return -EBADMSG;
3860
3861 l -= offsetof(Object, data.payload);
cf244689
LP
3862 t = (size_t) l;
3863
3864 /* We hit the limit on 32bit machines */
3865 if ((uint64_t) t != l)
3866 return -E2BIG;
3867
d89c8fdf 3868 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
d80b051c 3869#if HAVE_COMPRESSION
a7f7d1bd 3870 size_t rsize = 0;
cf244689 3871
319a4f4b
LP
3872 r = decompress_blob(
3873 o->object.flags & OBJECT_COMPRESSION_MASK,
3874 o->data.payload, l,
3875 &from->compress_buffer, &rsize,
3876 0);
d89c8fdf
ZJS
3877 if (r < 0)
3878 return r;
cf244689
LP
3879
3880 data = from->compress_buffer;
3881 l = rsize;
3b1a55e1
ZJS
3882#else
3883 return -EPROTONOSUPPORT;
3884#endif
cf244689
LP
3885 } else
3886 data = o->data.payload;
3887
bc6b326d
DDM
3888 if (l == 0)
3889 return -EBADMSG;
3890
cf244689
LP
3891 r = journal_file_append_data(to, data, l, &u, &h);
3892 if (r < 0)
3893 return r;
3894
4ce534f4
LP
3895 if (JOURNAL_HEADER_KEYED_HASH(to->header))
3896 xor_hash ^= jenkins_hash64(data, l);
3897 else
3898 xor_hash ^= le64toh(u->data.hash);
3899
d164ac77
DDM
3900 items[i] = (EntryItem) {
3901 .object_offset = htole64(h),
3902 .hash = u->data.hash,
3903 };
cf244689
LP
3904
3905 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3906 if (r < 0)
3907 return r;
3908 }
3909
bc6b326d 3910 r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n, NULL, NULL, NULL);
fa6ac760 3911
c3bd54bf 3912 if (mmap_cache_fd_got_sigbus(to->cache_fd))
fa6ac760
LP
3913 return -EIO;
3914
3915 return r;
cf244689 3916}
babfc091 3917
8580d1f7
LP
3918void journal_reset_metrics(JournalMetrics *m) {
3919 assert(m);
3920
3921 /* Set everything to "pick automatic values". */
3922
3923 *m = (JournalMetrics) {
f5fbe71d
YW
3924 .min_use = UINT64_MAX,
3925 .max_use = UINT64_MAX,
3926 .min_size = UINT64_MAX,
3927 .max_size = UINT64_MAX,
3928 .keep_free = UINT64_MAX,
3929 .n_max_files = UINT64_MAX,
8580d1f7
LP
3930 };
3931}
3932
babfc091 3933void journal_default_metrics(JournalMetrics *m, int fd) {
babfc091 3934 struct statvfs ss;
6aae0b1a 3935 uint64_t fs_size = 0;
babfc091
LP
3936
3937 assert(m);
3938 assert(fd >= 0);
3939
3940 if (fstatvfs(fd, &ss) >= 0)
3941 fs_size = ss.f_frsize * ss.f_blocks;
6aae0b1a 3942 else
8fc58f1a 3943 log_debug_errno(errno, "Failed to determine disk size: %m");
babfc091 3944
f5fbe71d 3945 if (m->max_use == UINT64_MAX) {
babfc091 3946
6aae0b1a
ZJS
3947 if (fs_size > 0)
3948 m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
3949 MAX_USE_LOWER, MAX_USE_UPPER);
3950 else
3951 m->max_use = MAX_USE_LOWER;
babfc091
LP
3952 } else {
3953 m->max_use = PAGE_ALIGN(m->max_use);
3954
8580d1f7 3955 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3956 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3957 }
3958
f5fbe71d 3959 if (m->min_use == UINT64_MAX) {
6aae0b1a
ZJS
3960 if (fs_size > 0)
3961 m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
3962 MIN_USE_LOW, MIN_USE_HIGH);
3963 else
3964 m->min_use = MIN_USE_LOW;
3965 }
8580d1f7
LP
3966
3967 if (m->min_use > m->max_use)
3968 m->min_use = m->max_use;
3969
f5fbe71d 3970 if (m->max_size == UINT64_MAX)
6aae0b1a
ZJS
3971 m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
3972 MAX_SIZE_UPPER);
3973 else
babfc091
LP
3974 m->max_size = PAGE_ALIGN(m->max_size);
3975
8580d1f7
LP
3976 if (m->max_size != 0) {
3977 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3978 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3979
8580d1f7
LP
3980 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3981 m->max_use = m->max_size*2;
3982 }
babfc091 3983
f5fbe71d 3984 if (m->min_size == UINT64_MAX)
babfc091 3985 m->min_size = JOURNAL_FILE_SIZE_MIN;
6aae0b1a
ZJS
3986 else
3987 m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
3988 JOURNAL_FILE_SIZE_MIN,
3989 m->max_size ?: UINT64_MAX);
babfc091 3990
f5fbe71d 3991 if (m->keep_free == UINT64_MAX) {
6aae0b1a
ZJS
3992 if (fs_size > 0)
3993 m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */
3994 KEEP_FREE_UPPER);
3995 else
babfc091
LP
3996 m->keep_free = DEFAULT_KEEP_FREE;
3997 }
3998
f5fbe71d 3999 if (m->n_max_files == UINT64_MAX)
8580d1f7
LP
4000 m->n_max_files = DEFAULT_N_MAX_FILES;
4001
4002 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
2b59bf51
ZJS
4003 FORMAT_BYTES(m->min_use),
4004 FORMAT_BYTES(m->max_use),
4005 FORMAT_BYTES(m->max_size),
4006 FORMAT_BYTES(m->min_size),
4007 FORMAT_BYTES(m->keep_free),
8580d1f7 4008 m->n_max_files);
babfc091 4009}
08984293
LP
4010
4011int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 4012 assert(f);
c88cc6af 4013 assert(f->header);
08984293
LP
4014 assert(from || to);
4015
4016 if (from) {
162566a4
LP
4017 if (f->header->head_entry_realtime == 0)
4018 return -ENOENT;
08984293 4019
162566a4 4020 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
4021 }
4022
4023 if (to) {
162566a4
LP
4024 if (f->header->tail_entry_realtime == 0)
4025 return -ENOENT;
08984293 4026
162566a4 4027 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
4028 }
4029
4030 return 1;
4031}
4032
4033int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
4034 Object *o;
4035 uint64_t p;
4036 int r;
4037
4038 assert(f);
4039 assert(from || to);
4040
47838ab3 4041 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
4042 if (r <= 0)
4043 return r;
4044
4045 if (le64toh(o->data.n_entries) <= 0)
4046 return 0;
4047
4048 if (from) {
4049 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
4050 if (r < 0)
4051 return r;
4052
4053 *from = le64toh(o->entry.monotonic);
4054 }
4055
4056 if (to) {
4057 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
4058 if (r < 0)
4059 return r;
4060
4061 r = generic_array_get_plus_one(f,
4062 le64toh(o->data.entry_offset),
4063 le64toh(o->data.entry_array_offset),
4064 le64toh(o->data.n_entries)-1,
4065 &o, NULL);
4066 if (r <= 0)
4067 return r;
4068
4069 *to = le64toh(o->entry.monotonic);
4070 }
4071
4072 return 1;
4073}
dca6219e 4074
c8e6e1f1 4075bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec, int log_level) {
dca6219e 4076 assert(f);
c88cc6af 4077 assert(f->header);
dca6219e
LP
4078
4079 /* If we gained new header fields we gained new features,
4080 * hence suggest a rotation */
361f9cbc 4081 if (le64toh(f->header->header_size) < sizeof(Header)) {
c8e6e1f1 4082 log_full(log_level, "%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 4083 return true;
361f9cbc 4084 }
dca6219e 4085
0dbe57ee
LP
4086 /* Let's check if the hash tables grew over a certain fill level (75%, borrowing this value from
4087 * Java's hash table implementation), and if so suggest a rotation. To calculate the fill level we
4088 * need the n_data field, which only exists in newer versions. */
dca6219e
LP
4089
4090 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 4091 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
c8e6e1f1
DDM
4092 log_full(log_level,
4093 "Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
4094 f->path,
4095 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
4096 le64toh(f->header->n_data),
4097 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
4098 (unsigned long long) f->last_stat.st_size,
4099 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 4100 return true;
361f9cbc 4101 }
dca6219e
LP
4102
4103 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 4104 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
c8e6e1f1
DDM
4105 log_full(log_level,
4106 "Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
4107 f->path,
4108 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
4109 le64toh(f->header->n_fields),
4110 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 4111 return true;
361f9cbc 4112 }
dca6219e 4113
0dbe57ee
LP
4114 /* If there are too many hash collisions somebody is most likely playing games with us. Hence, if our
4115 * longest chain is longer than some threshold, let's suggest rotation. */
4116 if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) &&
4117 le64toh(f->header->data_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
c8e6e1f1
DDM
4118 log_full(log_level,
4119 "Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.",
4120 f->path, le64toh(f->header->data_hash_chain_depth));
0dbe57ee
LP
4121 return true;
4122 }
4123
4124 if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) &&
4125 le64toh(f->header->field_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
c8e6e1f1
DDM
4126 log_full(log_level,
4127 "Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.",
4128 f->path, le64toh(f->header->field_hash_chain_depth));
0dbe57ee
LP
4129 return true;
4130 }
4131
0598fd4a
LP
4132 /* Are the data objects properly indexed by field objects? */
4133 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
4134 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
4135 le64toh(f->header->n_data) > 0 &&
012181ea
DDM
4136 le64toh(f->header->n_fields) == 0) {
4137 log_full(log_level,
4138 "Data objects of %s are not indexed by field objects, suggesting rotation.",
4139 f->path);
0598fd4a 4140 return true;
012181ea 4141 }
0598fd4a 4142
fb0951b0
LP
4143 if (max_file_usec > 0) {
4144 usec_t t, h;
4145
4146 h = le64toh(f->header->head_entry_realtime);
4147 t = now(CLOCK_REALTIME);
4148
012181ea
DDM
4149 if (h > 0 && t > h + max_file_usec) {
4150 log_full(log_level,
4151 "Oldest entry in %s is older than the configured file retention duration (%s), suggesting rotation.",
4152 f->path, FORMAT_TIMESPAN(max_file_usec, USEC_PER_SEC));
fb0951b0 4153 return true;
012181ea 4154 }
fb0951b0
LP
4155 }
4156
dca6219e
LP
4157 return false;
4158}
363b2b9a
DDM
4159
/* Human-readable names for the journal object types, indexed by the OBJECT_* constants via designated
 * initializers. Any index not listed here is implicitly NULL. */
4160static const char * const journal_object_type_table[] = {
4161 [OBJECT_UNUSED] = "unused",
4162 [OBJECT_DATA] = "data",
4163 [OBJECT_FIELD] = "field",
4164 [OBJECT_ENTRY] = "entry",
4165 [OBJECT_DATA_HASH_TABLE] = "data hash table",
4166 [OBJECT_FIELD_HASH_TABLE] = "field hash table",
4167 [OBJECT_ENTRY_ARRAY] = "entry array",
4168 [OBJECT_TAG] = "tag",
4169};
4170
/* NOTE(review): presumably expands to a journal_object_type_to_string() lookup that maps an ObjectType
 * value to its entry in journal_object_type_table — confirm against the macro in string-table.h. */
4171DEFINE_STRING_TABLE_LOOKUP_TO_STRING(journal_object_type, ObjectType);