]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
Merge pull request #17799 from yuwata/oss-fuzz-25353
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
cec736d2 2
cec736d2 3#include <errno.h>
cec736d2 4#include <fcntl.h>
11689d2a 5#include <linux/fs.h>
ac2e41f5 6#include <pthread.h>
07630cea
LP
7#include <stddef.h>
8#include <sys/mman.h>
9#include <sys/statvfs.h>
10#include <sys/uio.h>
11#include <unistd.h>
fb0951b0 12
a03d4359
ZJS
13#include "sd-event.h"
14
b5efdb8a 15#include "alloc-util.h"
f27a3864 16#include "btrfs-util.h"
c8b3094d 17#include "chattr-util.h"
07630cea 18#include "compress.h"
4ce534f4 19#include "env-util.h"
3ffd4af2 20#include "fd-util.h"
aa892669 21#include "format-util.h"
11b29a96 22#include "fs-util.h"
0284adc6 23#include "journal-authenticate.h"
cec736d2
LP
24#include "journal-def.h"
25#include "journal-file.h"
26#include "lookup3.h"
0a970718 27#include "memory-util.h"
5d1ce257 28#include "path-util.h"
3df3e884 29#include "random-util.h"
b58c888f 30#include "set.h"
760877e9 31#include "sort-util.h"
3cc44114 32#include "stat-util.h"
07630cea 33#include "string-util.h"
4761fd0f 34#include "strv.h"
89a5a90c 35#include "xattr-util.h"
cec736d2 36
4a92baf3
LP
/* Default sizes, in bytes, of the data and field hash tables: 2047 resp. 333 HashItem slots */
37#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 39
57850536
AG
/* Default and minimum compression threshold (presumably a payload size in bytes; confirm at the users) */
40#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
41#define MIN_COMPRESS_THRESHOLD (8ULL)
807e17f0 42
babfc091 43/* This is the minimum journal file size */
6aae0b1a 44#define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL) /* 512 KiB */
babfc091
LP
45
46/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
6aae0b1a
ZJS
48#define MAX_USE_LOWER (1 * 1024 * 1024ULL) /* 1 MiB */
49#define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
babfc091 50
6aae0b1a
ZJS
51/* These are the lower and upper bounds for the minimal use limit,
 * i.e. how much we'll use even if keep_free suggests otherwise. */
53#define MIN_USE_LOW (1 * 1024 * 1024ULL) /* 1 MiB */
54#define MIN_USE_HIGH (16 * 1024 * 1024ULL) /* 16 MiB */
8580d1f7 55
babfc091 56/* This is the upper bound if we deduce max_size from max_use */
6aae0b1a 57#define MAX_SIZE_UPPER (128 * 1024 * 1024ULL) /* 128 MiB */
babfc091
LP
58
59/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
6aae0b1a 61#define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
babfc091
LP
62
63/* This is the keep_free value when we can't determine the file system
 * size */
6aae0b1a 65#define DEFAULT_KEEP_FREE (1024 * 1024ULL) /* 1 MiB */
babfc091 66
8580d1f7 67/* This is the default maximum number of journal files to keep around. */
6aae0b1a 68#define DEFAULT_N_MAX_FILES 100
8580d1f7 69
dca6219e
LP
70/* n_data was the first entry we added after the initial file format design, hence a valid header
 * must extend at least up to (but not necessarily including) that field */
71#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 72
a4bcff5b
LP
73/* How many entries to keep in the entry array chain cache at most */
74#define CHAIN_CACHE_MAX 20
75
a676e665 76/* How much to increase the journal file size at once each time we allocate something new. */
6aae0b1a 77#define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL) /* 8 MiB */
a676e665 78
2678031a
LP
79/* Reread fstat() of the file for detecting deletions at least this often */
80#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
81
fa6ac760
LP
82/* The mmap context to use for the header: we pick the one right above the last defined object type */
83#define CONTEXT_HEADER _OBJECT_TYPE_MAX
84
0dbe57ee
LP
85/* Longest hash chain to rotate after */
86#define HASH_CHAIN_DEPTH_MAX 100
87
51804460
ZJS
/* clang only: silence the warning about taking the address of members of packed structures */
88#ifdef __clang__
89# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
90#endif
91
ac2e41f5
VC
92/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
93 * As a result we use atomic operations on f->offline_state for inter-thread communications with
94 * journal_file_set_offline() and journal_file_set_online(). */
95static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 96 assert(f);
ac2e41f5
VC
97 assert(f->fd >= 0);
98 assert(f->header);
99
100 for (;;) {
101 switch (f->offline_state) {
102 case OFFLINE_CANCEL:
103 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
104 continue;
105 return;
106
107 case OFFLINE_AGAIN_FROM_SYNCING:
108 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
109 continue;
110 break;
111
112 case OFFLINE_AGAIN_FROM_OFFLINING:
113 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
114 continue;
115 break;
116
117 case OFFLINE_SYNCING:
118 (void) fsync(f->fd);
26687bf8 119
ac2e41f5
VC
120 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
121 continue;
26687bf8 122
8eb85171 123 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
124 (void) fsync(f->fd);
125 break;
126
127 case OFFLINE_OFFLINING:
128 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
129 continue;
4831981d 130 _fallthrough_;
ac2e41f5
VC
131 case OFFLINE_DONE:
132 return;
133
134 case OFFLINE_JOINED:
135 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
136 return;
137 }
138 }
139}
140
/* pthread entry point of the offlining thread: names the thread "journal-offline" and then runs the
 * offline state machine on the JournalFile passed as argument. Always returns NULL. */
static void * journal_file_set_offline_thread(void *arg) {
        /* Naming the thread is purely cosmetic, hence failures are ignored. */
        (void) pthread_setname_np(pthread_self(), "journal-offline");

        journal_file_set_offline_internal(arg);
        return NULL;
}
150
151static int journal_file_set_offline_thread_join(JournalFile *f) {
152 int r;
153
154 assert(f);
155
156 if (f->offline_state == OFFLINE_JOINED)
157 return 0;
158
159 r = pthread_join(f->offline_thread, NULL);
160 if (r)
161 return -r;
162
163 f->offline_state = OFFLINE_JOINED;
26687bf8 164
be7cdd8e 165 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
166 return -EIO;
167
ac2e41f5
VC
168 return 0;
169}
26687bf8 170
ac2e41f5
VC
171/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
172static bool journal_file_set_offline_try_restart(JournalFile *f) {
173 for (;;) {
174 switch (f->offline_state) {
175 case OFFLINE_AGAIN_FROM_SYNCING:
176 case OFFLINE_AGAIN_FROM_OFFLINING:
177 return true;
178
179 case OFFLINE_CANCEL:
180 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
181 continue;
182 return true;
183
184 case OFFLINE_SYNCING:
185 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
186 continue;
187 return true;
188
189 case OFFLINE_OFFLINING:
190 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
191 continue;
192 return true;
26687bf8
OS
193
194 default:
ac2e41f5
VC
195 return false;
196 }
26687bf8
OS
197 }
198}
199
ac2e41f5
VC
200/* Sets a journal offline.
201 *
202 * If wait is false then an offline is dispatched in a separate thread for a
203 * subsequent journal_file_set_offline() or journal_file_set_online() of the
204 * same journal to synchronize with.
205 *
206 * If wait is true, then either an existing offline thread will be restarted
207 * and joined, or if none exists the offline is simply performed in this
208 * context without involving another thread.
209 */
210int journal_file_set_offline(JournalFile *f, bool wait) {
211 bool restarted;
212 int r;
213
26687bf8
OS
214 assert(f);
215
216 if (!f->writable)
217 return -EPERM;
218
846e5418 219 if (f->fd < 0 || !f->header)
26687bf8
OS
220 return -EINVAL;
221
b8f99e27
VC
222 /* An offlining journal is implicitly online and may modify f->header->state,
223 * we must also join any potentially lingering offline thread when not online. */
224 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
225 return journal_file_set_offline_thread_join(f);
26687bf8 226
ac2e41f5
VC
227 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
228 restarted = journal_file_set_offline_try_restart(f);
229 if ((restarted && wait) || !restarted) {
230 r = journal_file_set_offline_thread_join(f);
231 if (r < 0)
232 return r;
233 }
26687bf8 234
ac2e41f5
VC
235 if (restarted)
236 return 0;
237
238 /* Initiate a new offline. */
239 f->offline_state = OFFLINE_SYNCING;
fa6ac760 240
ac2e41f5
VC
241 if (wait) /* Without using a thread if waiting. */
242 journal_file_set_offline_internal(f);
243 else {
5e9f01e8
LP
244 sigset_t ss, saved_ss;
245 int k;
246
cd2a429e 247 assert_se(sigfillset(&ss) >= 0);
08f9e80b
CM
248 /* Don't block SIGBUS since the offlining thread accesses a memory mapped file.
249 * Asynchronous SIGBUS signals can safely be handled by either thread. */
250 assert_se(sigdelset(&ss, SIGBUS) >= 0);
5e9f01e8
LP
251
252 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
253 if (r > 0)
254 return -r;
255
ac2e41f5 256 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
5e9f01e8
LP
257
258 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
ec9ffa2c
VC
259 if (r > 0) {
260 f->offline_state = OFFLINE_JOINED;
ac2e41f5 261 return -r;
ec9ffa2c 262 }
5e9f01e8
LP
263 if (k > 0)
264 return -k;
ac2e41f5
VC
265 }
266
267 return 0;
268}
269
270static int journal_file_set_online(JournalFile *f) {
83bf6b67 271 bool wait = true;
ac2e41f5
VC
272
273 assert(f);
274
275 if (!f->writable)
276 return -EPERM;
277
846e5418 278 if (f->fd < 0 || !f->header)
ac2e41f5
VC
279 return -EINVAL;
280
83bf6b67 281 while (wait) {
ac2e41f5
VC
282 switch (f->offline_state) {
283 case OFFLINE_JOINED:
284 /* No offline thread, no need to wait. */
83bf6b67 285 wait = false;
ac2e41f5
VC
286 break;
287
288 case OFFLINE_SYNCING:
289 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
290 continue;
291 /* Canceled syncing prior to offlining, no need to wait. */
83bf6b67 292 wait = false;
ac2e41f5
VC
293 break;
294
295 case OFFLINE_AGAIN_FROM_SYNCING:
296 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
297 continue;
298 /* Canceled restart from syncing, no need to wait. */
83bf6b67 299 wait = false;
ac2e41f5
VC
300 break;
301
302 case OFFLINE_AGAIN_FROM_OFFLINING:
303 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
304 continue;
305 /* Canceled restart from offlining, must wait for offlining to complete however. */
4831981d 306 _fallthrough_;
ac2e41f5
VC
307 default: {
308 int r;
309
310 r = journal_file_set_offline_thread_join(f);
311 if (r < 0)
312 return r;
313
83bf6b67 314 wait = false;
ac2e41f5
VC
315 break;
316 }
317 }
318 }
26687bf8 319
be7cdd8e 320 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
321 return -EIO;
322
ac2e41f5
VC
323 switch (f->header->state) {
324 case STATE_ONLINE:
325 return 0;
26687bf8 326
ac2e41f5
VC
327 case STATE_OFFLINE:
328 f->header->state = STATE_ONLINE;
329 (void) fsync(f->fd);
330 return 0;
331
332 default:
333 return -EINVAL;
334 }
26687bf8
OS
335}
336
b58c888f
VC
337bool journal_file_is_offlining(JournalFile *f) {
338 assert(f);
339
340 __sync_synchronize();
341
3742095b 342 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
343 return false;
344
345 return true;
346}
347
804ae586 348JournalFile* journal_file_close(JournalFile *f) {
c377a6f3
YW
349 if (!f)
350 return NULL;
cec736d2 351
349cc4a5 352#if HAVE_GCRYPT
b0af6f41 353 /* Write the final tag */
43cd8794
FB
354 if (f->seal && f->writable) {
355 int r;
356
357 r = journal_file_append_tag(f);
358 if (r < 0)
359 log_error_errno(r, "Failed to append tag when closing journal: %m");
360 }
feb12d3e 361#endif
b0af6f41 362
7a24f3bf 363 if (f->post_change_timer) {
b6cdfbe5
ZJS
364 if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
365 journal_file_post_change(f);
7a24f3bf 366
1d3fe304 367 sd_event_source_disable_unref(f->post_change_timer);
7a24f3bf
VC
368 }
369
ac2e41f5 370 journal_file_set_offline(f, true);
cec736d2 371
be7cdd8e
VC
372 if (f->mmap && f->cache_fd)
373 mmap_cache_free_fd(f->mmap, f->cache_fd);
cec736d2 374
11689d2a
LP
375 if (f->fd >= 0 && f->defrag_on_close) {
376
377 /* Be friendly to btrfs: turn COW back on again now,
378 * and defragment the file. We won't write to the file
379 * ever again, hence remove all fragmentation, and
380 * reenable all the good bits COW usually provides
381 * (such as data checksumming). */
382
db9a4254 383 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
11689d2a
LP
384 (void) btrfs_defrag_fd(f->fd);
385 }
f27a3864 386
5d1ce257
LP
387 if (f->close_fd)
388 safe_close(f->fd);
cec736d2 389 free(f->path);
807e17f0 390
f649045c 391 mmap_cache_unref(f->mmap);
16e9f408 392
4743015d 393 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 394
d80b051c 395#if HAVE_COMPRESSION
807e17f0
LP
396 free(f->compress_buffer);
397#endif
398
349cc4a5 399#if HAVE_GCRYPT
baed47c3
LP
400 if (f->fss_file)
401 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 402 else
b7c9ae91
LP
403 free(f->fsprg_state);
404
405 free(f->fsprg_seed);
7560fffc
LP
406
407 if (f->hmac)
408 gcry_md_close(f->hmac);
409#endif
410
6b430fdb 411 return mfree(f);
cec736d2
LP
412}
413
0ac38b70 414static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 415 Header h = {};
cec736d2
LP
416 ssize_t k;
417 int r;
418
419 assert(f);
420
7560fffc 421 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 422 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 423
d89c8fdf
ZJS
424 h.incompatible_flags |= htole32(
425 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
4ce534f4 426 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4 |
8653185a 427 f->compress_zstd * HEADER_INCOMPATIBLE_COMPRESSED_ZSTD |
4ce534f4 428 f->keyed_hash * HEADER_INCOMPATIBLE_KEYED_HASH);
7560fffc 429
d89c8fdf
ZJS
430 h.compatible_flags = htole32(
431 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 432
cec736d2
LP
433 r = sd_id128_randomize(&h.file_id);
434 if (r < 0)
435 return r;
436
0ac38b70
LP
437 if (template) {
438 h.seqnum_id = template->header->seqnum_id;
beec0085 439 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
440 } else
441 h.seqnum_id = h.file_id;
cec736d2
LP
442
443 k = pwrite(f->fd, &h, sizeof(h), 0);
444 if (k < 0)
445 return -errno;
446
447 if (k != sizeof(h))
448 return -EIO;
449
450 return 0;
451}
452
453static int journal_file_refresh_header(JournalFile *f) {
fa6ac760 454 int r;
cec736d2
LP
455
456 assert(f);
c88cc6af 457 assert(f->header);
cec736d2
LP
458
459 r = sd_id128_get_machine(&f->header->machine_id);
fd4885df
ZJS
460 if (IN_SET(r, -ENOENT, -ENOMEDIUM))
461 /* We don't have a machine-id, let's continue without */
462 zero(f->header->machine_id);
463 else if (r < 0)
cec736d2
LP
464 return r;
465
e958c057 466 r = sd_id128_get_boot(&f->header->boot_id);
cec736d2
LP
467 if (r < 0)
468 return r;
469
fa6ac760 470 r = journal_file_set_online(f);
b788cc23 471
7560fffc 472 /* Sync the online state to disk */
fb426037 473 (void) fsync(f->fd);
b788cc23 474
a0fe2a2d
LP
475 /* We likely just created a new file, also sync the directory this file is located in. */
476 (void) fsync_directory_of_file(f->fd);
477
fa6ac760 478 return r;
cec736d2
LP
479}
480
4214009f
ZJS
481static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
482 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
483 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
484 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
485 uint32_t flags;
486
4214009f
ZJS
487 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
488
489 if (flags & ~supported) {
490 if (flags & ~any)
4761fd0f 491 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
492 f->path, type, flags & ~any);
493 flags = (flags & any) & ~supported;
4761fd0f 494 if (flags) {
8653185a 495 const char* strv[5];
4761fd0f
ZJS
496 unsigned n = 0;
497 _cleanup_free_ char *t = NULL;
498
4ce534f4
LP
499 if (compatible) {
500 if (flags & HEADER_COMPATIBLE_SEALED)
501 strv[n++] = "sealed";
502 } else {
503 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ)
504 strv[n++] = "xz-compressed";
505 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4)
506 strv[n++] = "lz4-compressed";
8653185a
LP
507 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_ZSTD)
508 strv[n++] = "zstd-compressed";
4ce534f4
LP
509 if (flags & HEADER_INCOMPATIBLE_KEYED_HASH)
510 strv[n++] = "keyed-hash";
511 }
4761fd0f
ZJS
512 strv[n] = NULL;
513 assert(n < ELEMENTSOF(strv));
514
515 t = strv_join((char**) strv, ", ");
516 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
517 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
518 }
4214009f
ZJS
519 return true;
520 }
521
522 return false;
523}
524
525static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
526 uint64_t arena_size, header_size;
527
cec736d2 528 assert(f);
c88cc6af 529 assert(f->header);
cec736d2 530
7560fffc 531 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
532 return -EBADMSG;
533
4214009f
ZJS
534 /* In both read and write mode we refuse to open files with incompatible
535 * flags we don't know. */
536 if (warn_wrong_flags(f, false))
cec736d2
LP
537 return -EPROTONOSUPPORT;
538
4214009f
ZJS
539 /* When open for writing we refuse to open files with compatible flags, too. */
540 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 541 return -EPROTONOSUPPORT;
7560fffc 542
db11ac1a
LP
543 if (f->header->state >= _STATE_MAX)
544 return -EBADMSG;
545
893e0f8f 546 header_size = le64toh(READ_NOW(f->header->header_size));
6f94e420 547
dca6219e 548 /* The first addition was n_data, so check that we are at least this large */
6f94e420 549 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
550 return -EBADMSG;
551
8088cbd3 552 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
553 return -EBADMSG;
554
893e0f8f 555 arena_size = le64toh(READ_NOW(f->header->arena_size));
6f94e420
TS
556
557 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
558 return -ENODATA;
559
6f94e420 560 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
561 return -ENODATA;
562
7762e02b
LP
563 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
564 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
565 !VALID64(le64toh(f->header->tail_object_offset)) ||
566 !VALID64(le64toh(f->header->entry_array_offset)))
567 return -ENODATA;
568
cec736d2 569 if (f->writable) {
cec736d2 570 sd_id128_t machine_id;
ae739cc1 571 uint8_t state;
cec736d2
LP
572 int r;
573
574 r = sd_id128_get_machine(&machine_id);
575 if (r < 0)
576 return r;
577
578 if (!sd_id128_equal(machine_id, f->header->machine_id))
579 return -EHOSTDOWN;
580
de190aef 581 state = f->header->state;
cec736d2 582
b288cdeb
ZJS
583 if (state == STATE_ARCHIVED)
584 return -ESHUTDOWN; /* Already archived */
baaa35ad
ZJS
585 else if (state == STATE_ONLINE)
586 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
587 "Journal file %s is already online. Assuming unclean closing.",
588 f->path);
589 else if (state != STATE_OFFLINE)
590 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
591 "Journal file %s has unknown state %i.",
592 f->path, state);
ae739cc1 593
5b3cc0c8
YN
594 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
595 return -EBADMSG;
596
ae739cc1
LP
597 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
598 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
599 * bisection. */
baaa35ad
ZJS
600 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
601 return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
602 "Journal file %s is from the future, refusing to append new data to it that'd be older.",
603 f->path);
cec736d2
LP
604 }
605
d89c8fdf
ZJS
606 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
607 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
8653185a 608 f->compress_zstd = JOURNAL_HEADER_COMPRESSED_ZSTD(f->header);
c586dbf1 609
f1889c91 610 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 611
4ce534f4
LP
612 f->keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header);
613
cec736d2
LP
614 return 0;
615}
616
28ca867a 617int journal_file_fstat(JournalFile *f) {
3cc44114
LP
618 int r;
619
2678031a
LP
620 assert(f);
621 assert(f->fd >= 0);
622
623 if (fstat(f->fd, &f->last_stat) < 0)
624 return -errno;
625
626 f->last_stat_usec = now(CLOCK_MONOTONIC);
627
e9dd6984 628 /* Refuse dealing with files that aren't regular */
3cc44114
LP
629 r = stat_verify_regular(&f->last_stat);
630 if (r < 0)
631 return r;
8d6a4d33 632
2678031a
LP
633 /* Refuse appending to files that are already deleted */
634 if (f->last_stat.st_nlink <= 0)
635 return -EIDRM;
636
637 return 0;
638}
639
cec736d2 640static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
893e0f8f 641 uint64_t old_size, new_size, old_header_size, old_arena_size;
fec2aa2f 642 int r;
cec736d2
LP
643
644 assert(f);
c88cc6af 645 assert(f->header);
cec736d2 646
893e0f8f
LP
647 /* We assume that this file is not sparse, and we know that for sure, since we always call
648 * posix_fallocate() ourselves */
649
650 if (size > PAGE_ALIGN_DOWN(UINT64_MAX) - offset)
651 return -EINVAL;
cec736d2 652
be7cdd8e 653 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
654 return -EIO;
655
893e0f8f
LP
656 old_header_size = le64toh(READ_NOW(f->header->header_size));
657 old_arena_size = le64toh(READ_NOW(f->header->arena_size));
658 if (old_arena_size > PAGE_ALIGN_DOWN(UINT64_MAX) - old_header_size)
659 return -EBADMSG;
660
661 old_size = old_header_size + old_arena_size;
cec736d2 662
893e0f8f 663 new_size = MAX(PAGE_ALIGN(offset + size), old_header_size);
bc85bfee 664
2678031a
LP
665 if (new_size <= old_size) {
666
667 /* We already pre-allocated enough space, but before
668 * we write to it, let's check with fstat() if the
669 * file got deleted, in order make sure we don't throw
670 * away the data immediately. Don't check fstat() for
671 * all writes though, but only once ever 10s. */
672
673 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
674 return 0;
675
676 return journal_file_fstat(f);
677 }
678
679 /* Allocate more space. */
cec736d2 680
a676e665 681 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 682 return -E2BIG;
cec736d2 683
a676e665 684 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
685 struct statvfs svfs;
686
687 if (fstatvfs(f->fd, &svfs) >= 0) {
688 uint64_t available;
689
070052ab 690 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
691
692 if (new_size - old_size > available)
693 return -E2BIG;
694 }
695 }
696
eda4b58b 697 /* Increase by larger blocks at once */
be6b0c21 698 new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
eda4b58b
LP
699 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
700 new_size = f->metrics.max_size;
701
bc85bfee
LP
702 /* Note that the glibc fallocate() fallback is very
703 inefficient, hence we try to minimize the allocation area
704 as we can. */
fec2aa2f
GV
705 r = posix_fallocate(f->fd, old_size, new_size - old_size);
706 if (r != 0)
707 return -r;
cec736d2 708
893e0f8f 709 f->header->arena_size = htole64(new_size - old_header_size);
cec736d2 710
2678031a 711 return journal_file_fstat(f);
cec736d2
LP
712}
713
78519831 714static unsigned type_to_context(ObjectType type) {
d3d3208f 715 /* One context for each type, plus one catch-all for the rest */
69adae51 716 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 717 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 718 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
719}
720
71139898
LP
721static int journal_file_move_to(
722 JournalFile *f,
723 ObjectType type,
724 bool keep_always,
725 uint64_t offset,
726 uint64_t size,
258190a0 727 void **ret) {
71139898 728
2678031a
LP
729 int r;
730
cec736d2 731 assert(f);
cec736d2
LP
732 assert(ret);
733
7762e02b
LP
734 if (size <= 0)
735 return -EINVAL;
736
893e0f8f
LP
737 if (size > UINT64_MAX - offset)
738 return -EBADMSG;
739
2a59ea54 740 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
741 if (offset + size > (uint64_t) f->last_stat.st_size) {
742 /* Hmm, out of range? Let's refresh the fstat() data
743 * first, before we trust that check. */
744
2678031a
LP
745 r = journal_file_fstat(f);
746 if (r < 0)
747 return r;
748
749 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
750 return -EADDRNOTAVAIL;
751 }
752
258190a0 753 return mmap_cache_get(f->mmap, f->cache_fd, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
754}
755
16e9f408
LP
756static uint64_t minimum_header_size(Object *o) {
757
b8e891e6 758 static const uint64_t table[] = {
16e9f408
LP
759 [OBJECT_DATA] = sizeof(DataObject),
760 [OBJECT_FIELD] = sizeof(FieldObject),
761 [OBJECT_ENTRY] = sizeof(EntryObject),
762 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
763 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
764 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
765 [OBJECT_TAG] = sizeof(TagObject),
766 };
767
768 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
769 return sizeof(ObjectHeader);
770
771 return table[o->object.type];
772}
773
24754f36
TR
774/* Lightweight object checks. We want this to be fast, so that we won't
775 * slowdown every journal_file_move_to_object() call too much. */
776static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
777 assert(f);
778 assert(o);
779
780 switch (o->object.type) {
781
a602d93e 782 case OBJECT_DATA:
baaa35ad
ZJS
783 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
784 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
785 "Bad n_entries: %" PRIu64 ": %" PRIu64,
786 le64toh(o->data.n_entries),
787 offset);
788
20ee282b 789 if (le64toh(o->object.size) <= offsetof(DataObject, payload))
baaa35ad
ZJS
790 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
791 "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
792 offsetof(DataObject, payload),
793 le64toh(o->object.size),
794 offset);
24754f36 795
10e8445b
TR
796 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
797 !VALID64(le64toh(o->data.next_field_offset)) ||
798 !VALID64(le64toh(o->data.entry_offset)) ||
baaa35ad
ZJS
799 !VALID64(le64toh(o->data.entry_array_offset)))
800 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
801 "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
802 le64toh(o->data.next_hash_offset),
803 le64toh(o->data.next_field_offset),
804 le64toh(o->data.entry_offset),
805 le64toh(o->data.entry_array_offset),
806 offset);
24754f36
TR
807
808 break;
24754f36
TR
809
810 case OBJECT_FIELD:
20ee282b 811 if (le64toh(o->object.size) <= offsetof(FieldObject, payload))
baaa35ad
ZJS
812 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
813 "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
814 offsetof(FieldObject, payload),
815 le64toh(o->object.size),
816 offset);
24754f36 817
10e8445b 818 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
baaa35ad
ZJS
819 !VALID64(le64toh(o->field.head_data_offset)))
820 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
821 "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
822 le64toh(o->field.next_hash_offset),
823 le64toh(o->field.head_data_offset),
824 offset);
24754f36
TR
825 break;
826
893e0f8f
LP
827 case OBJECT_ENTRY: {
828 uint64_t sz;
829
830 sz = le64toh(READ_NOW(o->object.size));
831 if (sz < offsetof(EntryObject, items) ||
832 (sz - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
baaa35ad
ZJS
833 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
834 "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
835 offsetof(EntryObject, items),
893e0f8f 836 sz,
baaa35ad
ZJS
837 offset);
838
893e0f8f 839 if ((sz - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
baaa35ad
ZJS
840 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
841 "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
893e0f8f 842 (sz - offsetof(EntryObject, items)) / sizeof(EntryItem),
baaa35ad
ZJS
843 offset);
844
845 if (le64toh(o->entry.seqnum) <= 0)
846 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
847 "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
848 le64toh(o->entry.seqnum),
849 offset);
850
851 if (!VALID_REALTIME(le64toh(o->entry.realtime)))
852 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
853 "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
854 le64toh(o->entry.realtime),
855 offset);
856
857 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
858 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
859 "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
860 le64toh(o->entry.monotonic),
861 offset);
24754f36
TR
862
863 break;
893e0f8f 864 }
24754f36
TR
865
866 case OBJECT_DATA_HASH_TABLE:
893e0f8f
LP
867 case OBJECT_FIELD_HASH_TABLE: {
868 uint64_t sz;
869
870 sz = le64toh(READ_NOW(o->object.size));
871 if (sz < offsetof(HashTableObject, items) ||
872 (sz - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
873 (sz - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0)
baaa35ad
ZJS
874 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
875 "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
876 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
893e0f8f 877 sz,
baaa35ad 878 offset);
24754f36
TR
879
880 break;
893e0f8f 881 }
24754f36 882
893e0f8f
LP
883 case OBJECT_ENTRY_ARRAY: {
884 uint64_t sz;
885
886 sz = le64toh(READ_NOW(o->object.size));
887 if (sz < offsetof(EntryArrayObject, items) ||
888 (sz - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
889 (sz - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0)
baaa35ad
ZJS
890 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
891 "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
893e0f8f 892 sz,
baaa35ad
ZJS
893 offset);
894
895 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
896 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
897 "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
898 le64toh(o->entry_array.next_entry_array_offset),
899 offset);
24754f36
TR
900
901 break;
893e0f8f 902 }
24754f36
TR
903
904 case OBJECT_TAG:
baaa35ad
ZJS
905 if (le64toh(o->object.size) != sizeof(TagObject))
906 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
907 "Invalid object tag size: %" PRIu64 ": %" PRIu64,
908 le64toh(o->object.size),
909 offset);
24754f36 910
baaa35ad
ZJS
911 if (!VALID_EPOCH(le64toh(o->tag.epoch)))
912 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
913 "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
914 le64toh(o->tag.epoch), offset);
24754f36
TR
915
916 break;
917 }
918
919 return 0;
920}
921
78519831 922int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
923 int r;
924 void *t;
925 Object *o;
926 uint64_t s;
927
928 assert(f);
929 assert(ret);
930
db11ac1a 931 /* Objects may only be located at multiple of 64 bit */
baaa35ad
ZJS
932 if (!VALID64(offset))
933 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
934 "Attempt to move to object at non-64bit boundary: %" PRIu64,
935 offset);
db11ac1a 936
50809d7a 937 /* Object may not be located in the file header */
baaa35ad
ZJS
938 if (offset < le64toh(f->header->header_size))
939 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
940 "Attempt to move to object located in file header: %" PRIu64,
941 offset);
50809d7a 942
258190a0 943 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
944 if (r < 0)
945 return r;
946
947 o = (Object*) t;
893e0f8f 948 s = le64toh(READ_NOW(o->object.size));
cec736d2 949
baaa35ad
ZJS
950 if (s == 0)
951 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
952 "Attempt to move to uninitialized object: %" PRIu64,
953 offset);
954 if (s < sizeof(ObjectHeader))
955 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
956 "Attempt to move to overly short object: %" PRIu64,
957 offset);
958
959 if (o->object.type <= OBJECT_UNUSED)
960 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
961 "Attempt to move to object with invalid type: %" PRIu64,
962 offset);
963
964 if (s < minimum_header_size(o))
965 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
966 "Attempt to move to truncated object: %" PRIu64,
967 offset);
968
969 if (type > OBJECT_UNUSED && o->object.type != type)
970 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
971 "Attempt to move to object of unexpected type: %" PRIu64,
972 offset);
cec736d2 973
258190a0
VC
974 r = journal_file_move_to(f, type, false, offset, s, &t);
975 if (r < 0)
976 return r;
cec736d2 977
258190a0 978 o = (Object*) t;
cec736d2 979
24754f36
TR
980 r = journal_file_check_object(f, offset, o);
981 if (r < 0)
982 return r;
983
cec736d2
LP
984 *ret = o;
985 return 0;
986}
987
d98cc1f2 988static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
989 uint64_t r;
990
991 assert(f);
c88cc6af 992 assert(f->header);
cec736d2 993
beec0085 994 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
995
996 if (seqnum) {
de190aef 997 /* If an external seqnum counter was passed, we update
c2373f84
LP
998 * both the local and the external one, and set it to
999 * the maximum of both */
1000
1001 if (*seqnum + 1 > r)
1002 r = *seqnum + 1;
1003
1004 *seqnum = r;
1005 }
1006
beec0085 1007 f->header->tail_entry_seqnum = htole64(r);
cec736d2 1008
beec0085
LP
1009 if (f->header->head_entry_seqnum == 0)
1010 f->header->head_entry_seqnum = htole64(r);
de190aef 1011
cec736d2
LP
1012 return r;
1013}
1014
f4474e00
LP
1015int journal_file_append_object(
1016 JournalFile *f,
1017 ObjectType type,
1018 uint64_t size,
1019 Object **ret,
1020 uint64_t *ret_offset) {
1021
cec736d2
LP
1022 int r;
1023 uint64_t p;
1024 Object *tail, *o;
1025 void *t;
1026
1027 assert(f);
c88cc6af 1028 assert(f->header);
d05089d8 1029 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2 1030 assert(size >= sizeof(ObjectHeader));
cec736d2 1031
26687bf8
OS
1032 r = journal_file_set_online(f);
1033 if (r < 0)
1034 return r;
1035
cec736d2 1036 p = le64toh(f->header->tail_object_offset);
cec736d2 1037 if (p == 0)
23b0b2b2 1038 p = le64toh(f->header->header_size);
cec736d2 1039 else {
893e0f8f
LP
1040 uint64_t sz;
1041
d05089d8 1042 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
1043 if (r < 0)
1044 return r;
1045
893e0f8f
LP
1046 sz = le64toh(READ_NOW(tail->object.size));
1047 if (sz > UINT64_MAX - sizeof(uint64_t) + 1)
1048 return -EBADMSG;
1049
1050 sz = ALIGN64(sz);
1051 if (p > UINT64_MAX - sz)
1052 return -EBADMSG;
1053
1054 p += sz;
cec736d2
LP
1055 }
1056
1057 r = journal_file_allocate(f, p, size);
1058 if (r < 0)
1059 return r;
1060
258190a0 1061 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
1062 if (r < 0)
1063 return r;
1064
1065 o = (Object*) t;
71139898
LP
1066 o->object = (ObjectHeader) {
1067 .type = type,
1068 .size = htole64(size),
1069 };
cec736d2
LP
1070
1071 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1072 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1073
f4474e00
LP
1074 if (ret)
1075 *ret = o;
1076
1077 if (ret_offset)
1078 *ret_offset = p;
cec736d2
LP
1079
1080 return 0;
1081}
1082
de190aef 1083static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1084 uint64_t s, p;
1085 Object *o;
1086 int r;
1087
1088 assert(f);
c88cc6af 1089 assert(f->header);
cec736d2 1090
070052ab
LP
1091 /* We estimate that we need 1 hash table entry per 768 bytes
1092 of journal file and we want to make sure we never get
1093 beyond 75% fill level. Calculate the hash table size for
1094 the maximum file size based on these metrics. */
4a92baf3 1095
dfabe643 1096 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1097 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1098 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1099
5030c85a 1100 log_debug("Reserving %"PRIu64" entries in data hash table.", s / sizeof(HashItem));
4a92baf3 1101
de190aef
LP
1102 r = journal_file_append_object(f,
1103 OBJECT_DATA_HASH_TABLE,
1104 offsetof(Object, hash_table.items) + s,
1105 &o, &p);
cec736d2
LP
1106 if (r < 0)
1107 return r;
1108
29804cc1 1109 memzero(o->hash_table.items, s);
cec736d2 1110
de190aef
LP
1111 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1112 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1113
1114 return 0;
1115}
1116
de190aef 1117static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1118 uint64_t s, p;
1119 Object *o;
1120 int r;
1121
1122 assert(f);
c88cc6af 1123 assert(f->header);
cec736d2 1124
3c1668da
LP
1125 /* We use a fixed size hash table for the fields as this
1126 * number should grow very slowly only */
1127
de190aef 1128 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
5030c85a
LP
1129 log_debug("Reserving %"PRIu64" entries in field hash table.", s / sizeof(HashItem));
1130
de190aef
LP
1131 r = journal_file_append_object(f,
1132 OBJECT_FIELD_HASH_TABLE,
1133 offsetof(Object, hash_table.items) + s,
1134 &o, &p);
cec736d2
LP
1135 if (r < 0)
1136 return r;
1137
29804cc1 1138 memzero(o->hash_table.items, s);
cec736d2 1139
de190aef
LP
1140 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1141 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1142
1143 return 0;
1144}
1145
dade37d4 1146int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1147 uint64_t s, p;
1148 void *t;
1149 int r;
1150
1151 assert(f);
c88cc6af 1152 assert(f->header);
cec736d2 1153
dade37d4
LP
1154 if (f->data_hash_table)
1155 return 0;
1156
de190aef
LP
1157 p = le64toh(f->header->data_hash_table_offset);
1158 s = le64toh(f->header->data_hash_table_size);
cec736d2 1159
de190aef 1160 r = journal_file_move_to(f,
16e9f408 1161 OBJECT_DATA_HASH_TABLE,
fcde2389 1162 true,
de190aef 1163 p, s,
258190a0 1164 &t);
cec736d2
LP
1165 if (r < 0)
1166 return r;
1167
de190aef 1168 f->data_hash_table = t;
cec736d2
LP
1169 return 0;
1170}
1171
dade37d4 1172int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1173 uint64_t s, p;
1174 void *t;
1175 int r;
1176
1177 assert(f);
c88cc6af 1178 assert(f->header);
cec736d2 1179
dade37d4
LP
1180 if (f->field_hash_table)
1181 return 0;
1182
de190aef
LP
1183 p = le64toh(f->header->field_hash_table_offset);
1184 s = le64toh(f->header->field_hash_table_size);
cec736d2 1185
de190aef 1186 r = journal_file_move_to(f,
16e9f408 1187 OBJECT_FIELD_HASH_TABLE,
fcde2389 1188 true,
de190aef 1189 p, s,
258190a0 1190 &t);
cec736d2
LP
1191 if (r < 0)
1192 return r;
1193
de190aef 1194 f->field_hash_table = t;
cec736d2
LP
1195 return 0;
1196}
1197
3c1668da
LP
1198static int journal_file_link_field(
1199 JournalFile *f,
1200 Object *o,
1201 uint64_t offset,
1202 uint64_t hash) {
1203
805d1486 1204 uint64_t p, h, m;
3c1668da
LP
1205 int r;
1206
1207 assert(f);
c88cc6af 1208 assert(f->header);
90d222c1 1209 assert(f->field_hash_table);
3c1668da
LP
1210 assert(o);
1211 assert(offset > 0);
1212
1213 if (o->object.type != OBJECT_FIELD)
1214 return -EINVAL;
1215
893e0f8f 1216 m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
805d1486
LP
1217 if (m <= 0)
1218 return -EBADMSG;
3c1668da 1219
805d1486 1220 /* This might alter the window we are looking at */
3c1668da
LP
1221 o->field.next_hash_offset = o->field.head_data_offset = 0;
1222
805d1486 1223 h = hash % m;
3c1668da
LP
1224 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1225 if (p == 0)
1226 f->field_hash_table[h].head_hash_offset = htole64(offset);
1227 else {
1228 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1229 if (r < 0)
1230 return r;
1231
1232 o->field.next_hash_offset = htole64(offset);
1233 }
1234
1235 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1236
1237 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1238 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1239
1240 return 0;
1241}
1242
1243static int journal_file_link_data(
1244 JournalFile *f,
1245 Object *o,
1246 uint64_t offset,
1247 uint64_t hash) {
1248
805d1486 1249 uint64_t p, h, m;
cec736d2
LP
1250 int r;
1251
1252 assert(f);
c88cc6af 1253 assert(f->header);
90d222c1 1254 assert(f->data_hash_table);
cec736d2
LP
1255 assert(o);
1256 assert(offset > 0);
b588975f
LP
1257
1258 if (o->object.type != OBJECT_DATA)
1259 return -EINVAL;
cec736d2 1260
893e0f8f 1261 m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
805d1486
LP
1262 if (m <= 0)
1263 return -EBADMSG;
48496df6 1264
805d1486 1265 /* This might alter the window we are looking at */
de190aef
LP
1266 o->data.next_hash_offset = o->data.next_field_offset = 0;
1267 o->data.entry_offset = o->data.entry_array_offset = 0;
1268 o->data.n_entries = 0;
cec736d2 1269
805d1486 1270 h = hash % m;
8db4213e 1271 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1272 if (p == 0)
cec736d2 1273 /* Only entry in the hash table is easy */
de190aef 1274 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1275 else {
48496df6
LP
1276 /* Move back to the previous data object, to patch in
1277 * pointer */
cec736d2 1278
de190aef 1279 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1280 if (r < 0)
1281 return r;
1282
de190aef 1283 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1284 }
1285
de190aef 1286 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1287
dca6219e
LP
1288 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1289 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1290
cec736d2
LP
1291 return 0;
1292}
1293
0dbe57ee
LP
1294static int next_hash_offset(
1295 JournalFile *f,
1296 uint64_t *p,
1297 le64_t *next_hash_offset,
1298 uint64_t *depth,
1299 le64_t *header_max_depth) {
1300
1301 uint64_t nextp;
1302
1303 nextp = le64toh(READ_NOW(*next_hash_offset));
1304 if (nextp > 0) {
1305 if (nextp <= *p) /* Refuse going in loops */
1306 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1307 "Detected hash item loop in %s, refusing.", f->path);
1308
1309 (*depth)++;
1310
1311 /* If the depth of this hash chain is larger than all others we have seen so far, record it */
1312 if (header_max_depth && f->writable)
1313 *header_max_depth = htole64(MAX(*depth, le64toh(*header_max_depth)));
1314 }
1315
1316 *p = nextp;
1317 return 0;
1318}
1319
3c1668da
LP
1320int journal_file_find_field_object_with_hash(
1321 JournalFile *f,
1322 const void *field, uint64_t size, uint64_t hash,
f4474e00 1323 Object **ret, uint64_t *ret_offset) {
3c1668da 1324
0dbe57ee 1325 uint64_t p, osize, h, m, depth = 0;
3c1668da
LP
1326 int r;
1327
1328 assert(f);
c88cc6af 1329 assert(f->header);
3c1668da
LP
1330 assert(field && size > 0);
1331
dade37d4
LP
1332 /* If the field hash table is empty, we can't find anything */
1333 if (le64toh(f->header->field_hash_table_size) <= 0)
1334 return 0;
1335
1336 /* Map the field hash table, if it isn't mapped yet. */
1337 r = journal_file_map_field_hash_table(f);
1338 if (r < 0)
1339 return r;
1340
3c1668da
LP
1341 osize = offsetof(Object, field.payload) + size;
1342
893e0f8f 1343 m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
805d1486 1344 if (m <= 0)
3c1668da
LP
1345 return -EBADMSG;
1346
805d1486 1347 h = hash % m;
3c1668da 1348 p = le64toh(f->field_hash_table[h].head_hash_offset);
3c1668da
LP
1349 while (p > 0) {
1350 Object *o;
1351
1352 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1353 if (r < 0)
1354 return r;
1355
1356 if (le64toh(o->field.hash) == hash &&
1357 le64toh(o->object.size) == osize &&
1358 memcmp(o->field.payload, field, size) == 0) {
1359
1360 if (ret)
1361 *ret = o;
f4474e00
LP
1362 if (ret_offset)
1363 *ret_offset = p;
3c1668da
LP
1364
1365 return 1;
1366 }
1367
0dbe57ee
LP
1368 r = next_hash_offset(
1369 f,
1370 &p,
1371 &o->field.next_hash_offset,
1372 &depth,
1373 JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) ? &f->header->field_hash_chain_depth : NULL);
1374 if (r < 0)
1375 return r;
3c1668da
LP
1376 }
1377
1378 return 0;
1379}
1380
4ce534f4
LP
1381uint64_t journal_file_hash_data(
1382 JournalFile *f,
1383 const void *data,
1384 size_t sz) {
1385
1386 assert(f);
1387 assert(data || sz == 0);
1388
1389 /* We try to unify our codebase on siphash, hence new-styled journal files utilizing the keyed hash
1390 * function use siphash. Old journal files use the Jenkins hash. */
1391
1392 if (JOURNAL_HEADER_KEYED_HASH(f->header))
1393 return siphash24(data, sz, f->header->file_id.bytes);
1394
1395 return jenkins_hash64(data, sz);
1396}
1397
3c1668da
LP
1398int journal_file_find_field_object(
1399 JournalFile *f,
1400 const void *field, uint64_t size,
f4474e00 1401 Object **ret, uint64_t *ret_offset) {
3c1668da 1402
3c1668da
LP
1403 assert(f);
1404 assert(field && size > 0);
1405
f4474e00
LP
1406 return journal_file_find_field_object_with_hash(
1407 f,
4ce534f4
LP
1408 field, size,
1409 journal_file_hash_data(f, field, size),
f4474e00 1410 ret, ret_offset);
3c1668da
LP
1411}
1412
de190aef
LP
1413int journal_file_find_data_object_with_hash(
1414 JournalFile *f,
1415 const void *data, uint64_t size, uint64_t hash,
f4474e00 1416 Object **ret, uint64_t *ret_offset) {
48496df6 1417
0dbe57ee 1418 uint64_t p, osize, h, m, depth = 0;
cec736d2
LP
1419 int r;
1420
1421 assert(f);
c88cc6af 1422 assert(f->header);
cec736d2
LP
1423 assert(data || size == 0);
1424
dade37d4
LP
1425 /* If there's no data hash table, then there's no entry. */
1426 if (le64toh(f->header->data_hash_table_size) <= 0)
1427 return 0;
1428
1429 /* Map the data hash table, if it isn't mapped yet. */
1430 r = journal_file_map_data_hash_table(f);
1431 if (r < 0)
1432 return r;
1433
cec736d2
LP
1434 osize = offsetof(Object, data.payload) + size;
1435
893e0f8f 1436 m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
805d1486 1437 if (m <= 0)
bc85bfee
LP
1438 return -EBADMSG;
1439
805d1486 1440 h = hash % m;
de190aef 1441 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1442
de190aef
LP
1443 while (p > 0) {
1444 Object *o;
cec736d2 1445
de190aef 1446 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1447 if (r < 0)
1448 return r;
1449
807e17f0 1450 if (le64toh(o->data.hash) != hash)
85a131e8 1451 goto next;
807e17f0 1452
d89c8fdf 1453 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
d80b051c 1454#if HAVE_COMPRESSION
fa1c4b51 1455 uint64_t l;
a7f7d1bd 1456 size_t rsize = 0;
cec736d2 1457
893e0f8f 1458 l = le64toh(READ_NOW(o->object.size));
807e17f0 1459 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1460 return -EBADMSG;
1461
807e17f0
LP
1462 l -= offsetof(Object, data.payload);
1463
d89c8fdf
ZJS
1464 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1465 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1466 if (r < 0)
1467 return r;
807e17f0 1468
b785c858 1469 if (rsize == size &&
807e17f0
LP
1470 memcmp(f->compress_buffer, data, size) == 0) {
1471
1472 if (ret)
1473 *ret = o;
1474
f4474e00
LP
1475 if (ret_offset)
1476 *ret_offset = p;
807e17f0
LP
1477
1478 return 1;
1479 }
3b1a55e1
ZJS
1480#else
1481 return -EPROTONOSUPPORT;
1482#endif
807e17f0
LP
1483 } else if (le64toh(o->object.size) == osize &&
1484 memcmp(o->data.payload, data, size) == 0) {
1485
cec736d2
LP
1486 if (ret)
1487 *ret = o;
1488
f4474e00
LP
1489 if (ret_offset)
1490 *ret_offset = p;
cec736d2 1491
de190aef 1492 return 1;
cec736d2
LP
1493 }
1494
85a131e8 1495 next:
0dbe57ee
LP
1496 r = next_hash_offset(
1497 f,
1498 &p,
1499 &o->data.next_hash_offset,
1500 &depth,
1501 JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) ? &f->header->data_hash_chain_depth : NULL);
1502 if (r < 0)
1503 return r;
cec736d2
LP
1504 }
1505
de190aef
LP
1506 return 0;
1507}
1508
1509int journal_file_find_data_object(
1510 JournalFile *f,
1511 const void *data, uint64_t size,
f4474e00 1512 Object **ret, uint64_t *ret_offset) {
de190aef 1513
de190aef
LP
1514 assert(f);
1515 assert(data || size == 0);
1516
f4474e00
LP
1517 return journal_file_find_data_object_with_hash(
1518 f,
4ce534f4
LP
1519 data, size,
1520 journal_file_hash_data(f, data, size),
f4474e00 1521 ret, ret_offset);
de190aef
LP
1522}
1523
adce225a
YW
/* Checks whether the 'l' bytes at 'p' form an acceptable journal field name.
 *
 * This roughly enforces the POSIX syntax recommendations for environment variable names, plus a few
 * extra requirements, see:
 *
 *     http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html
 *
 * A name is accepted if it is 1…64 characters long, consists of A-Z, 0-9 and '_' only and does not
 * start with a digit. Names starting with '_' are considered protected and only accepted if
 * 'allow_protected' is true. Pass (size_t) -1 as 'l' to have the length determined with strlen(). */
bool journal_field_valid(const char *p, size_t l, bool allow_protected) {

        if (l == (size_t) -1)
                l = strlen(p);

        /* Neither empty names nor names longer than 64 characters */
        if (l == 0 || l > 64)
                return false;

        /* A leading underscore marks protected fields */
        if (p[0] == '_' && !allow_protected)
                return false;

        /* The first character may not be a digit */
        if (p[0] >= '0' && p[0] <= '9')
                return false;

        /* Everything must be from the set A-Z, 0-9, '_' */
        for (size_t i = 0; i < l; i++) {
                char c = p[i];
                bool is_upper = c >= 'A' && c <= 'Z';
                bool is_digit = c >= '0' && c <= '9';

                if (!is_upper && !is_digit && c != '_')
                        return false;
        }

        return true;
}
1561
3c1668da
LP
1562static int journal_file_append_field(
1563 JournalFile *f,
1564 const void *field, uint64_t size,
f4474e00 1565 Object **ret, uint64_t *ret_offset) {
3c1668da
LP
1566
1567 uint64_t hash, p;
1568 uint64_t osize;
1569 Object *o;
1570 int r;
1571
1572 assert(f);
1573 assert(field && size > 0);
1574
f2bd0320
YW
1575 if (!journal_field_valid(field, size, true))
1576 return -EBADMSG;
1577
4ce534f4 1578 hash = journal_file_hash_data(f, field, size);
3c1668da
LP
1579
1580 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1581 if (r < 0)
1582 return r;
1583 else if (r > 0) {
1584
1585 if (ret)
1586 *ret = o;
1587
f4474e00
LP
1588 if (ret_offset)
1589 *ret_offset = p;
3c1668da
LP
1590
1591 return 0;
1592 }
1593
1594 osize = offsetof(Object, field.payload) + size;
1595 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1596 if (r < 0)
1597 return r;
3c1668da
LP
1598
1599 o->field.hash = htole64(hash);
1600 memcpy(o->field.payload, field, size);
1601
1602 r = journal_file_link_field(f, o, p, hash);
1603 if (r < 0)
1604 return r;
1605
1606 /* The linking might have altered the window, so let's
1607 * refresh our pointer */
1608 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1609 if (r < 0)
1610 return r;
1611
349cc4a5 1612#if HAVE_GCRYPT
3c1668da
LP
1613 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1614 if (r < 0)
1615 return r;
1616#endif
1617
1618 if (ret)
1619 *ret = o;
1620
f4474e00
LP
1621 if (ret_offset)
1622 *ret_offset = p;
3c1668da
LP
1623
1624 return 0;
1625}
1626
48496df6
LP
1627static int journal_file_append_data(
1628 JournalFile *f,
1629 const void *data, uint64_t size,
f4474e00 1630 Object **ret, uint64_t *ret_offset) {
48496df6 1631
de190aef
LP
1632 uint64_t hash, p;
1633 uint64_t osize;
1634 Object *o;
d89c8fdf 1635 int r, compression = 0;
3c1668da 1636 const void *eq;
de190aef
LP
1637
1638 assert(f);
1639 assert(data || size == 0);
1640
4ce534f4 1641 hash = journal_file_hash_data(f, data, size);
de190aef
LP
1642
1643 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1644 if (r < 0)
1645 return r;
0240c603 1646 if (r > 0) {
de190aef
LP
1647
1648 if (ret)
1649 *ret = o;
1650
f4474e00
LP
1651 if (ret_offset)
1652 *ret_offset = p;
de190aef
LP
1653
1654 return 0;
1655 }
1656
1657 osize = offsetof(Object, data.payload) + size;
1658 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1659 if (r < 0)
1660 return r;
1661
cec736d2 1662 o->data.hash = htole64(hash);
807e17f0 1663
d80b051c 1664#if HAVE_COMPRESSION
57850536 1665 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
a7f7d1bd 1666 size_t rsize = 0;
807e17f0 1667
5d6f46b6 1668 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1669
d1afbcd2 1670 if (compression >= 0) {
807e17f0 1671 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1672 o->object.flags |= compression;
807e17f0 1673
fa1c4b51 1674 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1675 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1676 } else
1677 /* Compression didn't work, we don't really care why, let's continue without compression */
1678 compression = 0;
807e17f0
LP
1679 }
1680#endif
1681
75f32f04
ZJS
1682 if (compression == 0)
1683 memcpy_safe(o->data.payload, data, size);
cec736d2 1684
de190aef 1685 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1686 if (r < 0)
1687 return r;
1688
349cc4a5 1689#if HAVE_GCRYPT
33685a5a
FB
1690 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1691 if (r < 0)
1692 return r;
1693#endif
1694
48496df6
LP
1695 /* The linking might have altered the window, so let's
1696 * refresh our pointer */
1697 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1698 if (r < 0)
1699 return r;
1700
08c6f819
SL
1701 if (!data)
1702 eq = NULL;
1703 else
1704 eq = memchr(data, '=', size);
3c1668da 1705 if (eq && eq > data) {
748db592 1706 Object *fo = NULL;
3c1668da 1707 uint64_t fp;
3c1668da
LP
1708
1709 /* Create field object ... */
1710 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1711 if (r < 0)
1712 return r;
1713
1714 /* ... and link it in. */
1715 o->data.next_field_offset = fo->field.head_data_offset;
1716 fo->field.head_data_offset = le64toh(p);
1717 }
1718
cec736d2
LP
1719 if (ret)
1720 *ret = o;
1721
f4474e00
LP
1722 if (ret_offset)
1723 *ret_offset = p;
cec736d2
LP
1724
1725 return 0;
1726}
1727
1728uint64_t journal_file_entry_n_items(Object *o) {
893e0f8f 1729 uint64_t sz;
cec736d2 1730 assert(o);
b588975f
LP
1731
1732 if (o->object.type != OBJECT_ENTRY)
1733 return 0;
cec736d2 1734
893e0f8f
LP
1735 sz = le64toh(READ_NOW(o->object.size));
1736 if (sz < offsetof(Object, entry.items))
1737 return 0;
1738
1739 return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem);
cec736d2
LP
1740}
1741
0284adc6 1742uint64_t journal_file_entry_array_n_items(Object *o) {
893e0f8f
LP
1743 uint64_t sz;
1744
de190aef 1745 assert(o);
b588975f
LP
1746
1747 if (o->object.type != OBJECT_ENTRY_ARRAY)
1748 return 0;
de190aef 1749
893e0f8f
LP
1750 sz = le64toh(READ_NOW(o->object.size));
1751 if (sz < offsetof(Object, entry_array.items))
1752 return 0;
1753
1754 return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
de190aef
LP
1755}
1756
fb9a24b6 1757uint64_t journal_file_hash_table_n_items(Object *o) {
893e0f8f
LP
1758 uint64_t sz;
1759
fb9a24b6 1760 assert(o);
b588975f 1761
ec2ce0c5 1762 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1763 return 0;
fb9a24b6 1764
893e0f8f
LP
1765 sz = le64toh(READ_NOW(o->object.size));
1766 if (sz < offsetof(Object, hash_table.items))
1767 return 0;
1768
1769 return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem);
fb9a24b6
LP
1770}
1771
de190aef 1772static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1773 le64_t *first,
1774 le64_t *idx,
de190aef 1775 uint64_t p) {
cec736d2 1776 int r;
de190aef
LP
1777 uint64_t n = 0, ap = 0, q, i, a, hidx;
1778 Object *o;
1779
cec736d2 1780 assert(f);
c88cc6af 1781 assert(f->header);
de190aef
LP
1782 assert(first);
1783 assert(idx);
1784 assert(p > 0);
cec736d2 1785
de190aef 1786 a = le64toh(*first);
893e0f8f 1787 i = hidx = le64toh(READ_NOW(*idx));
de190aef
LP
1788 while (a > 0) {
1789
1790 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1791 if (r < 0)
1792 return r;
cec736d2 1793
de190aef
LP
1794 n = journal_file_entry_array_n_items(o);
1795 if (i < n) {
1796 o->entry_array.items[i] = htole64(p);
1797 *idx = htole64(hidx + 1);
1798 return 0;
1799 }
cec736d2 1800
de190aef
LP
1801 i -= n;
1802 ap = a;
1803 a = le64toh(o->entry_array.next_entry_array_offset);
1804 }
1805
1806 if (hidx > n)
1807 n = (hidx+1) * 2;
1808 else
1809 n = n * 2;
1810
1811 if (n < 4)
1812 n = 4;
1813
1814 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1815 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1816 &o, &q);
cec736d2
LP
1817 if (r < 0)
1818 return r;
1819
349cc4a5 1820#if HAVE_GCRYPT
5996c7c2 1821 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1822 if (r < 0)
1823 return r;
feb12d3e 1824#endif
b0af6f41 1825
de190aef 1826 o->entry_array.items[i] = htole64(p);
cec736d2 1827
de190aef 1828 if (ap == 0)
7be3aa17 1829 *first = htole64(q);
cec736d2 1830 else {
de190aef 1831 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1832 if (r < 0)
1833 return r;
1834
de190aef
LP
1835 o->entry_array.next_entry_array_offset = htole64(q);
1836 }
cec736d2 1837
2dee23eb
LP
1838 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1839 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1840
de190aef
LP
1841 *idx = htole64(hidx + 1);
1842
1843 return 0;
1844}
cec736d2 1845
de190aef 1846static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1847 le64_t *extra,
1848 le64_t *first,
1849 le64_t *idx,
de190aef
LP
1850 uint64_t p) {
1851
893e0f8f 1852 uint64_t hidx;
de190aef
LP
1853 int r;
1854
1855 assert(f);
1856 assert(extra);
1857 assert(first);
1858 assert(idx);
1859 assert(p > 0);
1860
893e0f8f
LP
1861 hidx = le64toh(READ_NOW(*idx));
1862 if (hidx == UINT64_MAX)
1863 return -EBADMSG;
1864 if (hidx == 0)
de190aef
LP
1865 *extra = htole64(p);
1866 else {
4fd052ae 1867 le64_t i;
de190aef 1868
893e0f8f 1869 i = htole64(hidx - 1);
de190aef
LP
1870 r = link_entry_into_array(f, first, &i, p);
1871 if (r < 0)
1872 return r;
cec736d2
LP
1873 }
1874
893e0f8f 1875 *idx = htole64(hidx + 1);
de190aef
LP
1876 return 0;
1877}
1878
1879static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1880 uint64_t p;
1881 int r;
bfbd5be0 1882
de190aef
LP
1883 assert(f);
1884 assert(o);
1885 assert(offset > 0);
1886
1887 p = le64toh(o->entry.items[i].object_offset);
de190aef 1888 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1889 if (r < 0)
1890 return r;
1891
de190aef
LP
1892 return link_entry_into_array_plus_one(f,
1893 &o->data.entry_offset,
1894 &o->data.entry_array_offset,
1895 &o->data.n_entries,
1896 offset);
cec736d2
LP
1897}
1898
1899static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1900 uint64_t n, i;
cec736d2
LP
1901 int r;
1902
1903 assert(f);
c88cc6af 1904 assert(f->header);
cec736d2
LP
1905 assert(o);
1906 assert(offset > 0);
b588975f
LP
1907
1908 if (o->object.type != OBJECT_ENTRY)
1909 return -EINVAL;
cec736d2 1910
b788cc23
LP
1911 __sync_synchronize();
1912
cec736d2 1913 /* Link up the entry itself */
de190aef
LP
1914 r = link_entry_into_array(f,
1915 &f->header->entry_array_offset,
1916 &f->header->n_entries,
1917 offset);
1918 if (r < 0)
1919 return r;
cec736d2 1920
507f22bd 1921 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1922
de190aef 1923 if (f->header->head_entry_realtime == 0)
0ac38b70 1924 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1925
0ac38b70 1926 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1927 f->header->tail_entry_monotonic = o->entry.monotonic;
1928
cec736d2
LP
1929 /* Link up the items */
1930 n = journal_file_entry_n_items(o);
1931 for (i = 0; i < n; i++) {
1932 r = journal_file_link_entry_item(f, o, offset, i);
1933 if (r < 0)
1934 return r;
1935 }
1936
cec736d2
LP
1937 return 0;
1938}
1939
1940static int journal_file_append_entry_internal(
1941 JournalFile *f,
1942 const dual_timestamp *ts,
d180c349 1943 const sd_id128_t *boot_id,
cec736d2
LP
1944 uint64_t xor_hash,
1945 const EntryItem items[], unsigned n_items,
de190aef 1946 uint64_t *seqnum,
f4474e00 1947 Object **ret, uint64_t *ret_offset) {
cec736d2
LP
1948 uint64_t np;
1949 uint64_t osize;
1950 Object *o;
1951 int r;
1952
1953 assert(f);
c88cc6af 1954 assert(f->header);
cec736d2 1955 assert(items || n_items == 0);
de190aef 1956 assert(ts);
cec736d2
LP
1957
1958 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1959
de190aef 1960 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1961 if (r < 0)
1962 return r;
1963
d98cc1f2 1964 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1965 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1966 o->entry.realtime = htole64(ts->realtime);
1967 o->entry.monotonic = htole64(ts->monotonic);
cec736d2 1968 o->entry.xor_hash = htole64(xor_hash);
924426a7
CM
1969 if (boot_id)
1970 f->header->boot_id = *boot_id;
1971 o->entry.boot_id = f->header->boot_id;
cec736d2 1972
349cc4a5 1973#if HAVE_GCRYPT
5996c7c2 1974 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1975 if (r < 0)
1976 return r;
feb12d3e 1977#endif
b0af6f41 1978
cec736d2
LP
1979 r = journal_file_link_entry(f, o, np);
1980 if (r < 0)
1981 return r;
1982
1983 if (ret)
1984 *ret = o;
1985
f4474e00
LP
1986 if (ret_offset)
1987 *ret_offset = np;
cec736d2
LP
1988
1989 return 0;
1990}
1991
cf244689 1992void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1993 assert(f);
1994
c5236850
DT
1995 if (f->fd < 0)
1996 return;
1997
50f20cfd
LP
1998 /* inotify() does not receive IN_MODIFY events from file
1999 * accesses done via mmap(). After each access we hence
2000 * trigger IN_MODIFY by truncating the journal file to its
2001 * current size which triggers IN_MODIFY. */
2002
bc85bfee
LP
2003 __sync_synchronize();
2004
50f20cfd 2005 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 2006 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
2007}
2008
7a24f3bf
VC
2009static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
2010 assert(userdata);
2011
2012 journal_file_post_change(userdata);
2013
2014 return 1;
2015}
2016
2017static void schedule_post_change(JournalFile *f) {
b6cdfbe5 2018 int r;
7a24f3bf
VC
2019
2020 assert(f);
2021 assert(f->post_change_timer);
2022
b6cdfbe5 2023 r = sd_event_source_get_enabled(f->post_change_timer, NULL);
7a24f3bf 2024 if (r < 0) {
e167d7fd
LP
2025 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
2026 goto fail;
7a24f3bf 2027 }
b6cdfbe5 2028 if (r > 0)
7a24f3bf
VC
2029 return;
2030
39cf0351 2031 r = sd_event_source_set_time_relative(f->post_change_timer, f->post_change_timer_period);
7a24f3bf 2032 if (r < 0) {
e167d7fd
LP
2033 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
2034 goto fail;
7a24f3bf
VC
2035 }
2036
ca5d90d4 2037 r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
7a24f3bf 2038 if (r < 0) {
e167d7fd
LP
2039 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
2040 goto fail;
7a24f3bf 2041 }
e167d7fd
LP
2042
2043 return;
2044
2045fail:
2046 /* On failure, let's simply post the change immediately. */
2047 journal_file_post_change(f);
7a24f3bf
VC
2048}
2049
2050/* Enable coalesced change posting in a timer on the provided sd_event instance */
2051int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
2052 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
2053 int r;
2054
2055 assert(f);
2056 assert_return(!f->post_change_timer, -EINVAL);
2057 assert(e);
2058 assert(t);
2059
2060 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
2061 if (r < 0)
2062 return r;
2063
2064 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
2065 if (r < 0)
2066 return r;
2067
1cc6c93a 2068 f->post_change_timer = TAKE_PTR(timer);
7a24f3bf
VC
2069 f->post_change_timer_period = t;
2070
2071 return r;
2072}
2073
93bab288
YW
2074static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
2075 return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
1f2da9ec
LP
2076}
2077
d180c349
ZJS
2078int journal_file_append_entry(
2079 JournalFile *f,
2080 const dual_timestamp *ts,
2081 const sd_id128_t *boot_id,
2082 const struct iovec iovec[], unsigned n_iovec,
2083 uint64_t *seqnum,
f4474e00 2084 Object **ret, uint64_t *ret_offset) {
d180c349 2085
cec736d2
LP
2086 unsigned i;
2087 EntryItem *items;
2088 int r;
2089 uint64_t xor_hash = 0;
de190aef 2090 struct dual_timestamp _ts;
cec736d2
LP
2091
2092 assert(f);
c88cc6af 2093 assert(f->header);
cec736d2
LP
2094 assert(iovec || n_iovec == 0);
2095
c6273953 2096 if (ts) {
baaa35ad
ZJS
2097 if (!VALID_REALTIME(ts->realtime))
2098 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2099 "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
2100 ts->realtime);
2101 if (!VALID_MONOTONIC(ts->monotonic))
2102 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2103 "Invalid monotomic timestamp %" PRIu64 ", refusing entry.",
2104 ts->monotonic);
c6273953 2105 } else {
de190aef
LP
2106 dual_timestamp_get(&_ts);
2107 ts = &_ts;
2108 }
2109
349cc4a5 2110#if HAVE_GCRYPT
7560fffc
LP
2111 r = journal_file_maybe_append_tag(f, ts->realtime);
2112 if (r < 0)
2113 return r;
feb12d3e 2114#endif
7560fffc 2115
64825d3c 2116 /* alloca() can't take 0, hence let's allocate at least one */
cf409d15 2117 items = newa(EntryItem, MAX(1u, n_iovec));
cec736d2
LP
2118
2119 for (i = 0; i < n_iovec; i++) {
2120 uint64_t p;
2121 Object *o;
2122
2123 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2124 if (r < 0)
cf244689 2125 return r;
cec736d2 2126
4ce534f4
LP
2127 /* When calculating the XOR hash field, we need to take special care if the "keyed-hash"
2128 * journal file flag is on. We use the XOR hash field to quickly determine the identity of a
2129 * specific record, and give records with otherwise identical position (i.e. match in seqno,
2130 * timestamp, …) a stable ordering. But for that we can't have it that the hash of the
2131 * objects in each file is different since they are keyed. Hence let's calculate the Jenkins
2132 * hash here for that. This also has the benefit that cursors for old and new journal files
2133 * are completely identical (they include the XOR hash after all). For classic Jenkins-hash
2134 * files things are easier, we can just take the value from the stored record directly. */
2135
2136 if (JOURNAL_HEADER_KEYED_HASH(f->header))
2137 xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len);
2138 else
2139 xor_hash ^= le64toh(o->data.hash);
2140
cec736d2 2141 items[i].object_offset = htole64(p);
de7b95cd 2142 items[i].hash = o->data.hash;
cec736d2
LP
2143 }
2144
1f2da9ec
LP
2145 /* Order by the position on disk, in order to improve seek
2146 * times for rotating media. */
93bab288 2147 typesafe_qsort(items, n_iovec, entry_item_cmp);
1f2da9ec 2148
f4474e00 2149 r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, ret_offset);
cec736d2 2150
fa6ac760
LP
2151 /* If the memory mapping triggered a SIGBUS then we return an
2152 * IO error and ignore the error code passed down to us, since
2153 * it is very likely just an effect of a nullified replacement
2154 * mapping page */
2155
be7cdd8e 2156 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
2157 r = -EIO;
2158
7a24f3bf
VC
2159 if (f->post_change_timer)
2160 schedule_post_change(f);
2161 else
2162 journal_file_post_change(f);
50f20cfd 2163
cec736d2
LP
2164 return r;
2165}
2166
/* One cached position inside a chain of entry arrays, keyed by the offset of the chain's first array. */
typedef struct ChainCacheItem {
        /* Offset of the array the chain starts with */
        uint64_t first;
        /* Offset of the array that is cached */
        uint64_t array;
        /* First item stored in the cached array */
        uint64_t begin;
        /* Number of items in all arrays of the chain preceding the cached one */
        uint64_t total;
        /* Index looked at last, used to improve locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
2174
2175static void chain_cache_put(
4743015d 2176 OrderedHashmap *h,
a4bcff5b
LP
2177 ChainCacheItem *ci,
2178 uint64_t first,
2179 uint64_t array,
2180 uint64_t begin,
f268980d
LP
2181 uint64_t total,
2182 uint64_t last_index) {
a4bcff5b
LP
2183
2184 if (!ci) {
34741aa3
LP
2185 /* If the chain item to cache for this chain is the
2186 * first one it's not worth caching anything */
2187 if (array == first)
2188 return;
2189
29433089 2190 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2191 ci = ordered_hashmap_steal_first(h);
29433089
LP
2192 assert(ci);
2193 } else {
a4bcff5b
LP
2194 ci = new(ChainCacheItem, 1);
2195 if (!ci)
2196 return;
2197 }
2198
2199 ci->first = first;
2200
4743015d 2201 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2202 free(ci);
2203 return;
2204 }
2205 } else
2206 assert(ci->first == first);
2207
2208 ci->array = array;
2209 ci->begin = begin;
2210 ci->total = total;
f268980d 2211 ci->last_index = last_index;
a4bcff5b
LP
2212}
2213
f268980d
LP
2214static int generic_array_get(
2215 JournalFile *f,
2216 uint64_t first,
2217 uint64_t i,
f4474e00 2218 Object **ret, uint64_t *ret_offset) {
de190aef 2219
cec736d2 2220 Object *o;
a4bcff5b 2221 uint64_t p = 0, a, t = 0;
cec736d2 2222 int r;
a4bcff5b 2223 ChainCacheItem *ci;
cec736d2
LP
2224
2225 assert(f);
2226
de190aef 2227 a = first;
a4bcff5b
LP
2228
2229 /* Try the chain cache first */
4743015d 2230 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2231 if (ci && i > ci->total) {
2232 a = ci->array;
2233 i -= ci->total;
2234 t = ci->total;
2235 }
2236
de190aef 2237 while (a > 0) {
a4bcff5b 2238 uint64_t k;
cec736d2 2239
de190aef
LP
2240 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2241 if (r < 0)
2242 return r;
cec736d2 2243
a4bcff5b
LP
2244 k = journal_file_entry_array_n_items(o);
2245 if (i < k) {
de190aef 2246 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2247 goto found;
cec736d2
LP
2248 }
2249
a4bcff5b
LP
2250 i -= k;
2251 t += k;
de190aef
LP
2252 a = le64toh(o->entry_array.next_entry_array_offset);
2253 }
2254
a4bcff5b
LP
2255 return 0;
2256
2257found:
2258 /* Let's cache this item for the next invocation */
af13a6b0 2259 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2260
2261 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2262 if (r < 0)
2263 return r;
2264
2265 if (ret)
2266 *ret = o;
2267
f4474e00
LP
2268 if (ret_offset)
2269 *ret_offset = p;
de190aef
LP
2270
2271 return 1;
2272}
2273
f268980d
LP
2274static int generic_array_get_plus_one(
2275 JournalFile *f,
2276 uint64_t extra,
2277 uint64_t first,
2278 uint64_t i,
f4474e00 2279 Object **ret, uint64_t *ret_offset) {
de190aef
LP
2280
2281 Object *o;
2282
2283 assert(f);
2284
2285 if (i == 0) {
2286 int r;
2287
2288 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2289 if (r < 0)
2290 return r;
2291
de190aef
LP
2292 if (ret)
2293 *ret = o;
cec736d2 2294
f4474e00
LP
2295 if (ret_offset)
2296 *ret_offset = extra;
cec736d2 2297
de190aef 2298 return 1;
cec736d2
LP
2299 }
2300
f4474e00 2301 return generic_array_get(f, first, i-1, ret, ret_offset);
de190aef 2302}
cec736d2 2303
de190aef
LP
/* Results of the test_object() callbacks used when bisecting */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object's value is smaller than the needle */
        TEST_RIGHT = 2, /* the tested object's value is larger than the needle */
};
cec736d2 2309
f268980d
LP
2310static int generic_array_bisect(
2311 JournalFile *f,
2312 uint64_t first,
2313 uint64_t n,
2314 uint64_t needle,
2315 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2316 direction_t direction,
2317 Object **ret,
f4474e00
LP
2318 uint64_t *ret_offset,
2319 uint64_t *ret_idx) {
f268980d
LP
2320
2321 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
2322 bool subtract_one = false;
2323 Object *o, *array = NULL;
2324 int r;
a4bcff5b 2325 ChainCacheItem *ci;
cec736d2 2326
de190aef
LP
2327 assert(f);
2328 assert(test_object);
cec736d2 2329
a4bcff5b 2330 /* Start with the first array in the chain */
de190aef 2331 a = first;
a4bcff5b 2332
4743015d 2333 ci = ordered_hashmap_get(f->chain_cache, &first);
96d4d024 2334 if (ci && n > ci->total && ci->begin != 0) {
a4bcff5b
LP
2335 /* Ah, we have iterated this bisection array chain
2336 * previously! Let's see if we can skip ahead in the
2337 * chain, as far as the last time. But we can't jump
2338 * backwards in the chain, so let's check that
2339 * first. */
2340
2341 r = test_object(f, ci->begin, needle);
2342 if (r < 0)
2343 return r;
2344
2345 if (r == TEST_LEFT) {
f268980d 2346 /* OK, what we are looking for is right of the
a4bcff5b
LP
2347 * begin of this EntryArray, so let's jump
2348 * straight to previously cached array in the
2349 * chain */
2350
2351 a = ci->array;
2352 n -= ci->total;
2353 t = ci->total;
f268980d 2354 last_index = ci->last_index;
a4bcff5b
LP
2355 }
2356 }
2357
de190aef
LP
2358 while (a > 0) {
2359 uint64_t left, right, k, lp;
2360
2361 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
2362 if (r < 0)
2363 return r;
2364
de190aef
LP
2365 k = journal_file_entry_array_n_items(array);
2366 right = MIN(k, n);
2367 if (right <= 0)
2368 return 0;
cec736d2 2369
de190aef
LP
2370 i = right - 1;
2371 lp = p = le64toh(array->entry_array.items[i]);
2372 if (p <= 0)
bee6a291
LP
2373 r = -EBADMSG;
2374 else
2375 r = test_object(f, p, needle);
2376 if (r == -EBADMSG) {
2377 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2378 n = i;
2379 continue;
2380 }
de190aef
LP
2381 if (r < 0)
2382 return r;
cec736d2 2383
de190aef
LP
2384 if (r == TEST_FOUND)
2385 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2386
2387 if (r == TEST_RIGHT) {
2388 left = 0;
2389 right -= 1;
f268980d
LP
2390
2391 if (last_index != (uint64_t) -1) {
2392 assert(last_index <= right);
2393
2394 /* If we cached the last index we
2395 * looked at, let's try to not to jump
2396 * too wildly around and see if we can
2397 * limit the range to look at early to
2398 * the immediate neighbors of the last
2399 * index we looked at. */
2400
2401 if (last_index > 0) {
2402 uint64_t x = last_index - 1;
2403
2404 p = le64toh(array->entry_array.items[x]);
2405 if (p <= 0)
2406 return -EBADMSG;
2407
2408 r = test_object(f, p, needle);
2409 if (r < 0)
2410 return r;
2411
2412 if (r == TEST_FOUND)
2413 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2414
2415 if (r == TEST_RIGHT)
2416 right = x;
2417 else
2418 left = x + 1;
2419 }
2420
2421 if (last_index < right) {
2422 uint64_t y = last_index + 1;
2423
2424 p = le64toh(array->entry_array.items[y]);
2425 if (p <= 0)
2426 return -EBADMSG;
2427
2428 r = test_object(f, p, needle);
2429 if (r < 0)
2430 return r;
2431
2432 if (r == TEST_FOUND)
2433 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2434
2435 if (r == TEST_RIGHT)
2436 right = y;
2437 else
2438 left = y + 1;
2439 }
f268980d
LP
2440 }
2441
de190aef
LP
2442 for (;;) {
2443 if (left == right) {
2444 if (direction == DIRECTION_UP)
2445 subtract_one = true;
2446
2447 i = left;
2448 goto found;
2449 }
2450
2451 assert(left < right);
de190aef 2452 i = (left + right) / 2;
f268980d 2453
de190aef
LP
2454 p = le64toh(array->entry_array.items[i]);
2455 if (p <= 0)
bee6a291
LP
2456 r = -EBADMSG;
2457 else
2458 r = test_object(f, p, needle);
2459 if (r == -EBADMSG) {
2460 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2461 right = n = i;
2462 continue;
2463 }
de190aef
LP
2464 if (r < 0)
2465 return r;
cec736d2 2466
de190aef
LP
2467 if (r == TEST_FOUND)
2468 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2469
2470 if (r == TEST_RIGHT)
2471 right = i;
2472 else
2473 left = i + 1;
2474 }
2475 }
2476
2173cbf8 2477 if (k >= n) {
cbdca852
LP
2478 if (direction == DIRECTION_UP) {
2479 i = n;
2480 subtract_one = true;
2481 goto found;
2482 }
2483
cec736d2 2484 return 0;
cbdca852 2485 }
cec736d2 2486
de190aef
LP
2487 last_p = lp;
2488
2489 n -= k;
2490 t += k;
f268980d 2491 last_index = (uint64_t) -1;
de190aef 2492 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2493 }
2494
2495 return 0;
de190aef
LP
2496
2497found:
2498 if (subtract_one && t == 0 && i == 0)
2499 return 0;
2500
a4bcff5b 2501 /* Let's cache this item for the next invocation */
af13a6b0 2502 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 2503
de190aef
LP
2504 if (subtract_one && i == 0)
2505 p = last_p;
2506 else if (subtract_one)
2507 p = le64toh(array->entry_array.items[i-1]);
2508 else
2509 p = le64toh(array->entry_array.items[i]);
2510
2511 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2512 if (r < 0)
2513 return r;
2514
2515 if (ret)
2516 *ret = o;
2517
f4474e00
LP
2518 if (ret_offset)
2519 *ret_offset = p;
de190aef 2520
f4474e00
LP
2521 if (ret_idx)
2522 *ret_idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2523
2524 return 1;
cec736d2
LP
2525}
2526
f268980d
LP
2527static int generic_array_bisect_plus_one(
2528 JournalFile *f,
2529 uint64_t extra,
2530 uint64_t first,
2531 uint64_t n,
2532 uint64_t needle,
2533 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2534 direction_t direction,
2535 Object **ret,
f4474e00
LP
2536 uint64_t *ret_offset,
2537 uint64_t *ret_idx) {
de190aef 2538
cec736d2 2539 int r;
cbdca852
LP
2540 bool step_back = false;
2541 Object *o;
cec736d2
LP
2542
2543 assert(f);
de190aef 2544 assert(test_object);
cec736d2 2545
de190aef
LP
2546 if (n <= 0)
2547 return 0;
cec736d2 2548
de190aef
LP
2549 /* This bisects the array in object 'first', but first checks
2550 * an extra */
de190aef
LP
2551 r = test_object(f, extra, needle);
2552 if (r < 0)
2553 return r;
a536e261
LP
2554
2555 if (r == TEST_FOUND)
2556 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2557
cbdca852
LP
2558 /* if we are looking with DIRECTION_UP then we need to first
2559 see if in the actual array there is a matching entry, and
2560 return the last one of that. But if there isn't any we need
2561 to return this one. Hence remember this, and return it
2562 below. */
2563 if (r == TEST_LEFT)
2564 step_back = direction == DIRECTION_UP;
de190aef 2565
cbdca852
LP
2566 if (r == TEST_RIGHT) {
2567 if (direction == DIRECTION_DOWN)
2568 goto found;
2569 else
2570 return 0;
a536e261 2571 }
cec736d2 2572
f4474e00 2573 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, ret_offset, ret_idx);
de190aef 2574
cbdca852
LP
2575 if (r == 0 && step_back)
2576 goto found;
2577
f4474e00
LP
2578 if (r > 0 && ret_idx)
2579 (*ret_idx)++;
de190aef
LP
2580
2581 return r;
cbdca852
LP
2582
2583found:
2584 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2585 if (r < 0)
2586 return r;
2587
2588 if (ret)
2589 *ret = o;
2590
f4474e00
LP
2591 if (ret_offset)
2592 *ret_offset = extra;
cbdca852 2593
f4474e00
LP
2594 if (ret_idx)
2595 *ret_idx = 0;
cbdca852
LP
2596
2597 return 1;
2598}
2599
44a6b1b6 2600_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2601 assert(f);
2602 assert(p > 0);
2603
2604 if (p == needle)
2605 return TEST_FOUND;
2606 else if (p < needle)
2607 return TEST_LEFT;
2608 else
2609 return TEST_RIGHT;
2610}
2611
de190aef 2612static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
893e0f8f 2613 uint64_t sq;
de190aef
LP
2614 Object *o;
2615 int r;
2616
2617 assert(f);
2618 assert(p > 0);
2619
2620 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2621 if (r < 0)
2622 return r;
2623
893e0f8f
LP
2624 sq = le64toh(READ_NOW(o->entry.seqnum));
2625 if (sq == needle)
de190aef 2626 return TEST_FOUND;
893e0f8f 2627 else if (sq < needle)
de190aef
LP
2628 return TEST_LEFT;
2629 else
2630 return TEST_RIGHT;
2631}
cec736d2 2632
de190aef
LP
2633int journal_file_move_to_entry_by_seqnum(
2634 JournalFile *f,
2635 uint64_t seqnum,
2636 direction_t direction,
2637 Object **ret,
f4474e00 2638 uint64_t *ret_offset) {
c88cc6af
VC
2639 assert(f);
2640 assert(f->header);
de190aef 2641
f4474e00
LP
2642 return generic_array_bisect(
2643 f,
2644 le64toh(f->header->entry_array_offset),
2645 le64toh(f->header->n_entries),
2646 seqnum,
2647 test_object_seqnum,
2648 direction,
2649 ret, ret_offset, NULL);
de190aef 2650}
cec736d2 2651
de190aef
LP
2652static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2653 Object *o;
893e0f8f 2654 uint64_t rt;
de190aef
LP
2655 int r;
2656
2657 assert(f);
2658 assert(p > 0);
2659
2660 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2661 if (r < 0)
2662 return r;
2663
893e0f8f
LP
2664 rt = le64toh(READ_NOW(o->entry.realtime));
2665 if (rt == needle)
de190aef 2666 return TEST_FOUND;
893e0f8f 2667 else if (rt < needle)
de190aef
LP
2668 return TEST_LEFT;
2669 else
2670 return TEST_RIGHT;
cec736d2
LP
2671}
2672
de190aef
LP
2673int journal_file_move_to_entry_by_realtime(
2674 JournalFile *f,
2675 uint64_t realtime,
2676 direction_t direction,
2677 Object **ret,
f4474e00 2678 uint64_t *ret_offset) {
c88cc6af
VC
2679 assert(f);
2680 assert(f->header);
de190aef 2681
f4474e00
LP
2682 return generic_array_bisect(
2683 f,
2684 le64toh(f->header->entry_array_offset),
2685 le64toh(f->header->n_entries),
2686 realtime,
2687 test_object_realtime,
2688 direction,
2689 ret, ret_offset, NULL);
de190aef
LP
2690}
2691
2692static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2693 Object *o;
893e0f8f 2694 uint64_t m;
de190aef
LP
2695 int r;
2696
2697 assert(f);
2698 assert(p > 0);
2699
2700 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2701 if (r < 0)
2702 return r;
2703
893e0f8f
LP
2704 m = le64toh(READ_NOW(o->entry.monotonic));
2705 if (m == needle)
de190aef 2706 return TEST_FOUND;
893e0f8f 2707 else if (m < needle)
de190aef
LP
2708 return TEST_LEFT;
2709 else
2710 return TEST_RIGHT;
2711}
2712
2a560338 2713static int find_data_object_by_boot_id(
47838ab3
ZJS
2714 JournalFile *f,
2715 sd_id128_t boot_id,
2716 Object **o,
2717 uint64_t *b) {
2a560338 2718
fbd0b64f 2719 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
47838ab3
ZJS
2720
2721 sd_id128_to_string(boot_id, t + 9);
2722 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2723}
2724
de190aef
LP
2725int journal_file_move_to_entry_by_monotonic(
2726 JournalFile *f,
2727 sd_id128_t boot_id,
2728 uint64_t monotonic,
2729 direction_t direction,
2730 Object **ret,
f4474e00 2731 uint64_t *ret_offset) {
de190aef 2732
de190aef
LP
2733 Object *o;
2734 int r;
2735
cbdca852 2736 assert(f);
de190aef 2737
47838ab3 2738 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2739 if (r < 0)
2740 return r;
cbdca852 2741 if (r == 0)
de190aef
LP
2742 return -ENOENT;
2743
f4474e00
LP
2744 return generic_array_bisect_plus_one(
2745 f,
2746 le64toh(o->data.entry_offset),
2747 le64toh(o->data.entry_array_offset),
2748 le64toh(o->data.n_entries),
2749 monotonic,
2750 test_object_monotonic,
2751 direction,
2752 ret, ret_offset, NULL);
de190aef
LP
2753}
2754
1fc605b0 2755void journal_file_reset_location(JournalFile *f) {
6573ef05 2756 f->location_type = LOCATION_HEAD;
1fc605b0 2757 f->current_offset = 0;
6573ef05
MS
2758 f->current_seqnum = 0;
2759 f->current_realtime = 0;
2760 f->current_monotonic = 0;
2761 zero(f->current_boot_id);
2762 f->current_xor_hash = 0;
2763}
2764
950c07d4 2765void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2766 f->location_type = LOCATION_SEEK;
2767 f->current_offset = offset;
2768 f->current_seqnum = le64toh(o->entry.seqnum);
2769 f->current_realtime = le64toh(o->entry.realtime);
2770 f->current_monotonic = le64toh(o->entry.monotonic);
2771 f->current_boot_id = o->entry.boot_id;
2772 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2773}
2774
d8ae66d7 2775int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
90c88092
YW
2776 int r;
2777
d8ae66d7 2778 assert(af);
c88cc6af 2779 assert(af->header);
d8ae66d7 2780 assert(bf);
c88cc6af 2781 assert(bf->header);
d8ae66d7
MS
2782 assert(af->location_type == LOCATION_SEEK);
2783 assert(bf->location_type == LOCATION_SEEK);
2784
b17f651a 2785 /* If contents, timestamps and seqnum match, these entries are
2786 * identical*/
d8ae66d7
MS
2787 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2788 af->current_monotonic == bf->current_monotonic &&
2789 af->current_realtime == bf->current_realtime &&
b17f651a 2790 af->current_xor_hash == bf->current_xor_hash &&
2791 sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id) &&
2792 af->current_seqnum == bf->current_seqnum)
d8ae66d7
MS
2793 return 0;
2794
2795 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2796
2797 /* If this is from the same seqnum source, compare
2798 * seqnums */
90c88092
YW
2799 r = CMP(af->current_seqnum, bf->current_seqnum);
2800 if (r != 0)
2801 return r;
d8ae66d7
MS
2802
2803 /* Wow! This is weird, different data but the same
2804 * seqnums? Something is borked, but let's make the
2805 * best of it and compare by time. */
2806 }
2807
2808 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2809
2810 /* If the boot id matches, compare monotonic time */
90c88092
YW
2811 r = CMP(af->current_monotonic, bf->current_monotonic);
2812 if (r != 0)
2813 return r;
d8ae66d7
MS
2814 }
2815
2816 /* Otherwise, compare UTC time */
90c88092
YW
2817 r = CMP(af->current_realtime, bf->current_realtime);
2818 if (r != 0)
2819 return r;
d8ae66d7
MS
2820
2821 /* Finally, compare by contents */
6dd91b36 2822 return CMP(af->current_xor_hash, bf->current_xor_hash);
d8ae66d7
MS
2823}
2824
aa598ba5
LP
2825static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2826
2827 /* Increase or decrease the specified index, in the right direction. */
2828
2829 if (direction == DIRECTION_DOWN) {
2830 if (*i >= n - 1)
2831 return 0;
2832
2833 (*i) ++;
2834 } else {
2835 if (*i <= 0)
2836 return 0;
2837
2838 (*i) --;
2839 }
2840
2841 return 1;
2842}
2843
b6da4ed0
LP
2844static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2845
2846 /* Consider it an error if any of the two offsets is uninitialized */
2847 if (old_offset == 0 || new_offset == 0)
2848 return false;
2849
2850 /* If we go down, the new offset must be larger than the old one. */
2851 return direction == DIRECTION_DOWN ?
2852 new_offset > old_offset :
2853 new_offset < old_offset;
2854}
2855
de190aef
LP
2856int journal_file_next_entry(
2857 JournalFile *f,
f534928a 2858 uint64_t p,
de190aef 2859 direction_t direction,
f4474e00 2860 Object **ret, uint64_t *ret_offset) {
de190aef 2861
fb099c8d 2862 uint64_t i, n, ofs;
cec736d2
LP
2863 int r;
2864
2865 assert(f);
c88cc6af 2866 assert(f->header);
de190aef 2867
893e0f8f 2868 n = le64toh(READ_NOW(f->header->n_entries));
de190aef
LP
2869 if (n <= 0)
2870 return 0;
cec736d2 2871
f534928a 2872 if (p == 0)
de190aef 2873 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2874 else {
de190aef
LP
2875 r = generic_array_bisect(f,
2876 le64toh(f->header->entry_array_offset),
2877 le64toh(f->header->n_entries),
2878 p,
2879 test_object_offset,
2880 DIRECTION_DOWN,
2881 NULL, NULL,
2882 &i);
2883 if (r <= 0)
2884 return r;
2885
aa598ba5
LP
2886 r = bump_array_index(&i, direction, n);
2887 if (r <= 0)
2888 return r;
cec736d2
LP
2889 }
2890
de190aef 2891 /* And jump to it */
989793d3
LP
2892 for (;;) {
2893 r = generic_array_get(f,
2894 le64toh(f->header->entry_array_offset),
2895 i,
2896 ret, &ofs);
2897 if (r > 0)
2898 break;
2899 if (r != -EBADMSG)
2900 return r;
2901
2902 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2903 * the next one might work for us instead. */
2904 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2905
2906 r = bump_array_index(&i, direction, n);
2907 if (r <= 0)
2908 return r;
caeab8f6 2909 }
fb099c8d 2910
b6da4ed0 2911 /* Ensure our array is properly ordered. */
baaa35ad
ZJS
2912 if (p > 0 && !check_properly_ordered(ofs, p, direction))
2913 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2914 "%s: entry array not properly ordered at entry %" PRIu64,
2915 f->path, i);
fb099c8d 2916
f4474e00
LP
2917 if (ret_offset)
2918 *ret_offset = ofs;
fb099c8d
ZJS
2919
2920 return 1;
de190aef 2921}
cec736d2 2922
de190aef
LP
2923int journal_file_next_entry_for_data(
2924 JournalFile *f,
2925 Object *o, uint64_t p,
2926 uint64_t data_offset,
2927 direction_t direction,
f4474e00 2928 Object **ret, uint64_t *ret_offset) {
de190aef 2929
ded5034e 2930 uint64_t i, n, ofs;
de190aef 2931 Object *d;
989793d3 2932 int r;
cec736d2
LP
2933
2934 assert(f);
de190aef 2935 assert(p > 0 || !o);
cec736d2 2936
de190aef 2937 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2938 if (r < 0)
de190aef 2939 return r;
cec736d2 2940
893e0f8f 2941 n = le64toh(READ_NOW(d->data.n_entries));
de190aef
LP
2942 if (n <= 0)
2943 return n;
cec736d2 2944
de190aef
LP
2945 if (!o)
2946 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2947 else {
2948 if (o->object.type != OBJECT_ENTRY)
2949 return -EINVAL;
cec736d2 2950
de190aef
LP
2951 r = generic_array_bisect_plus_one(f,
2952 le64toh(d->data.entry_offset),
2953 le64toh(d->data.entry_array_offset),
2954 le64toh(d->data.n_entries),
2955 p,
2956 test_object_offset,
2957 DIRECTION_DOWN,
2958 NULL, NULL,
2959 &i);
2960
2961 if (r <= 0)
cec736d2
LP
2962 return r;
2963
aa598ba5
LP
2964 r = bump_array_index(&i, direction, n);
2965 if (r <= 0)
2966 return r;
de190aef 2967 }
cec736d2 2968
989793d3
LP
2969 for (;;) {
2970 r = generic_array_get_plus_one(f,
2971 le64toh(d->data.entry_offset),
2972 le64toh(d->data.entry_array_offset),
2973 i,
2974 ret, &ofs);
2975 if (r > 0)
2976 break;
2977 if (r != -EBADMSG)
2978 return r;
2979
2980 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2981
2982 r = bump_array_index(&i, direction, n);
2983 if (r <= 0)
2984 return r;
2985 }
ded5034e
LP
2986
2987 /* Ensure our array is properly ordered. */
baaa35ad
ZJS
2988 if (p > 0 && check_properly_ordered(ofs, p, direction))
2989 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2990 "%s data entry array not properly ordered at entry %" PRIu64,
2991 f->path, i);
ded5034e 2992
f4474e00
LP
2993 if (ret_offset)
2994 *ret_offset = ofs;
ded5034e
LP
2995
2996 return 1;
de190aef 2997}
cec736d2 2998
cbdca852
LP
2999int journal_file_move_to_entry_by_offset_for_data(
3000 JournalFile *f,
3001 uint64_t data_offset,
3002 uint64_t p,
3003 direction_t direction,
f4474e00 3004 Object **ret, uint64_t *ret_offset) {
cbdca852
LP
3005
3006 int r;
3007 Object *d;
3008
3009 assert(f);
3010
3011 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3012 if (r < 0)
3013 return r;
3014
f4474e00
LP
3015 return generic_array_bisect_plus_one(
3016 f,
3017 le64toh(d->data.entry_offset),
3018 le64toh(d->data.entry_array_offset),
3019 le64toh(d->data.n_entries),
3020 p,
3021 test_object_offset,
3022 direction,
3023 ret, ret_offset, NULL);
cbdca852
LP
3024}
3025
3026int journal_file_move_to_entry_by_monotonic_for_data(
3027 JournalFile *f,
3028 uint64_t data_offset,
3029 sd_id128_t boot_id,
3030 uint64_t monotonic,
3031 direction_t direction,
f4474e00 3032 Object **ret, uint64_t *ret_offset) {
cbdca852 3033
cbdca852
LP
3034 Object *o, *d;
3035 int r;
3036 uint64_t b, z;
3037
3038 assert(f);
3039
3040 /* First, seek by time */
47838ab3 3041 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
3042 if (r < 0)
3043 return r;
3044 if (r == 0)
3045 return -ENOENT;
3046
3047 r = generic_array_bisect_plus_one(f,
3048 le64toh(o->data.entry_offset),
3049 le64toh(o->data.entry_array_offset),
3050 le64toh(o->data.n_entries),
3051 monotonic,
3052 test_object_monotonic,
3053 direction,
3054 NULL, &z, NULL);
3055 if (r <= 0)
3056 return r;
3057
3058 /* And now, continue seeking until we find an entry that
3059 * exists in both bisection arrays */
3060
3061 for (;;) {
3062 Object *qo;
3063 uint64_t p, q;
3064
3065 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
3066 if (r < 0)
3067 return r;
3068
3069 r = generic_array_bisect_plus_one(f,
3070 le64toh(d->data.entry_offset),
3071 le64toh(d->data.entry_array_offset),
3072 le64toh(d->data.n_entries),
3073 z,
3074 test_object_offset,
3075 direction,
3076 NULL, &p, NULL);
3077 if (r <= 0)
3078 return r;
3079
3080 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
3081 if (r < 0)
3082 return r;
3083
3084 r = generic_array_bisect_plus_one(f,
3085 le64toh(o->data.entry_offset),
3086 le64toh(o->data.entry_array_offset),
3087 le64toh(o->data.n_entries),
3088 p,
3089 test_object_offset,
3090 direction,
3091 &qo, &q, NULL);
3092
3093 if (r <= 0)
3094 return r;
3095
3096 if (p == q) {
3097 if (ret)
3098 *ret = qo;
f4474e00
LP
3099 if (ret_offset)
3100 *ret_offset = q;
cbdca852
LP
3101
3102 return 1;
3103 }
3104
3105 z = q;
3106 }
cbdca852
LP
3107}
3108
de190aef
LP
3109int journal_file_move_to_entry_by_seqnum_for_data(
3110 JournalFile *f,
3111 uint64_t data_offset,
3112 uint64_t seqnum,
3113 direction_t direction,
f4474e00 3114 Object **ret, uint64_t *ret_offset) {
cec736d2 3115
de190aef
LP
3116 Object *d;
3117 int r;
cec736d2 3118
91a31dde
LP
3119 assert(f);
3120
de190aef 3121 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 3122 if (r < 0)
de190aef 3123 return r;
cec736d2 3124
f4474e00
LP
3125 return generic_array_bisect_plus_one(
3126 f,
3127 le64toh(d->data.entry_offset),
3128 le64toh(d->data.entry_array_offset),
3129 le64toh(d->data.n_entries),
3130 seqnum,
3131 test_object_seqnum,
3132 direction,
3133 ret, ret_offset, NULL);
de190aef 3134}
cec736d2 3135
de190aef
LP
3136int journal_file_move_to_entry_by_realtime_for_data(
3137 JournalFile *f,
3138 uint64_t data_offset,
3139 uint64_t realtime,
3140 direction_t direction,
f4474e00 3141 Object **ret, uint64_t *ret_offset) {
de190aef
LP
3142
3143 Object *d;
3144 int r;
3145
91a31dde
LP
3146 assert(f);
3147
de190aef 3148 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 3149 if (r < 0)
de190aef
LP
3150 return r;
3151
f4474e00
LP
3152 return generic_array_bisect_plus_one(
3153 f,
3154 le64toh(d->data.entry_offset),
3155 le64toh(d->data.entry_array_offset),
3156 le64toh(d->data.n_entries),
3157 realtime,
3158 test_object_realtime,
3159 direction,
3160 ret, ret_offset, NULL);
cec736d2
LP
3161}
3162
0284adc6 3163void journal_file_dump(JournalFile *f) {
7560fffc 3164 Object *o;
7560fffc 3165 int r;
0284adc6 3166 uint64_t p;
7560fffc
LP
3167
3168 assert(f);
c88cc6af 3169 assert(f->header);
7560fffc 3170
0284adc6 3171 journal_file_print_header(f);
7560fffc 3172
893e0f8f 3173 p = le64toh(READ_NOW(f->header->header_size));
0284adc6 3174 while (p != 0) {
d05089d8 3175 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
3176 if (r < 0)
3177 goto fail;
7560fffc 3178
0284adc6 3179 switch (o->object.type) {
d98cc1f2 3180
0284adc6
LP
3181 case OBJECT_UNUSED:
3182 printf("Type: OBJECT_UNUSED\n");
3183 break;
d98cc1f2 3184
0284adc6
LP
3185 case OBJECT_DATA:
3186 printf("Type: OBJECT_DATA\n");
3187 break;
7560fffc 3188
3c1668da
LP
3189 case OBJECT_FIELD:
3190 printf("Type: OBJECT_FIELD\n");
3191 break;
3192
0284adc6 3193 case OBJECT_ENTRY:
507f22bd
ZJS
3194 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3195 le64toh(o->entry.seqnum),
3196 le64toh(o->entry.monotonic),
3197 le64toh(o->entry.realtime));
0284adc6 3198 break;
7560fffc 3199
0284adc6
LP
3200 case OBJECT_FIELD_HASH_TABLE:
3201 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3202 break;
7560fffc 3203
0284adc6
LP
3204 case OBJECT_DATA_HASH_TABLE:
3205 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3206 break;
7560fffc 3207
0284adc6
LP
3208 case OBJECT_ENTRY_ARRAY:
3209 printf("Type: OBJECT_ENTRY_ARRAY\n");
3210 break;
7560fffc 3211
0284adc6 3212 case OBJECT_TAG:
507f22bd
ZJS
3213 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3214 le64toh(o->tag.seqnum),
3215 le64toh(o->tag.epoch));
0284adc6 3216 break;
3c1668da
LP
3217
3218 default:
8facc349 3219 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 3220 break;
0284adc6 3221 }
7560fffc 3222
d89c8fdf
ZJS
3223 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3224 printf("Flags: %s\n",
3225 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3226
0284adc6
LP
3227 if (p == le64toh(f->header->tail_object_offset))
3228 p = 0;
3229 else
71139898 3230 p += ALIGN64(le64toh(o->object.size));
0284adc6 3231 }
7560fffc 3232
0284adc6
LP
3233 return;
3234fail:
3235 log_error("File corrupt");
7560fffc
LP
3236}
3237
718fe4b1
ZJS
3238static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3239 const char *x;
3240
3241 x = format_timestamp(buf, l, t);
3242 if (x)
3243 return x;
3244 return " --- ";
3245}
3246
0284adc6 3247void journal_file_print_header(JournalFile *f) {
5905d7cf 3248 char a[SD_ID128_STRING_MAX], b[SD_ID128_STRING_MAX], c[SD_ID128_STRING_MAX], d[SD_ID128_STRING_MAX];
ed375beb 3249 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
3250 struct stat st;
3251 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
3252
3253 assert(f);
c88cc6af 3254 assert(f->header);
7560fffc 3255
2c54acb1 3256 printf("File path: %s\n"
0284adc6
LP
3257 "File ID: %s\n"
3258 "Machine ID: %s\n"
3259 "Boot ID: %s\n"
2c54acb1 3260 "Sequential number ID: %s\n"
0284adc6 3261 "State: %s\n"
2c54acb1 3262 "Compatible flags:%s%s\n"
8653185a 3263 "Incompatible flags:%s%s%s%s%s\n"
507f22bd
ZJS
3264 "Header size: %"PRIu64"\n"
3265 "Arena size: %"PRIu64"\n"
2c54acb1
TN
3266 "Data hash table size: %"PRIu64"\n"
3267 "Field hash table size: %"PRIu64"\n"
3268 "Rotate suggested: %s\n"
3269 "Head sequential number: %"PRIu64" (%"PRIx64")\n"
3270 "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
3271 "Head realtime timestamp: %s (%"PRIx64")\n"
3272 "Tail realtime timestamp: %s (%"PRIx64")\n"
3273 "Tail monotonic timestamp: %s (%"PRIx64")\n"
507f22bd 3274 "Objects: %"PRIu64"\n"
2c54acb1 3275 "Entry objects: %"PRIu64"\n",
0284adc6
LP
3276 f->path,
3277 sd_id128_to_string(f->header->file_id, a),
3278 sd_id128_to_string(f->header->machine_id, b),
3279 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 3280 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
3281 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3282 f->header->state == STATE_ONLINE ? "ONLINE" :
3283 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 3284 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
3285 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3286 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3287 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
8653185a 3288 JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "",
4ce534f4 3289 JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "",
d89c8fdf 3290 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
3291 le64toh(f->header->header_size),
3292 le64toh(f->header->arena_size),
3293 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3294 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 3295 yes_no(journal_file_rotate_suggested(f, 0)),
0808b92f
LP
3296 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3297 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3298 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3299 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3300 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
507f22bd
ZJS
3301 le64toh(f->header->n_objects),
3302 le64toh(f->header->n_entries));
7560fffc 3303
0284adc6 3304 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2c54acb1
TN
3305 printf("Data objects: %"PRIu64"\n"
3306 "Data hash table fill: %.1f%%\n",
507f22bd 3307 le64toh(f->header->n_data),
0284adc6 3308 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 3309
0284adc6 3310 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2c54acb1
TN
3311 printf("Field objects: %"PRIu64"\n"
3312 "Field hash table fill: %.1f%%\n",
507f22bd 3313 le64toh(f->header->n_fields),
0284adc6 3314 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
3315
3316 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2c54acb1 3317 printf("Tag objects: %"PRIu64"\n",
507f22bd 3318 le64toh(f->header->n_tags));
3223f44f 3319 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2c54acb1 3320 printf("Entry array objects: %"PRIu64"\n",
507f22bd 3321 le64toh(f->header->n_entry_arrays));
a1a03e30 3322
0dbe57ee
LP
3323 if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth))
3324 printf("Deepest field hash chain: %" PRIu64"\n",
3325 f->header->field_hash_chain_depth);
3326
3327 if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth))
3328 printf("Deepest data hash chain: %" PRIu64"\n",
3329 f->header->data_hash_chain_depth);
3330
a1a03e30 3331 if (fstat(f->fd, &st) >= 0)
59f448cf 3332 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
3333}
3334
fc68c929
LP
3335static int journal_file_warn_btrfs(JournalFile *f) {
3336 unsigned attrs;
3337 int r;
3338
3339 assert(f);
3340
3341 /* Before we write anything, check if the COW logic is turned
3342 * off on btrfs. Given our write pattern that is quite
3343 * unfriendly to COW file systems this should greatly improve
3344 * performance on COW file systems, such as btrfs, at the
3345 * expense of data integrity features (which shouldn't be too
3346 * bad, given that we do our own checksumming). */
3347
3348 r = btrfs_is_filesystem(f->fd);
3349 if (r < 0)
3350 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3351 if (!r)
3352 return 0;
3353
3354 r = read_attr_fd(f->fd, &attrs);
3355 if (r < 0)
3356 return log_warning_errno(r, "Failed to read file attributes: %m");
3357
3358 if (attrs & FS_NOCOW_FL) {
3359 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3360 return 0;
3361 }
3362
3363 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3364 "This is likely to slow down journal access substantially, please consider turning "
3365 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3366
3367 return 1;
3368}
3369
0284adc6 3370int journal_file_open(
5d1ce257 3371 int fd,
0284adc6
LP
3372 const char *fname,
3373 int flags,
3374 mode_t mode,
3375 bool compress,
57850536 3376 uint64_t compress_threshold_bytes,
baed47c3 3377 bool seal,
0284adc6
LP
3378 JournalMetrics *metrics,
3379 MMapCache *mmap_cache,
b58c888f 3380 Set *deferred_closes,
0284adc6
LP
3381 JournalFile *template,
3382 JournalFile **ret) {
7560fffc 3383
fa6ac760 3384 bool newly_created = false;
0284adc6 3385 JournalFile *f;
fa6ac760 3386 void *h;
0284adc6 3387 int r;
7560fffc 3388
0559d3a5 3389 assert(ret);
5d1ce257 3390 assert(fd >= 0 || fname);
7560fffc 3391
ec2ce0c5 3392 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3393 return -EINVAL;
7560fffc 3394
6eda13d3
LP
3395 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3396 return -EINVAL;
7560fffc 3397
971b52c4 3398 f = new(JournalFile, 1);
0284adc6
LP
3399 if (!f)
3400 return -ENOMEM;
7560fffc 3401
971b52c4
LP
3402 *f = (JournalFile) {
3403 .fd = fd,
3404 .mode = mode,
3405
3406 .flags = flags,
971b52c4 3407 .writable = (flags & O_ACCMODE) != O_RDONLY,
7560fffc 3408
8653185a
LP
3409#if HAVE_ZSTD
3410 .compress_zstd = compress,
3411#elif HAVE_LZ4
971b52c4 3412 .compress_lz4 = compress,
349cc4a5 3413#elif HAVE_XZ
971b52c4 3414 .compress_xz = compress,
48b61739 3415#endif
971b52c4
LP
3416 .compress_threshold_bytes = compress_threshold_bytes == (uint64_t) -1 ?
3417 DEFAULT_COMPRESS_THRESHOLD :
3418 MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
349cc4a5 3419#if HAVE_GCRYPT
971b52c4 3420 .seal = seal,
49a32d43 3421#endif
971b52c4 3422 };
7560fffc 3423
4ce534f4
LP
3424 /* We turn on keyed hashes by default, but provide an environment variable to turn them off, if
3425 * people really want that */
3426 r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH");
3427 if (r < 0) {
3428 if (r != -ENXIO)
3429 log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring.");
3430 f->keyed_hash = true;
3431 } else
3432 f->keyed_hash = r;
3433
170a434c 3434 if (DEBUG_LOGGING) {
4ce534f4 3435 static int last_seal = -1, last_compress = -1, last_keyed_hash = -1;
170a434c
ZJS
3436 static uint64_t last_bytes = UINT64_MAX;
3437 char bytes[FORMAT_BYTES_MAX];
3438
3439 if (last_seal != f->seal ||
4ce534f4 3440 last_keyed_hash != f->keyed_hash ||
170a434c
ZJS
3441 last_compress != JOURNAL_FILE_COMPRESS(f) ||
3442 last_bytes != f->compress_threshold_bytes) {
3443
4ce534f4
LP
3444 log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s",
3445 yes_no(f->seal), yes_no(f->keyed_hash), yes_no(JOURNAL_FILE_COMPRESS(f)),
170a434c
ZJS
3446 format_bytes(bytes, sizeof bytes, f->compress_threshold_bytes));
3447 last_seal = f->seal;
4ce534f4 3448 last_keyed_hash = f->keyed_hash;
170a434c
ZJS
3449 last_compress = JOURNAL_FILE_COMPRESS(f);
3450 last_bytes = f->compress_threshold_bytes;
3451 }
3452 }
57850536 3453
0284adc6
LP
3454 if (mmap_cache)
3455 f->mmap = mmap_cache_ref(mmap_cache);
3456 else {
84168d80 3457 f->mmap = mmap_cache_new();
0284adc6
LP
3458 if (!f->mmap) {
3459 r = -ENOMEM;
3460 goto fail;
3461 }
3462 }
7560fffc 3463
7645c77b 3464 if (fname) {
5d1ce257 3465 f->path = strdup(fname);
7645c77b
ZJS
3466 if (!f->path) {
3467 r = -ENOMEM;
3468 goto fail;
3469 }
3470 } else {
817b1c5b
LP
3471 assert(fd >= 0);
3472
7645c77b
ZJS
3473 /* If we don't know the path, fill in something explanatory and vaguely useful */
3474 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3475 r = -ENOMEM;
3476 goto fail;
3477 }
0284adc6 3478 }
7560fffc 3479
4743015d 3480 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3481 if (!f->chain_cache) {
3482 r = -ENOMEM;
3483 goto fail;
3484 }
3485
0284adc6 3486 if (f->fd < 0) {
817b1c5b
LP
3487 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3488 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3489 * it doesn't hurt in that case. */
3490
3491 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
5d1ce257
LP
3492 if (f->fd < 0) {
3493 r = -errno;
3494 goto fail;
3495 }
3496
3497 /* fds we opened here by us should also be closed by us. */
3498 f->close_fd = true;
817b1c5b
LP
3499
3500 r = fd_nonblock(f->fd, false);
3501 if (r < 0)
3502 goto fail;
7560fffc 3503 }
7560fffc 3504
104fc4be 3505 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd, prot_from_flags(flags));
be7cdd8e
VC
3506 if (!f->cache_fd) {
3507 r = -ENOMEM;
3508 goto fail;
3509 }
3510
2678031a
LP
3511 r = journal_file_fstat(f);
3512 if (r < 0)
0284adc6 3513 goto fail;
7560fffc 3514
0284adc6 3515 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3516
fc68c929 3517 (void) journal_file_warn_btrfs(f);
11689d2a 3518
4c2e1b39
LP
3519 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3520 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3521 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3522 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3523 * solely on mtime/atime/ctime of the file. */
3524 (void) fd_setcrtime(f->fd, 0);
7560fffc 3525
349cc4a5 3526#if HAVE_GCRYPT
0284adc6 3527 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3528 * just don't do sealing */
49a32d43
LP
3529 if (f->seal) {
3530 r = journal_file_fss_load(f);
3531 if (r < 0)
3532 f->seal = false;
3533 }
feb12d3e 3534#endif
7560fffc 3535
0284adc6
LP
3536 r = journal_file_init_header(f, template);
3537 if (r < 0)
3538 goto fail;
7560fffc 3539
2678031a
LP
3540 r = journal_file_fstat(f);
3541 if (r < 0)
0284adc6 3542 goto fail;
fb0951b0
LP
3543
3544 newly_created = true;
0284adc6 3545 }
7560fffc 3546
0284adc6 3547 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3548 r = -ENODATA;
0284adc6
LP
3549 goto fail;
3550 }
7560fffc 3551
258190a0 3552 r = mmap_cache_get(f->mmap, f->cache_fd, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
5087825e
LP
3553 if (r == -EINVAL) {
3554 /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
3555 * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
3556 * code. */
3557 r = -EAFNOSUPPORT;
3558 goto fail;
3559 }
977eaa1e 3560 if (r < 0)
0284adc6 3561 goto fail;
7560fffc 3562
fa6ac760
LP
3563 f->header = h;
3564
0284adc6 3565 if (!newly_created) {
f9168190 3566 set_clear_with_destructor(deferred_closes, journal_file_close);
b58c888f 3567
0284adc6
LP
3568 r = journal_file_verify_header(f);
3569 if (r < 0)
3570 goto fail;
3571 }
7560fffc 3572
349cc4a5 3573#if HAVE_GCRYPT
0284adc6 3574 if (!newly_created && f->writable) {
baed47c3 3575 r = journal_file_fss_load(f);
0284adc6
LP
3576 if (r < 0)
3577 goto fail;
3578 }
feb12d3e 3579#endif
cec736d2
LP
3580
3581 if (f->writable) {
4a92baf3
LP
3582 if (metrics) {
3583 journal_default_metrics(metrics, f->fd);
3584 f->metrics = *metrics;
3585 } else if (template)
3586 f->metrics = template->metrics;
3587
cec736d2
LP
3588 r = journal_file_refresh_header(f);
3589 if (r < 0)
3590 goto fail;
3591 }
3592
349cc4a5 3593#if HAVE_GCRYPT
baed47c3 3594 r = journal_file_hmac_setup(f);
14d10188
LP
3595 if (r < 0)
3596 goto fail;
feb12d3e 3597#endif
14d10188 3598
cec736d2 3599 if (newly_created) {
de190aef 3600 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3601 if (r < 0)
3602 goto fail;
3603
de190aef 3604 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3605 if (r < 0)
3606 goto fail;
7560fffc 3607
349cc4a5 3608#if HAVE_GCRYPT
7560fffc
LP
3609 r = journal_file_append_first_tag(f);
3610 if (r < 0)
3611 goto fail;
feb12d3e 3612#endif
cec736d2
LP
3613 }
3614
be7cdd8e 3615 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
fa6ac760
LP
3616 r = -EIO;
3617 goto fail;
3618 }
3619
7a24f3bf 3620 if (template && template->post_change_timer) {
e167d7fd
LP
3621 r = journal_file_enable_post_change_timer(
3622 f,
3623 sd_event_source_get_event(template->post_change_timer),
3624 template->post_change_timer_period);
7a24f3bf 3625
7a24f3bf
VC
3626 if (r < 0)
3627 goto fail;
3628 }
3629
f8e2f4d6 3630 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3631 f->close_fd = true;
3632
0559d3a5 3633 *ret = f;
cec736d2
LP
3634 return 0;
3635
3636fail:
be7cdd8e 3637 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
3638 r = -EIO;
3639
69a3a6fd 3640 (void) journal_file_close(f);
cec736d2
LP
3641
3642 return r;
3643}
0ac38b70 3644
7a4d21ad 3645int journal_file_archive(JournalFile *f) {
57535f47 3646 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3647
3648 assert(f);
0ac38b70 3649
7a4d21ad 3650 if (!f->writable)
0ac38b70
LP
3651 return -EINVAL;
3652
5d1ce257 3653 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3654 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
7a4d21ad 3655 if (path_startswith(f->path, "/proc/self/fd"))
5d1ce257
LP
3656 return -EINVAL;
3657
7a4d21ad 3658 if (!endswith(f->path, ".journal"))
0ac38b70
LP
3659 return -EINVAL;
3660
7a4d21ad
LP
3661 if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3662 (int) strlen(f->path) - 8, f->path,
3663 SD_ID128_FORMAT_VAL(f->header->seqnum_id),
3664 le64toh(f->header->head_entry_seqnum),
3665 le64toh(f->header->head_entry_realtime)) < 0)
0ac38b70
LP
3666 return -ENOMEM;
3667
7a4d21ad
LP
3668 /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
3669 * ignore that case. */
3670 if (rename(f->path, p) < 0 && errno != ENOENT)
0ac38b70
LP
3671 return -errno;
3672
1fcefd88 3673 /* Sync the rename to disk */
7a4d21ad
LP
3674 (void) fsync_directory_of_file(f->fd);
3675
3676 /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
3677 * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
3678 * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
3679 * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
3680 * occurs. */
3681 f->archive = true;
3682
3683 /* Currently, btrfs is not very good with out write patterns and fragments heavily. Let's defrag our journal
3684 * files when we archive them */
3685 f->defrag_on_close = true;
3686
3687 return 0;
3688}
3689
3690JournalFile* journal_initiate_close(
3691 JournalFile *f,
3692 Set *deferred_closes) {
3693
3694 int r;
3695
3696 assert(f);
3697
3698 if (deferred_closes) {
0ac38b70 3699
7a4d21ad
LP
3700 r = set_put(deferred_closes, f);
3701 if (r < 0)
3702 log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
3703 else {
3704 (void) journal_file_set_offline(f, false);
3705 return NULL;
3706 }
3707 }
3708
3709 return journal_file_close(f);
3710}
3711
3712int journal_file_rotate(
3713 JournalFile **f,
3714 bool compress,
3715 uint64_t compress_threshold_bytes,
3716 bool seal,
3717 Set *deferred_closes) {
3718
3719 JournalFile *new_file = NULL;
3720 int r;
3721
3722 assert(f);
3723 assert(*f);
3724
3725 r = journal_file_archive(*f);
3726 if (r < 0)
3727 return r;
3728
3729 r = journal_file_open(
3730 -1,
3731 (*f)->path,
3732 (*f)->flags,
3733 (*f)->mode,
3734 compress,
3735 compress_threshold_bytes,
3736 seal,
3737 NULL, /* metrics */
3738 (*f)->mmap,
3739 deferred_closes,
3740 *f, /* template */
3741 &new_file);
3742
3743 journal_initiate_close(*f, deferred_closes);
0ac38b70 3744 *f = new_file;
7a4d21ad 3745
0ac38b70
LP
3746 return r;
3747}
3748
68127658
LP
3749int journal_file_dispose(int dir_fd, const char *fname) {
3750 _cleanup_free_ char *p = NULL;
3751 _cleanup_close_ int fd = -1;
3752
3753 assert(fname);
3754
24ee0f9d 3755 /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note that
68127658
LP
3756 * this is done without looking into the file or changing any of its contents. The idea is that this is called
3757 * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
3758 * for writing anymore. */
3759
3760 if (!endswith(fname, ".journal"))
3761 return -EINVAL;
3762
3763 if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
3764 (int) strlen(fname) - 8, fname,
3765 now(CLOCK_REALTIME),
3766 random_u64()) < 0)
3767 return -ENOMEM;
3768
3769 if (renameat(dir_fd, fname, dir_fd, p) < 0)
3770 return -errno;
3771
3772 /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
3773 fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
3774 if (fd < 0)
3775 log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
3776 else {
3777 (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
3778 (void) btrfs_defrag_fd(fd);
3779 }
3780
3781 return 0;
3782}
3783
9447a7f1
LP
3784int journal_file_open_reliably(
3785 const char *fname,
3786 int flags,
3787 mode_t mode,
7560fffc 3788 bool compress,
57850536 3789 uint64_t compress_threshold_bytes,
baed47c3 3790 bool seal,
4a92baf3 3791 JournalMetrics *metrics,
27370278 3792 MMapCache *mmap_cache,
b58c888f 3793 Set *deferred_closes,
9447a7f1
LP
3794 JournalFile *template,
3795 JournalFile **ret) {
3796
68127658 3797 int r;
9447a7f1 3798
57850536
AG
3799 r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3800 deferred_closes, template, ret);
288359db 3801 if (!IN_SET(r,
b288cdeb
ZJS
3802 -EBADMSG, /* Corrupted */
3803 -ENODATA, /* Truncated */
3804 -EHOSTDOWN, /* Other machine */
3805 -EPROTONOSUPPORT, /* Incompatible feature */
3806 -EBUSY, /* Unclean shutdown */
3807 -ESHUTDOWN, /* Already archived */
288359db 3808 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3809 -EIDRM, /* File has been deleted */
3810 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3811 return r;
3812
3813 if ((flags & O_ACCMODE) == O_RDONLY)
3814 return r;
3815
3816 if (!(flags & O_CREAT))
3817 return r;
3818
7560fffc
LP
3819 if (!endswith(fname, ".journal"))
3820 return r;
3821
5c70eab4 3822 /* The file is corrupted. Rotate it away and try it again (but only once) */
65089b82 3823 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3824
68127658
LP
3825 r = journal_file_dispose(AT_FDCWD, fname);
3826 if (r < 0)
3827 return r;
3828
57850536
AG
3829 return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3830 deferred_closes, template, ret);
9447a7f1
LP
3831}
3832
5a271b08 3833int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
cf244689
LP
3834 uint64_t i, n;
3835 uint64_t q, xor_hash = 0;
3836 int r;
3837 EntryItem *items;
3838 dual_timestamp ts;
d180c349 3839 const sd_id128_t *boot_id;
cf244689
LP
3840
3841 assert(from);
3842 assert(to);
3843 assert(o);
3844 assert(p);
3845
3846 if (!to->writable)
3847 return -EPERM;
3848
3849 ts.monotonic = le64toh(o->entry.monotonic);
3850 ts.realtime = le64toh(o->entry.realtime);
d180c349 3851 boot_id = &o->entry.boot_id;
cf244689 3852
cf244689 3853 n = journal_file_entry_n_items(o);
4faa7004 3854 /* alloca() can't take 0, hence let's allocate at least one */
cf409d15 3855 items = newa(EntryItem, MAX(1u, n));
cf244689
LP
3856
3857 for (i = 0; i < n; i++) {
4fd052ae
FC
3858 uint64_t l, h;
3859 le64_t le_hash;
cf244689
LP
3860 size_t t;
3861 void *data;
3862 Object *u;
3863
3864 q = le64toh(o->entry.items[i].object_offset);
3865 le_hash = o->entry.items[i].hash;
3866
3867 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3868 if (r < 0)
3869 return r;
3870
3871 if (le_hash != o->data.hash)
3872 return -EBADMSG;
3873
893e0f8f
LP
3874 l = le64toh(READ_NOW(o->object.size));
3875 if (l < offsetof(Object, data.payload))
3876 return -EBADMSG;
3877
3878 l -= offsetof(Object, data.payload);
cf244689
LP
3879 t = (size_t) l;
3880
3881 /* We hit the limit on 32bit machines */
3882 if ((uint64_t) t != l)
3883 return -E2BIG;
3884
d89c8fdf 3885 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
d80b051c 3886#if HAVE_COMPRESSION
a7f7d1bd 3887 size_t rsize = 0;
cf244689 3888
d89c8fdf
ZJS
3889 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3890 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3891 if (r < 0)
3892 return r;
cf244689
LP
3893
3894 data = from->compress_buffer;
3895 l = rsize;
3b1a55e1
ZJS
3896#else
3897 return -EPROTONOSUPPORT;
3898#endif
cf244689
LP
3899 } else
3900 data = o->data.payload;
3901
3902 r = journal_file_append_data(to, data, l, &u, &h);
3903 if (r < 0)
3904 return r;
3905
4ce534f4
LP
3906 if (JOURNAL_HEADER_KEYED_HASH(to->header))
3907 xor_hash ^= jenkins_hash64(data, l);
3908 else
3909 xor_hash ^= le64toh(u->data.hash);
3910
cf244689
LP
3911 items[i].object_offset = htole64(h);
3912 items[i].hash = u->data.hash;
3913
3914 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3915 if (r < 0)
3916 return r;
3917 }
3918
d180c349
ZJS
3919 r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
3920 NULL, NULL, NULL);
fa6ac760 3921
be7cdd8e 3922 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
fa6ac760
LP
3923 return -EIO;
3924
3925 return r;
cf244689 3926}
babfc091 3927
8580d1f7
LP
3928void journal_reset_metrics(JournalMetrics *m) {
3929 assert(m);
3930
3931 /* Set everything to "pick automatic values". */
3932
3933 *m = (JournalMetrics) {
3934 .min_use = (uint64_t) -1,
3935 .max_use = (uint64_t) -1,
3936 .min_size = (uint64_t) -1,
3937 .max_size = (uint64_t) -1,
3938 .keep_free = (uint64_t) -1,
3939 .n_max_files = (uint64_t) -1,
3940 };
3941}
3942
babfc091 3943void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3944 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3945 struct statvfs ss;
6aae0b1a 3946 uint64_t fs_size = 0;
babfc091
LP
3947
3948 assert(m);
3949 assert(fd >= 0);
3950
3951 if (fstatvfs(fd, &ss) >= 0)
3952 fs_size = ss.f_frsize * ss.f_blocks;
6aae0b1a 3953 else
8fc58f1a 3954 log_debug_errno(errno, "Failed to determine disk size: %m");
babfc091
LP
3955
3956 if (m->max_use == (uint64_t) -1) {
3957
6aae0b1a
ZJS
3958 if (fs_size > 0)
3959 m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
3960 MAX_USE_LOWER, MAX_USE_UPPER);
3961 else
3962 m->max_use = MAX_USE_LOWER;
babfc091
LP
3963 } else {
3964 m->max_use = PAGE_ALIGN(m->max_use);
3965
8580d1f7 3966 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3967 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3968 }
3969
6aae0b1a
ZJS
3970 if (m->min_use == (uint64_t) -1) {
3971 if (fs_size > 0)
3972 m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
3973 MIN_USE_LOW, MIN_USE_HIGH);
3974 else
3975 m->min_use = MIN_USE_LOW;
3976 }
8580d1f7
LP
3977
3978 if (m->min_use > m->max_use)
3979 m->min_use = m->max_use;
3980
6aae0b1a
ZJS
3981 if (m->max_size == (uint64_t) -1)
3982 m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
3983 MAX_SIZE_UPPER);
3984 else
babfc091
LP
3985 m->max_size = PAGE_ALIGN(m->max_size);
3986
8580d1f7
LP
3987 if (m->max_size != 0) {
3988 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3989 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3990
8580d1f7
LP
3991 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3992 m->max_use = m->max_size*2;
3993 }
babfc091
LP
3994
3995 if (m->min_size == (uint64_t) -1)
3996 m->min_size = JOURNAL_FILE_SIZE_MIN;
6aae0b1a
ZJS
3997 else
3998 m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
3999 JOURNAL_FILE_SIZE_MIN,
4000 m->max_size ?: UINT64_MAX);
babfc091
LP
4001
4002 if (m->keep_free == (uint64_t) -1) {
6aae0b1a
ZJS
4003 if (fs_size > 0)
4004 m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */
4005 KEEP_FREE_UPPER);
4006 else
babfc091
LP
4007 m->keep_free = DEFAULT_KEEP_FREE;
4008 }
4009
8580d1f7
LP
4010 if (m->n_max_files == (uint64_t) -1)
4011 m->n_max_files = DEFAULT_N_MAX_FILES;
4012
4013 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
4014 format_bytes(a, sizeof(a), m->min_use),
4015 format_bytes(b, sizeof(b), m->max_use),
4016 format_bytes(c, sizeof(c), m->max_size),
4017 format_bytes(d, sizeof(d), m->min_size),
4018 format_bytes(e, sizeof(e), m->keep_free),
4019 m->n_max_files);
babfc091 4020}
08984293
LP
4021
4022int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 4023 assert(f);
c88cc6af 4024 assert(f->header);
08984293
LP
4025 assert(from || to);
4026
4027 if (from) {
162566a4
LP
4028 if (f->header->head_entry_realtime == 0)
4029 return -ENOENT;
08984293 4030
162566a4 4031 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
4032 }
4033
4034 if (to) {
162566a4
LP
4035 if (f->header->tail_entry_realtime == 0)
4036 return -ENOENT;
08984293 4037
162566a4 4038 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
4039 }
4040
4041 return 1;
4042}
4043
4044int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
4045 Object *o;
4046 uint64_t p;
4047 int r;
4048
4049 assert(f);
4050 assert(from || to);
4051
47838ab3 4052 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
4053 if (r <= 0)
4054 return r;
4055
4056 if (le64toh(o->data.n_entries) <= 0)
4057 return 0;
4058
4059 if (from) {
4060 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
4061 if (r < 0)
4062 return r;
4063
4064 *from = le64toh(o->entry.monotonic);
4065 }
4066
4067 if (to) {
4068 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
4069 if (r < 0)
4070 return r;
4071
4072 r = generic_array_get_plus_one(f,
4073 le64toh(o->data.entry_offset),
4074 le64toh(o->data.entry_array_offset),
4075 le64toh(o->data.n_entries)-1,
4076 &o, NULL);
4077 if (r <= 0)
4078 return r;
4079
4080 *to = le64toh(o->entry.monotonic);
4081 }
4082
4083 return 1;
4084}
dca6219e 4085
fb0951b0 4086bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 4087 assert(f);
c88cc6af 4088 assert(f->header);
dca6219e
LP
4089
4090 /* If we gained new header fields we gained new features,
4091 * hence suggest a rotation */
361f9cbc
LP
4092 if (le64toh(f->header->header_size) < sizeof(Header)) {
4093 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 4094 return true;
361f9cbc 4095 }
dca6219e 4096
0dbe57ee
LP
4097 /* Let's check if the hash tables grew over a certain fill level (75%, borrowing this value from
4098 * Java's hash table implementation), and if so suggest a rotation. To calculate the fill level we
4099 * need the n_data field, which only exists in newer versions. */
dca6219e
LP
4100
4101 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 4102 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 4103 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
4104 f->path,
4105 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
4106 le64toh(f->header->n_data),
4107 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
4108 (unsigned long long) f->last_stat.st_size,
4109 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 4110 return true;
361f9cbc 4111 }
dca6219e
LP
4112
4113 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 4114 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 4115 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
4116 f->path,
4117 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
4118 le64toh(f->header->n_fields),
4119 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 4120 return true;
361f9cbc 4121 }
dca6219e 4122
0dbe57ee
LP
4123 /* If there are too many hash collisions somebody is most likely playing games with us. Hence, if our
4124 * longest chain is longer than some threshold, let's suggest rotation. */
4125 if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) &&
4126 le64toh(f->header->data_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
4127 log_debug("Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.",
4128 f->path, le64toh(f->header->data_hash_chain_depth));
4129 return true;
4130 }
4131
4132 if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) &&
4133 le64toh(f->header->field_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) {
4134 log_debug("Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.",
4135 f->path, le64toh(f->header->field_hash_chain_depth));
4136 return true;
4137 }
4138
0598fd4a
LP
4139 /* Are the data objects properly indexed by field objects? */
4140 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
4141 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
4142 le64toh(f->header->n_data) > 0 &&
4143 le64toh(f->header->n_fields) == 0)
4144 return true;
4145
fb0951b0
LP
4146 if (max_file_usec > 0) {
4147 usec_t t, h;
4148
4149 h = le64toh(f->header->head_entry_realtime);
4150 t = now(CLOCK_REALTIME);
4151
4152 if (h > 0 && t > h + max_file_usec)
4153 return true;
4154 }
4155
dca6219e
LP
4156 return false;
4157}