]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
Use const char* for timestamp strings which we don't plan to modify
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
cec736d2
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
cec736d2
LP
6***/
7
cec736d2 8#include <errno.h>
cec736d2 9#include <fcntl.h>
11689d2a 10#include <linux/fs.h>
ac2e41f5 11#include <pthread.h>
07630cea
LP
12#include <stddef.h>
13#include <sys/mman.h>
14#include <sys/statvfs.h>
15#include <sys/uio.h>
16#include <unistd.h>
fb0951b0 17
b5efdb8a 18#include "alloc-util.h"
f27a3864 19#include "btrfs-util.h"
c8b3094d 20#include "chattr-util.h"
07630cea 21#include "compress.h"
3ffd4af2 22#include "fd-util.h"
11b29a96 23#include "fs-util.h"
0284adc6 24#include "journal-authenticate.h"
cec736d2
LP
25#include "journal-def.h"
26#include "journal-file.h"
27#include "lookup3.h"
6bedfcbb 28#include "parse-util.h"
5d1ce257 29#include "path-util.h"
3df3e884 30#include "random-util.h"
7a24f3bf 31#include "sd-event.h"
b58c888f 32#include "set.h"
3cc44114 33#include "stat-util.h"
07630cea 34#include "string-util.h"
4761fd0f 35#include "strv.h"
89a5a90c 36#include "xattr-util.h"
cec736d2 37
4a92baf3
LP
38#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 40
57850536
AG
41#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
42#define MIN_COMPRESS_THRESHOLD (8ULL)
807e17f0 43
babfc091 44/* This is the minimum journal file size */
16098e93 45#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
babfc091
LP
46
47/* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51
8580d1f7
LP
52/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
53#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
54
babfc091 55/* This is the upper bound if we deduce max_size from max_use */
71100051 56#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
57
58/* This is the upper bound if we deduce the keep_free value from the
59 * file system size */
60#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61
62/* This is the keep_free value when we can't determine the system
63 * size */
64#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65
8580d1f7
LP
66/* This is the default maximum number of journal files to keep around. */
67#define DEFAULT_N_MAX_FILES (100)
68
dca6219e
LP
69/* n_data was the first entry we added after the initial file format design */
70#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 71
a4bcff5b
LP
72/* How many entries to keep in the entry array chain cache at max */
73#define CHAIN_CACHE_MAX 20
74
a676e665
LP
75/* How much to increase the journal file size at once each time we allocate something new. */
76#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
77
2678031a
LP
78/* Reread fstat() of the file for detecting deletions at least this often */
79#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
80
fa6ac760
LP
81/* The mmap context to use for the header; we pick the one just above the last defined object type */
82#define CONTEXT_HEADER _OBJECT_TYPE_MAX
83
51804460
ZJS
84#ifdef __clang__
85# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
86#endif
87
ac2e41f5
VC
88/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
89 * As a result we use atomic operations on f->offline_state for inter-thread communications with
90 * journal_file_set_offline() and journal_file_set_online(). */
91static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 92 assert(f);
ac2e41f5
VC
93 assert(f->fd >= 0);
94 assert(f->header);
95
96 for (;;) {
97 switch (f->offline_state) {
98 case OFFLINE_CANCEL:
99 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
100 continue;
101 return;
102
103 case OFFLINE_AGAIN_FROM_SYNCING:
104 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
105 continue;
106 break;
107
108 case OFFLINE_AGAIN_FROM_OFFLINING:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
110 continue;
111 break;
112
113 case OFFLINE_SYNCING:
114 (void) fsync(f->fd);
26687bf8 115
ac2e41f5
VC
116 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
117 continue;
26687bf8 118
8eb85171 119 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
120 (void) fsync(f->fd);
121 break;
122
123 case OFFLINE_OFFLINING:
124 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
125 continue;
4831981d 126 _fallthrough_;
ac2e41f5
VC
127 case OFFLINE_DONE:
128 return;
129
130 case OFFLINE_JOINED:
131 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
132 return;
133 }
134 }
135}
136
137static void * journal_file_set_offline_thread(void *arg) {
138 JournalFile *f = arg;
139
fa7ff4cf
LP
140 (void) pthread_setname_np(pthread_self(), "journal-offline");
141
ac2e41f5
VC
142 journal_file_set_offline_internal(f);
143
144 return NULL;
145}
146
147static int journal_file_set_offline_thread_join(JournalFile *f) {
148 int r;
149
150 assert(f);
151
152 if (f->offline_state == OFFLINE_JOINED)
153 return 0;
154
155 r = pthread_join(f->offline_thread, NULL);
156 if (r)
157 return -r;
158
159 f->offline_state = OFFLINE_JOINED;
26687bf8 160
be7cdd8e 161 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
162 return -EIO;
163
ac2e41f5
VC
164 return 0;
165}
26687bf8 166
ac2e41f5
VC
167/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
168static bool journal_file_set_offline_try_restart(JournalFile *f) {
169 for (;;) {
170 switch (f->offline_state) {
171 case OFFLINE_AGAIN_FROM_SYNCING:
172 case OFFLINE_AGAIN_FROM_OFFLINING:
173 return true;
174
175 case OFFLINE_CANCEL:
176 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
177 continue;
178 return true;
179
180 case OFFLINE_SYNCING:
181 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
182 continue;
183 return true;
184
185 case OFFLINE_OFFLINING:
186 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
187 continue;
188 return true;
26687bf8
OS
189
190 default:
ac2e41f5
VC
191 return false;
192 }
26687bf8
OS
193 }
194}
195
ac2e41f5
VC
196/* Sets a journal offline.
197 *
198 * If wait is false then an offline is dispatched in a separate thread for a
199 * subsequent journal_file_set_offline() or journal_file_set_online() of the
200 * same journal to synchronize with.
201 *
202 * If wait is true, then either an existing offline thread will be restarted
203 * and joined, or if none exists the offline is simply performed in this
204 * context without involving another thread.
205 */
206int journal_file_set_offline(JournalFile *f, bool wait) {
207 bool restarted;
208 int r;
209
26687bf8
OS
210 assert(f);
211
212 if (!f->writable)
213 return -EPERM;
214
215 if (!(f->fd >= 0 && f->header))
216 return -EINVAL;
217
b8f99e27
VC
218 /* An offlining journal is implicitly online and may modify f->header->state,
219 * we must also join any potentially lingering offline thread when not online. */
220 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
221 return journal_file_set_offline_thread_join(f);
26687bf8 222
ac2e41f5
VC
223 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
224 restarted = journal_file_set_offline_try_restart(f);
225 if ((restarted && wait) || !restarted) {
226 r = journal_file_set_offline_thread_join(f);
227 if (r < 0)
228 return r;
229 }
26687bf8 230
ac2e41f5
VC
231 if (restarted)
232 return 0;
233
234 /* Initiate a new offline. */
235 f->offline_state = OFFLINE_SYNCING;
fa6ac760 236
ac2e41f5
VC
237 if (wait) /* Without using a thread if waiting. */
238 journal_file_set_offline_internal(f);
239 else {
5e9f01e8
LP
240 sigset_t ss, saved_ss;
241 int k;
242
243 if (sigfillset(&ss) < 0)
244 return -errno;
245
246 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
247 if (r > 0)
248 return -r;
249
ac2e41f5 250 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
5e9f01e8
LP
251
252 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
ec9ffa2c
VC
253 if (r > 0) {
254 f->offline_state = OFFLINE_JOINED;
ac2e41f5 255 return -r;
ec9ffa2c 256 }
5e9f01e8
LP
257 if (k > 0)
258 return -k;
ac2e41f5
VC
259 }
260
261 return 0;
262}
263
264static int journal_file_set_online(JournalFile *f) {
83bf6b67 265 bool wait = true;
ac2e41f5
VC
266
267 assert(f);
268
269 if (!f->writable)
270 return -EPERM;
271
272 if (!(f->fd >= 0 && f->header))
273 return -EINVAL;
274
83bf6b67 275 while (wait) {
ac2e41f5
VC
276 switch (f->offline_state) {
277 case OFFLINE_JOINED:
278 /* No offline thread, no need to wait. */
83bf6b67 279 wait = false;
ac2e41f5
VC
280 break;
281
282 case OFFLINE_SYNCING:
283 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
284 continue;
285 /* Canceled syncing prior to offlining, no need to wait. */
83bf6b67 286 wait = false;
ac2e41f5
VC
287 break;
288
289 case OFFLINE_AGAIN_FROM_SYNCING:
290 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
291 continue;
292 /* Canceled restart from syncing, no need to wait. */
83bf6b67 293 wait = false;
ac2e41f5
VC
294 break;
295
296 case OFFLINE_AGAIN_FROM_OFFLINING:
297 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
298 continue;
299 /* Canceled restart from offlining, must wait for offlining to complete however. */
4831981d 300 _fallthrough_;
ac2e41f5
VC
301 default: {
302 int r;
303
304 r = journal_file_set_offline_thread_join(f);
305 if (r < 0)
306 return r;
307
83bf6b67 308 wait = false;
ac2e41f5
VC
309 break;
310 }
311 }
312 }
26687bf8 313
be7cdd8e 314 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
315 return -EIO;
316
ac2e41f5
VC
317 switch (f->header->state) {
318 case STATE_ONLINE:
319 return 0;
26687bf8 320
ac2e41f5
VC
321 case STATE_OFFLINE:
322 f->header->state = STATE_ONLINE;
323 (void) fsync(f->fd);
324 return 0;
325
326 default:
327 return -EINVAL;
328 }
26687bf8
OS
329}
330
b58c888f
VC
331bool journal_file_is_offlining(JournalFile *f) {
332 assert(f);
333
334 __sync_synchronize();
335
3742095b 336 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
337 return false;
338
339 return true;
340}
341
804ae586 342JournalFile* journal_file_close(JournalFile *f) {
de190aef 343 assert(f);
cec736d2 344
349cc4a5 345#if HAVE_GCRYPT
b0af6f41 346 /* Write the final tag */
43cd8794
FB
347 if (f->seal && f->writable) {
348 int r;
349
350 r = journal_file_append_tag(f);
351 if (r < 0)
352 log_error_errno(r, "Failed to append tag when closing journal: %m");
353 }
feb12d3e 354#endif
b0af6f41 355
7a24f3bf
VC
356 if (f->post_change_timer) {
357 int enabled;
358
359 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
360 if (enabled == SD_EVENT_ONESHOT)
361 journal_file_post_change(f);
362
e167d7fd 363 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
7a24f3bf
VC
364 sd_event_source_unref(f->post_change_timer);
365 }
366
ac2e41f5 367 journal_file_set_offline(f, true);
cec736d2 368
be7cdd8e
VC
369 if (f->mmap && f->cache_fd)
370 mmap_cache_free_fd(f->mmap, f->cache_fd);
cec736d2 371
11689d2a
LP
372 if (f->fd >= 0 && f->defrag_on_close) {
373
374 /* Be friendly to btrfs: turn COW back on again now,
375 * and defragment the file. We won't write to the file
376 * ever again, hence remove all fragmentation, and
377 * reenable all the good bits COW usually provides
378 * (such as data checksumming). */
379
1ed8f8c1 380 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
381 (void) btrfs_defrag_fd(f->fd);
382 }
f27a3864 383
5d1ce257
LP
384 if (f->close_fd)
385 safe_close(f->fd);
cec736d2 386 free(f->path);
807e17f0 387
f649045c 388 mmap_cache_unref(f->mmap);
16e9f408 389
4743015d 390 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 391
349cc4a5 392#if HAVE_XZ || HAVE_LZ4
807e17f0
LP
393 free(f->compress_buffer);
394#endif
395
349cc4a5 396#if HAVE_GCRYPT
baed47c3
LP
397 if (f->fss_file)
398 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 399 else
b7c9ae91
LP
400 free(f->fsprg_state);
401
402 free(f->fsprg_seed);
7560fffc
LP
403
404 if (f->hmac)
405 gcry_md_close(f->hmac);
406#endif
407
6b430fdb 408 return mfree(f);
cec736d2
LP
409}
410
0ac38b70 411static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 412 Header h = {};
cec736d2
LP
413 ssize_t k;
414 int r;
415
416 assert(f);
417
7560fffc 418 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 419 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 420
d89c8fdf
ZJS
421 h.incompatible_flags |= htole32(
422 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
423 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 424
d89c8fdf
ZJS
425 h.compatible_flags = htole32(
426 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 427
cec736d2
LP
428 r = sd_id128_randomize(&h.file_id);
429 if (r < 0)
430 return r;
431
0ac38b70
LP
432 if (template) {
433 h.seqnum_id = template->header->seqnum_id;
beec0085 434 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
435 } else
436 h.seqnum_id = h.file_id;
cec736d2
LP
437
438 k = pwrite(f->fd, &h, sizeof(h), 0);
439 if (k < 0)
440 return -errno;
441
442 if (k != sizeof(h))
443 return -EIO;
444
445 return 0;
446}
447
448static int journal_file_refresh_header(JournalFile *f) {
de190aef 449 sd_id128_t boot_id;
fa6ac760 450 int r;
cec736d2
LP
451
452 assert(f);
c88cc6af 453 assert(f->header);
cec736d2
LP
454
455 r = sd_id128_get_machine(&f->header->machine_id);
fd4885df
ZJS
456 if (IN_SET(r, -ENOENT, -ENOMEDIUM))
457 /* We don't have a machine-id, let's continue without */
458 zero(f->header->machine_id);
459 else if (r < 0)
cec736d2
LP
460 return r;
461
de190aef 462 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
463 if (r < 0)
464 return r;
465
de190aef
LP
466 f->header->boot_id = boot_id;
467
fa6ac760 468 r = journal_file_set_online(f);
b788cc23 469
7560fffc 470 /* Sync the online state to disk */
fb426037 471 (void) fsync(f->fd);
b788cc23 472
a0fe2a2d
LP
473 /* We likely just created a new file, also sync the directory this file is located in. */
474 (void) fsync_directory_of_file(f->fd);
475
fa6ac760 476 return r;
cec736d2
LP
477}
478
4214009f
ZJS
479static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
480 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
481 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
482 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
483 uint32_t flags;
484
4214009f
ZJS
485 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
486
487 if (flags & ~supported) {
488 if (flags & ~any)
4761fd0f 489 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
490 f->path, type, flags & ~any);
491 flags = (flags & any) & ~supported;
4761fd0f
ZJS
492 if (flags) {
493 const char* strv[3];
494 unsigned n = 0;
495 _cleanup_free_ char *t = NULL;
496
497 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
498 strv[n++] = "sealed";
499 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
500 strv[n++] = "xz-compressed";
501 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
502 strv[n++] = "lz4-compressed";
503 strv[n] = NULL;
504 assert(n < ELEMENTSOF(strv));
505
506 t = strv_join((char**) strv, ", ");
507 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
508 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
509 }
4214009f
ZJS
510 return true;
511 }
512
513 return false;
514}
515
516static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
517 uint64_t arena_size, header_size;
518
cec736d2 519 assert(f);
c88cc6af 520 assert(f->header);
cec736d2 521
7560fffc 522 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
523 return -EBADMSG;
524
4214009f
ZJS
525 /* In both read and write mode we refuse to open files with incompatible
526 * flags we don't know. */
527 if (warn_wrong_flags(f, false))
cec736d2
LP
528 return -EPROTONOSUPPORT;
529
4214009f
ZJS
530 /* When open for writing we refuse to open files with compatible flags, too. */
531 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 532 return -EPROTONOSUPPORT;
7560fffc 533
db11ac1a
LP
534 if (f->header->state >= _STATE_MAX)
535 return -EBADMSG;
536
6f94e420
TS
537 header_size = le64toh(f->header->header_size);
538
dca6219e 539 /* The first addition was n_data, so check that we are at least this large */
6f94e420 540 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
541 return -EBADMSG;
542
8088cbd3 543 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
544 return -EBADMSG;
545
6f94e420
TS
546 arena_size = le64toh(f->header->arena_size);
547
548 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
549 return -ENODATA;
550
6f94e420 551 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
552 return -ENODATA;
553
7762e02b
LP
554 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
555 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
556 !VALID64(le64toh(f->header->tail_object_offset)) ||
557 !VALID64(le64toh(f->header->entry_array_offset)))
558 return -ENODATA;
559
cec736d2 560 if (f->writable) {
cec736d2 561 sd_id128_t machine_id;
ae739cc1 562 uint8_t state;
cec736d2
LP
563 int r;
564
565 r = sd_id128_get_machine(&machine_id);
566 if (r < 0)
567 return r;
568
569 if (!sd_id128_equal(machine_id, f->header->machine_id))
570 return -EHOSTDOWN;
571
de190aef 572 state = f->header->state;
cec736d2 573
b288cdeb
ZJS
574 if (state == STATE_ARCHIVED)
575 return -ESHUTDOWN; /* Already archived */
576 else if (state == STATE_ONLINE) {
71fa6f00
LP
577 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
578 return -EBUSY;
b288cdeb 579 } else if (state != STATE_OFFLINE) {
8facc349 580 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
581 return -EBUSY;
582 }
ae739cc1 583
5b3cc0c8
YN
584 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
585 return -EBADMSG;
586
ae739cc1
LP
587 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
588 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
589 * bisection. */
590 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
591 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
592 return -ETXTBSY;
593 }
cec736d2
LP
594 }
595
d89c8fdf
ZJS
596 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
597 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 598
f1889c91 599 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 600
cec736d2
LP
601 return 0;
602}
603
2678031a 604static int journal_file_fstat(JournalFile *f) {
3cc44114
LP
605 int r;
606
2678031a
LP
607 assert(f);
608 assert(f->fd >= 0);
609
610 if (fstat(f->fd, &f->last_stat) < 0)
611 return -errno;
612
613 f->last_stat_usec = now(CLOCK_MONOTONIC);
614
8d6a4d33 615 /* Refuse dealing with with files that aren't regular */
3cc44114
LP
616 r = stat_verify_regular(&f->last_stat);
617 if (r < 0)
618 return r;
8d6a4d33 619
2678031a
LP
620 /* Refuse appending to files that are already deleted */
621 if (f->last_stat.st_nlink <= 0)
622 return -EIDRM;
623
624 return 0;
625}
626
cec736d2 627static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 628 uint64_t old_size, new_size;
fec2aa2f 629 int r;
cec736d2
LP
630
631 assert(f);
c88cc6af 632 assert(f->header);
cec736d2 633
cec736d2 634 /* We assume that this file is not sparse, and we know that
38ac38b2 635 * for sure, since we always call posix_fallocate()
cec736d2
LP
636 * ourselves */
637
be7cdd8e 638 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
639 return -EIO;
640
cec736d2 641 old_size =
23b0b2b2 642 le64toh(f->header->header_size) +
cec736d2
LP
643 le64toh(f->header->arena_size);
644
bc85bfee 645 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
646 if (new_size < le64toh(f->header->header_size))
647 new_size = le64toh(f->header->header_size);
bc85bfee 648
2678031a
LP
649 if (new_size <= old_size) {
650
651 /* We already pre-allocated enough space, but before
652 * we write to it, let's check with fstat() if the
653 * file got deleted, in order make sure we don't throw
654 * away the data immediately. Don't check fstat() for
655 * all writes though, but only once ever 10s. */
656
657 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
658 return 0;
659
660 return journal_file_fstat(f);
661 }
662
663 /* Allocate more space. */
cec736d2 664
a676e665 665 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 666 return -E2BIG;
cec736d2 667
a676e665 668 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
669 struct statvfs svfs;
670
671 if (fstatvfs(f->fd, &svfs) >= 0) {
672 uint64_t available;
673
070052ab 674 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
675
676 if (new_size - old_size > available)
677 return -E2BIG;
678 }
679 }
680
eda4b58b 681 /* Increase by larger blocks at once */
be6b0c21 682 new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
eda4b58b
LP
683 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
684 new_size = f->metrics.max_size;
685
bc85bfee
LP
686 /* Note that the glibc fallocate() fallback is very
687 inefficient, hence we try to minimize the allocation area
688 as we can. */
fec2aa2f
GV
689 r = posix_fallocate(f->fd, old_size, new_size - old_size);
690 if (r != 0)
691 return -r;
cec736d2 692
23b0b2b2 693 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 694
2678031a 695 return journal_file_fstat(f);
cec736d2
LP
696}
697
78519831 698static unsigned type_to_context(ObjectType type) {
d3d3208f 699 /* One context for each type, plus one catch-all for the rest */
69adae51 700 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 701 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 702 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
703}
704
b439282e 705static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
2678031a
LP
706 int r;
707
cec736d2 708 assert(f);
cec736d2
LP
709 assert(ret);
710
7762e02b
LP
711 if (size <= 0)
712 return -EINVAL;
713
2a59ea54 714 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
715 if (offset + size > (uint64_t) f->last_stat.st_size) {
716 /* Hmm, out of range? Let's refresh the fstat() data
717 * first, before we trust that check. */
718
2678031a
LP
719 r = journal_file_fstat(f);
720 if (r < 0)
721 return r;
722
723 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
724 return -EADDRNOTAVAIL;
725 }
726
b439282e 727 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
cec736d2
LP
728}
729
16e9f408
LP
730static uint64_t minimum_header_size(Object *o) {
731
b8e891e6 732 static const uint64_t table[] = {
16e9f408
LP
733 [OBJECT_DATA] = sizeof(DataObject),
734 [OBJECT_FIELD] = sizeof(FieldObject),
735 [OBJECT_ENTRY] = sizeof(EntryObject),
736 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
737 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
738 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
739 [OBJECT_TAG] = sizeof(TagObject),
740 };
741
742 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
743 return sizeof(ObjectHeader);
744
745 return table[o->object.type];
746}
747
24754f36
TR
748/* Lightweight object checks. We want this to be fast, so that we won't
749 * slowdown every journal_file_move_to_object() call too much. */
750static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
751 assert(f);
752 assert(o);
753
754 switch (o->object.type) {
755
756 case OBJECT_DATA: {
757 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
758 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
10e8445b 759 le64toh(o->data.n_entries), offset);
24754f36
TR
760 return -EBADMSG;
761 }
762
763 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
764 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
765 offsetof(DataObject, payload),
766 le64toh(o->object.size),
767 offset);
768 return -EBADMSG;
769 }
770
10e8445b
TR
771 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
772 !VALID64(le64toh(o->data.next_field_offset)) ||
773 !VALID64(le64toh(o->data.entry_offset)) ||
774 !VALID64(le64toh(o->data.entry_array_offset))) {
24754f36
TR
775 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
776 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
10e8445b
TR
777 le64toh(o->data.next_hash_offset),
778 le64toh(o->data.next_field_offset),
779 le64toh(o->data.entry_offset),
780 le64toh(o->data.entry_array_offset),
24754f36
TR
781 offset);
782 return -EBADMSG;
783 }
784
785 break;
786 }
787
788 case OBJECT_FIELD:
789 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
790 log_debug(
791 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
792 offsetof(FieldObject, payload),
793 le64toh(o->object.size),
794 offset);
795 return -EBADMSG;
796 }
797
10e8445b
TR
798 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
799 !VALID64(le64toh(o->field.head_data_offset))) {
24754f36
TR
800 log_debug(
801 "Invalid offset, next_hash_offset="OFSfmt
802 ", head_data_offset="OFSfmt": %"PRIu64,
10e8445b
TR
803 le64toh(o->field.next_hash_offset),
804 le64toh(o->field.head_data_offset),
24754f36
TR
805 offset);
806 return -EBADMSG;
807 }
808 break;
809
810 case OBJECT_ENTRY:
811 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
812 log_debug(
813 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
814 offsetof(EntryObject, items),
815 le64toh(o->object.size),
816 offset);
817 return -EBADMSG;
818 }
819
820 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
821 log_debug(
822 "Invalid number items in entry: %"PRIu64": %"PRIu64,
823 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
824 offset);
825 return -EBADMSG;
826 }
827
828 if (le64toh(o->entry.seqnum) <= 0) {
829 log_debug(
830 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
831 le64toh(o->entry.seqnum),
832 offset);
833 return -EBADMSG;
834 }
835
836 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
837 log_debug(
838 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
839 le64toh(o->entry.realtime),
840 offset);
841 return -EBADMSG;
842 }
843
844 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
845 log_debug(
846 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
847 le64toh(o->entry.monotonic),
848 offset);
849 return -EBADMSG;
850 }
851
852 break;
853
854 case OBJECT_DATA_HASH_TABLE:
855 case OBJECT_FIELD_HASH_TABLE:
856 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
857 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
858 log_debug(
859 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
860 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
861 le64toh(o->object.size),
862 offset);
863 return -EBADMSG;
864 }
865
866 break;
867
868 case OBJECT_ENTRY_ARRAY:
869 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
870 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
871 log_debug(
872 "Invalid object entry array size: %"PRIu64": %"PRIu64,
873 le64toh(o->object.size),
874 offset);
875 return -EBADMSG;
876 }
877
10e8445b 878 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
24754f36
TR
879 log_debug(
880 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
10e8445b 881 le64toh(o->entry_array.next_entry_array_offset),
24754f36
TR
882 offset);
883 return -EBADMSG;
884 }
885
886 break;
887
888 case OBJECT_TAG:
889 if (le64toh(o->object.size) != sizeof(TagObject)) {
890 log_debug(
891 "Invalid object tag size: %"PRIu64": %"PRIu64,
892 le64toh(o->object.size),
893 offset);
894 return -EBADMSG;
895 }
896
10e8445b 897 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
24754f36
TR
898 log_debug(
899 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
10e8445b 900 le64toh(o->tag.epoch),
24754f36
TR
901 offset);
902 return -EBADMSG;
903 }
904
905 break;
906 }
907
908 return 0;
909}
910
78519831 911int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
912 int r;
913 void *t;
b439282e 914 size_t tsize;
cec736d2
LP
915 Object *o;
916 uint64_t s;
917
918 assert(f);
919 assert(ret);
920
db11ac1a 921 /* Objects may only be located at multiple of 64 bit */
202fd896
LP
922 if (!VALID64(offset)) {
923 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
bd30fdf2 924 return -EBADMSG;
202fd896 925 }
db11ac1a 926
50809d7a 927 /* Object may not be located in the file header */
202fd896
LP
928 if (offset < le64toh(f->header->header_size)) {
929 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
50809d7a 930 return -EBADMSG;
202fd896 931 }
50809d7a 932
b439282e 933 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
cec736d2
LP
934 if (r < 0)
935 return r;
936
937 o = (Object*) t;
938 s = le64toh(o->object.size);
939
1c69f096
LP
940 if (s == 0) {
941 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
942 return -EBADMSG;
943 }
202fd896
LP
944 if (s < sizeof(ObjectHeader)) {
945 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
cec736d2 946 return -EBADMSG;
202fd896 947 }
cec736d2 948
202fd896
LP
949 if (o->object.type <= OBJECT_UNUSED) {
950 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
16e9f408 951 return -EBADMSG;
202fd896 952 }
16e9f408 953
202fd896
LP
954 if (s < minimum_header_size(o)) {
955 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
16e9f408 956 return -EBADMSG;
202fd896 957 }
16e9f408 958
202fd896
LP
959 if (type > OBJECT_UNUSED && o->object.type != type) {
960 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
cec736d2 961 return -EBADMSG;
202fd896 962 }
cec736d2 963
b439282e
VC
964 if (s > tsize) {
965 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
cec736d2
LP
966 if (r < 0)
967 return r;
968
969 o = (Object*) t;
970 }
971
24754f36
TR
972 r = journal_file_check_object(f, offset, o);
973 if (r < 0)
974 return r;
975
cec736d2
LP
976 *ret = o;
977 return 0;
978}
979
d98cc1f2 980static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
981 uint64_t r;
982
983 assert(f);
c88cc6af 984 assert(f->header);
cec736d2 985
beec0085 986 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
987
988 if (seqnum) {
de190aef 989 /* If an external seqnum counter was passed, we update
c2373f84
LP
990 * both the local and the external one, and set it to
991 * the maximum of both */
992
993 if (*seqnum + 1 > r)
994 r = *seqnum + 1;
995
996 *seqnum = r;
997 }
998
beec0085 999 f->header->tail_entry_seqnum = htole64(r);
cec736d2 1000
beec0085
LP
1001 if (f->header->head_entry_seqnum == 0)
1002 f->header->head_entry_seqnum = htole64(r);
de190aef 1003
cec736d2
LP
1004 return r;
1005}
1006
78519831 1007int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
1008 int r;
1009 uint64_t p;
1010 Object *tail, *o;
1011 void *t;
1012
1013 assert(f);
c88cc6af 1014 assert(f->header);
d05089d8 1015 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
1016 assert(size >= sizeof(ObjectHeader));
1017 assert(offset);
1018 assert(ret);
1019
26687bf8
OS
1020 r = journal_file_set_online(f);
1021 if (r < 0)
1022 return r;
1023
cec736d2 1024 p = le64toh(f->header->tail_object_offset);
cec736d2 1025 if (p == 0)
23b0b2b2 1026 p = le64toh(f->header->header_size);
cec736d2 1027 else {
d05089d8 1028 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
1029 if (r < 0)
1030 return r;
1031
1032 p += ALIGN64(le64toh(tail->object.size));
1033 }
1034
1035 r = journal_file_allocate(f, p, size);
1036 if (r < 0)
1037 return r;
1038
b439282e 1039 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
cec736d2
LP
1040 if (r < 0)
1041 return r;
1042
1043 o = (Object*) t;
1044
1045 zero(o->object);
de190aef 1046 o->object.type = type;
cec736d2
LP
1047 o->object.size = htole64(size);
1048
1049 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1050 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1051
1052 *ret = o;
1053 *offset = p;
1054
1055 return 0;
1056}
1057
de190aef 1058static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1059 uint64_t s, p;
1060 Object *o;
1061 int r;
1062
1063 assert(f);
c88cc6af 1064 assert(f->header);
cec736d2 1065
070052ab
LP
1066 /* We estimate that we need 1 hash table entry per 768 bytes
1067 of journal file and we want to make sure we never get
1068 beyond 75% fill level. Calculate the hash table size for
1069 the maximum file size based on these metrics. */
4a92baf3 1070
dfabe643 1071 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1072 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1073 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1074
507f22bd 1075 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 1076
de190aef
LP
1077 r = journal_file_append_object(f,
1078 OBJECT_DATA_HASH_TABLE,
1079 offsetof(Object, hash_table.items) + s,
1080 &o, &p);
cec736d2
LP
1081 if (r < 0)
1082 return r;
1083
29804cc1 1084 memzero(o->hash_table.items, s);
cec736d2 1085
de190aef
LP
1086 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1087 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1088
1089 return 0;
1090}
1091
de190aef 1092static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1093 uint64_t s, p;
1094 Object *o;
1095 int r;
1096
1097 assert(f);
c88cc6af 1098 assert(f->header);
cec736d2 1099
3c1668da
LP
1100 /* We use a fixed size hash table for the fields as this
1101 * number should grow very slowly only */
1102
de190aef
LP
1103 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1104 r = journal_file_append_object(f,
1105 OBJECT_FIELD_HASH_TABLE,
1106 offsetof(Object, hash_table.items) + s,
1107 &o, &p);
cec736d2
LP
1108 if (r < 0)
1109 return r;
1110
29804cc1 1111 memzero(o->hash_table.items, s);
cec736d2 1112
de190aef
LP
1113 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1114 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1115
1116 return 0;
1117}
1118
dade37d4 1119int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1120 uint64_t s, p;
1121 void *t;
1122 int r;
1123
1124 assert(f);
c88cc6af 1125 assert(f->header);
cec736d2 1126
dade37d4
LP
1127 if (f->data_hash_table)
1128 return 0;
1129
de190aef
LP
1130 p = le64toh(f->header->data_hash_table_offset);
1131 s = le64toh(f->header->data_hash_table_size);
cec736d2 1132
de190aef 1133 r = journal_file_move_to(f,
16e9f408 1134 OBJECT_DATA_HASH_TABLE,
fcde2389 1135 true,
de190aef 1136 p, s,
b42549ad 1137 &t, NULL);
cec736d2
LP
1138 if (r < 0)
1139 return r;
1140
de190aef 1141 f->data_hash_table = t;
cec736d2
LP
1142 return 0;
1143}
1144
dade37d4 1145int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1146 uint64_t s, p;
1147 void *t;
1148 int r;
1149
1150 assert(f);
c88cc6af 1151 assert(f->header);
cec736d2 1152
dade37d4
LP
1153 if (f->field_hash_table)
1154 return 0;
1155
de190aef
LP
1156 p = le64toh(f->header->field_hash_table_offset);
1157 s = le64toh(f->header->field_hash_table_size);
cec736d2 1158
de190aef 1159 r = journal_file_move_to(f,
16e9f408 1160 OBJECT_FIELD_HASH_TABLE,
fcde2389 1161 true,
de190aef 1162 p, s,
b42549ad 1163 &t, NULL);
cec736d2
LP
1164 if (r < 0)
1165 return r;
1166
de190aef 1167 f->field_hash_table = t;
cec736d2
LP
1168 return 0;
1169}
1170
3c1668da
LP
1171static int journal_file_link_field(
1172 JournalFile *f,
1173 Object *o,
1174 uint64_t offset,
1175 uint64_t hash) {
1176
805d1486 1177 uint64_t p, h, m;
3c1668da
LP
1178 int r;
1179
1180 assert(f);
c88cc6af 1181 assert(f->header);
90d222c1 1182 assert(f->field_hash_table);
3c1668da
LP
1183 assert(o);
1184 assert(offset > 0);
1185
1186 if (o->object.type != OBJECT_FIELD)
1187 return -EINVAL;
1188
805d1486
LP
1189 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1190 if (m <= 0)
1191 return -EBADMSG;
3c1668da 1192
805d1486 1193 /* This might alter the window we are looking at */
3c1668da
LP
1194 o->field.next_hash_offset = o->field.head_data_offset = 0;
1195
805d1486 1196 h = hash % m;
3c1668da
LP
1197 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1198 if (p == 0)
1199 f->field_hash_table[h].head_hash_offset = htole64(offset);
1200 else {
1201 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1202 if (r < 0)
1203 return r;
1204
1205 o->field.next_hash_offset = htole64(offset);
1206 }
1207
1208 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1209
1210 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1211 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1212
1213 return 0;
1214}
1215
1216static int journal_file_link_data(
1217 JournalFile *f,
1218 Object *o,
1219 uint64_t offset,
1220 uint64_t hash) {
1221
805d1486 1222 uint64_t p, h, m;
cec736d2
LP
1223 int r;
1224
1225 assert(f);
c88cc6af 1226 assert(f->header);
90d222c1 1227 assert(f->data_hash_table);
cec736d2
LP
1228 assert(o);
1229 assert(offset > 0);
b588975f
LP
1230
1231 if (o->object.type != OBJECT_DATA)
1232 return -EINVAL;
cec736d2 1233
805d1486
LP
1234 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1235 if (m <= 0)
1236 return -EBADMSG;
48496df6 1237
805d1486 1238 /* This might alter the window we are looking at */
de190aef
LP
1239 o->data.next_hash_offset = o->data.next_field_offset = 0;
1240 o->data.entry_offset = o->data.entry_array_offset = 0;
1241 o->data.n_entries = 0;
cec736d2 1242
805d1486 1243 h = hash % m;
8db4213e 1244 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1245 if (p == 0)
cec736d2 1246 /* Only entry in the hash table is easy */
de190aef 1247 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1248 else {
48496df6
LP
1249 /* Move back to the previous data object, to patch in
1250 * pointer */
cec736d2 1251
de190aef 1252 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1253 if (r < 0)
1254 return r;
1255
de190aef 1256 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1257 }
1258
de190aef 1259 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1260
dca6219e
LP
1261 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1262 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1263
cec736d2
LP
1264 return 0;
1265}
1266
3c1668da
LP
1267int journal_file_find_field_object_with_hash(
1268 JournalFile *f,
1269 const void *field, uint64_t size, uint64_t hash,
1270 Object **ret, uint64_t *offset) {
1271
805d1486 1272 uint64_t p, osize, h, m;
3c1668da
LP
1273 int r;
1274
1275 assert(f);
c88cc6af 1276 assert(f->header);
3c1668da
LP
1277 assert(field && size > 0);
1278
dade37d4
LP
1279 /* If the field hash table is empty, we can't find anything */
1280 if (le64toh(f->header->field_hash_table_size) <= 0)
1281 return 0;
1282
1283 /* Map the field hash table, if it isn't mapped yet. */
1284 r = journal_file_map_field_hash_table(f);
1285 if (r < 0)
1286 return r;
1287
3c1668da
LP
1288 osize = offsetof(Object, field.payload) + size;
1289
805d1486 1290 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 1291 if (m <= 0)
3c1668da
LP
1292 return -EBADMSG;
1293
805d1486 1294 h = hash % m;
3c1668da
LP
1295 p = le64toh(f->field_hash_table[h].head_hash_offset);
1296
1297 while (p > 0) {
1298 Object *o;
1299
1300 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1301 if (r < 0)
1302 return r;
1303
1304 if (le64toh(o->field.hash) == hash &&
1305 le64toh(o->object.size) == osize &&
1306 memcmp(o->field.payload, field, size) == 0) {
1307
1308 if (ret)
1309 *ret = o;
1310 if (offset)
1311 *offset = p;
1312
1313 return 1;
1314 }
1315
1316 p = le64toh(o->field.next_hash_offset);
1317 }
1318
1319 return 0;
1320}
1321
1322int journal_file_find_field_object(
1323 JournalFile *f,
1324 const void *field, uint64_t size,
1325 Object **ret, uint64_t *offset) {
1326
1327 uint64_t hash;
1328
1329 assert(f);
1330 assert(field && size > 0);
1331
1332 hash = hash64(field, size);
1333
1334 return journal_file_find_field_object_with_hash(f,
1335 field, size, hash,
1336 ret, offset);
1337}
1338
de190aef
LP
1339int journal_file_find_data_object_with_hash(
1340 JournalFile *f,
1341 const void *data, uint64_t size, uint64_t hash,
1342 Object **ret, uint64_t *offset) {
48496df6 1343
805d1486 1344 uint64_t p, osize, h, m;
cec736d2
LP
1345 int r;
1346
1347 assert(f);
c88cc6af 1348 assert(f->header);
cec736d2
LP
1349 assert(data || size == 0);
1350
dade37d4
LP
1351 /* If there's no data hash table, then there's no entry. */
1352 if (le64toh(f->header->data_hash_table_size) <= 0)
1353 return 0;
1354
1355 /* Map the data hash table, if it isn't mapped yet. */
1356 r = journal_file_map_data_hash_table(f);
1357 if (r < 0)
1358 return r;
1359
cec736d2
LP
1360 osize = offsetof(Object, data.payload) + size;
1361
805d1486
LP
1362 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1363 if (m <= 0)
bc85bfee
LP
1364 return -EBADMSG;
1365
805d1486 1366 h = hash % m;
de190aef 1367 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1368
de190aef
LP
1369 while (p > 0) {
1370 Object *o;
cec736d2 1371
de190aef 1372 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1373 if (r < 0)
1374 return r;
1375
807e17f0 1376 if (le64toh(o->data.hash) != hash)
85a131e8 1377 goto next;
807e17f0 1378
d89c8fdf 1379 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 1380#if HAVE_XZ || HAVE_LZ4
fa1c4b51 1381 uint64_t l;
a7f7d1bd 1382 size_t rsize = 0;
cec736d2 1383
807e17f0
LP
1384 l = le64toh(o->object.size);
1385 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1386 return -EBADMSG;
1387
807e17f0
LP
1388 l -= offsetof(Object, data.payload);
1389
d89c8fdf
ZJS
1390 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1391 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1392 if (r < 0)
1393 return r;
807e17f0 1394
b785c858 1395 if (rsize == size &&
807e17f0
LP
1396 memcmp(f->compress_buffer, data, size) == 0) {
1397
1398 if (ret)
1399 *ret = o;
1400
1401 if (offset)
1402 *offset = p;
1403
1404 return 1;
1405 }
3b1a55e1
ZJS
1406#else
1407 return -EPROTONOSUPPORT;
1408#endif
807e17f0
LP
1409 } else if (le64toh(o->object.size) == osize &&
1410 memcmp(o->data.payload, data, size) == 0) {
1411
cec736d2
LP
1412 if (ret)
1413 *ret = o;
1414
1415 if (offset)
1416 *offset = p;
1417
de190aef 1418 return 1;
cec736d2
LP
1419 }
1420
85a131e8 1421 next:
cec736d2
LP
1422 p = le64toh(o->data.next_hash_offset);
1423 }
1424
de190aef
LP
1425 return 0;
1426}
1427
1428int journal_file_find_data_object(
1429 JournalFile *f,
1430 const void *data, uint64_t size,
1431 Object **ret, uint64_t *offset) {
1432
1433 uint64_t hash;
1434
1435 assert(f);
1436 assert(data || size == 0);
1437
1438 hash = hash64(data, size);
1439
1440 return journal_file_find_data_object_with_hash(f,
1441 data, size, hash,
1442 ret, offset);
1443}
1444
3c1668da
LP
1445static int journal_file_append_field(
1446 JournalFile *f,
1447 const void *field, uint64_t size,
1448 Object **ret, uint64_t *offset) {
1449
1450 uint64_t hash, p;
1451 uint64_t osize;
1452 Object *o;
1453 int r;
1454
1455 assert(f);
1456 assert(field && size > 0);
1457
1458 hash = hash64(field, size);
1459
1460 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1461 if (r < 0)
1462 return r;
1463 else if (r > 0) {
1464
1465 if (ret)
1466 *ret = o;
1467
1468 if (offset)
1469 *offset = p;
1470
1471 return 0;
1472 }
1473
1474 osize = offsetof(Object, field.payload) + size;
1475 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1476 if (r < 0)
1477 return r;
3c1668da
LP
1478
1479 o->field.hash = htole64(hash);
1480 memcpy(o->field.payload, field, size);
1481
1482 r = journal_file_link_field(f, o, p, hash);
1483 if (r < 0)
1484 return r;
1485
1486 /* The linking might have altered the window, so let's
1487 * refresh our pointer */
1488 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1489 if (r < 0)
1490 return r;
1491
349cc4a5 1492#if HAVE_GCRYPT
3c1668da
LP
1493 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1494 if (r < 0)
1495 return r;
1496#endif
1497
1498 if (ret)
1499 *ret = o;
1500
1501 if (offset)
1502 *offset = p;
1503
1504 return 0;
1505}
1506
48496df6
LP
1507static int journal_file_append_data(
1508 JournalFile *f,
1509 const void *data, uint64_t size,
1510 Object **ret, uint64_t *offset) {
1511
de190aef
LP
1512 uint64_t hash, p;
1513 uint64_t osize;
1514 Object *o;
d89c8fdf 1515 int r, compression = 0;
3c1668da 1516 const void *eq;
de190aef
LP
1517
1518 assert(f);
1519 assert(data || size == 0);
1520
1521 hash = hash64(data, size);
1522
1523 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1524 if (r < 0)
1525 return r;
0240c603 1526 if (r > 0) {
de190aef
LP
1527
1528 if (ret)
1529 *ret = o;
1530
1531 if (offset)
1532 *offset = p;
1533
1534 return 0;
1535 }
1536
1537 osize = offsetof(Object, data.payload) + size;
1538 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1539 if (r < 0)
1540 return r;
1541
cec736d2 1542 o->data.hash = htole64(hash);
807e17f0 1543
349cc4a5 1544#if HAVE_XZ || HAVE_LZ4
57850536 1545 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
a7f7d1bd 1546 size_t rsize = 0;
807e17f0 1547
5d6f46b6 1548 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1549
d1afbcd2 1550 if (compression >= 0) {
807e17f0 1551 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1552 o->object.flags |= compression;
807e17f0 1553
fa1c4b51 1554 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1555 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1556 } else
1557 /* Compression didn't work, we don't really care why, let's continue without compression */
1558 compression = 0;
807e17f0
LP
1559 }
1560#endif
1561
75f32f04
ZJS
1562 if (compression == 0)
1563 memcpy_safe(o->data.payload, data, size);
cec736d2 1564
de190aef 1565 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1566 if (r < 0)
1567 return r;
1568
349cc4a5 1569#if HAVE_GCRYPT
33685a5a
FB
1570 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1571 if (r < 0)
1572 return r;
1573#endif
1574
48496df6
LP
1575 /* The linking might have altered the window, so let's
1576 * refresh our pointer */
1577 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1578 if (r < 0)
1579 return r;
1580
08c6f819
SL
1581 if (!data)
1582 eq = NULL;
1583 else
1584 eq = memchr(data, '=', size);
3c1668da 1585 if (eq && eq > data) {
748db592 1586 Object *fo = NULL;
3c1668da 1587 uint64_t fp;
3c1668da
LP
1588
1589 /* Create field object ... */
1590 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1591 if (r < 0)
1592 return r;
1593
1594 /* ... and link it in. */
1595 o->data.next_field_offset = fo->field.head_data_offset;
1596 fo->field.head_data_offset = le64toh(p);
1597 }
1598
cec736d2
LP
1599 if (ret)
1600 *ret = o;
1601
1602 if (offset)
de190aef 1603 *offset = p;
cec736d2
LP
1604
1605 return 0;
1606}
1607
1608uint64_t journal_file_entry_n_items(Object *o) {
1609 assert(o);
b588975f
LP
1610
1611 if (o->object.type != OBJECT_ENTRY)
1612 return 0;
cec736d2
LP
1613
1614 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1615}
1616
0284adc6 1617uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1618 assert(o);
b588975f
LP
1619
1620 if (o->object.type != OBJECT_ENTRY_ARRAY)
1621 return 0;
de190aef
LP
1622
1623 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1624}
1625
fb9a24b6
LP
1626uint64_t journal_file_hash_table_n_items(Object *o) {
1627 assert(o);
b588975f 1628
ec2ce0c5 1629 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1630 return 0;
fb9a24b6
LP
1631
1632 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1633}
1634
de190aef 1635static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1636 le64_t *first,
1637 le64_t *idx,
de190aef 1638 uint64_t p) {
cec736d2 1639 int r;
de190aef
LP
1640 uint64_t n = 0, ap = 0, q, i, a, hidx;
1641 Object *o;
1642
cec736d2 1643 assert(f);
c88cc6af 1644 assert(f->header);
de190aef
LP
1645 assert(first);
1646 assert(idx);
1647 assert(p > 0);
cec736d2 1648
de190aef
LP
1649 a = le64toh(*first);
1650 i = hidx = le64toh(*idx);
1651 while (a > 0) {
1652
1653 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1654 if (r < 0)
1655 return r;
cec736d2 1656
de190aef
LP
1657 n = journal_file_entry_array_n_items(o);
1658 if (i < n) {
1659 o->entry_array.items[i] = htole64(p);
1660 *idx = htole64(hidx + 1);
1661 return 0;
1662 }
cec736d2 1663
de190aef
LP
1664 i -= n;
1665 ap = a;
1666 a = le64toh(o->entry_array.next_entry_array_offset);
1667 }
1668
1669 if (hidx > n)
1670 n = (hidx+1) * 2;
1671 else
1672 n = n * 2;
1673
1674 if (n < 4)
1675 n = 4;
1676
1677 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1678 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1679 &o, &q);
cec736d2
LP
1680 if (r < 0)
1681 return r;
1682
349cc4a5 1683#if HAVE_GCRYPT
5996c7c2 1684 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1685 if (r < 0)
1686 return r;
feb12d3e 1687#endif
b0af6f41 1688
de190aef 1689 o->entry_array.items[i] = htole64(p);
cec736d2 1690
de190aef 1691 if (ap == 0)
7be3aa17 1692 *first = htole64(q);
cec736d2 1693 else {
de190aef 1694 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1695 if (r < 0)
1696 return r;
1697
de190aef
LP
1698 o->entry_array.next_entry_array_offset = htole64(q);
1699 }
cec736d2 1700
2dee23eb
LP
1701 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1702 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1703
de190aef
LP
1704 *idx = htole64(hidx + 1);
1705
1706 return 0;
1707}
cec736d2 1708
de190aef 1709static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1710 le64_t *extra,
1711 le64_t *first,
1712 le64_t *idx,
de190aef
LP
1713 uint64_t p) {
1714
1715 int r;
1716
1717 assert(f);
1718 assert(extra);
1719 assert(first);
1720 assert(idx);
1721 assert(p > 0);
1722
1723 if (*idx == 0)
1724 *extra = htole64(p);
1725 else {
4fd052ae 1726 le64_t i;
de190aef 1727
7be3aa17 1728 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1729 r = link_entry_into_array(f, first, &i, p);
1730 if (r < 0)
1731 return r;
cec736d2
LP
1732 }
1733
de190aef
LP
1734 *idx = htole64(le64toh(*idx) + 1);
1735 return 0;
1736}
1737
1738static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1739 uint64_t p;
1740 int r;
1741 assert(f);
1742 assert(o);
1743 assert(offset > 0);
1744
1745 p = le64toh(o->entry.items[i].object_offset);
1746 if (p == 0)
1747 return -EINVAL;
1748
1749 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1750 if (r < 0)
1751 return r;
1752
de190aef
LP
1753 return link_entry_into_array_plus_one(f,
1754 &o->data.entry_offset,
1755 &o->data.entry_array_offset,
1756 &o->data.n_entries,
1757 offset);
cec736d2
LP
1758}
1759
1760static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1761 uint64_t n, i;
cec736d2
LP
1762 int r;
1763
1764 assert(f);
c88cc6af 1765 assert(f->header);
cec736d2
LP
1766 assert(o);
1767 assert(offset > 0);
b588975f
LP
1768
1769 if (o->object.type != OBJECT_ENTRY)
1770 return -EINVAL;
cec736d2 1771
b788cc23
LP
1772 __sync_synchronize();
1773
cec736d2 1774 /* Link up the entry itself */
de190aef
LP
1775 r = link_entry_into_array(f,
1776 &f->header->entry_array_offset,
1777 &f->header->n_entries,
1778 offset);
1779 if (r < 0)
1780 return r;
cec736d2 1781
507f22bd 1782 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1783
de190aef 1784 if (f->header->head_entry_realtime == 0)
0ac38b70 1785 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1786
0ac38b70 1787 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1788 f->header->tail_entry_monotonic = o->entry.monotonic;
1789
cec736d2
LP
1790 /* Link up the items */
1791 n = journal_file_entry_n_items(o);
1792 for (i = 0; i < n; i++) {
1793 r = journal_file_link_entry_item(f, o, offset, i);
1794 if (r < 0)
1795 return r;
1796 }
1797
cec736d2
LP
1798 return 0;
1799}
1800
1801static int journal_file_append_entry_internal(
1802 JournalFile *f,
1803 const dual_timestamp *ts,
1804 uint64_t xor_hash,
1805 const EntryItem items[], unsigned n_items,
de190aef 1806 uint64_t *seqnum,
cec736d2
LP
1807 Object **ret, uint64_t *offset) {
1808 uint64_t np;
1809 uint64_t osize;
1810 Object *o;
1811 int r;
1812
1813 assert(f);
c88cc6af 1814 assert(f->header);
cec736d2 1815 assert(items || n_items == 0);
de190aef 1816 assert(ts);
cec736d2
LP
1817
1818 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1819
de190aef 1820 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1821 if (r < 0)
1822 return r;
1823
d98cc1f2 1824 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1825 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1826 o->entry.realtime = htole64(ts->realtime);
1827 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1828 o->entry.xor_hash = htole64(xor_hash);
1829 o->entry.boot_id = f->header->boot_id;
1830
349cc4a5 1831#if HAVE_GCRYPT
5996c7c2 1832 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1833 if (r < 0)
1834 return r;
feb12d3e 1835#endif
b0af6f41 1836
cec736d2
LP
1837 r = journal_file_link_entry(f, o, np);
1838 if (r < 0)
1839 return r;
1840
1841 if (ret)
1842 *ret = o;
1843
1844 if (offset)
1845 *offset = np;
1846
1847 return 0;
1848}
1849
cf244689 1850void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1851 assert(f);
1852
1853 /* inotify() does not receive IN_MODIFY events from file
1854 * accesses done via mmap(). After each access we hence
1855 * trigger IN_MODIFY by truncating the journal file to its
1856 * current size which triggers IN_MODIFY. */
1857
bc85bfee
LP
1858 __sync_synchronize();
1859
50f20cfd 1860 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1861 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1862}
1863
7a24f3bf
VC
1864static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1865 assert(userdata);
1866
1867 journal_file_post_change(userdata);
1868
1869 return 1;
1870}
1871
1872static void schedule_post_change(JournalFile *f) {
1873 sd_event_source *timer;
1874 int enabled, r;
1875 uint64_t now;
1876
1877 assert(f);
1878 assert(f->post_change_timer);
1879
1880 timer = f->post_change_timer;
1881
1882 r = sd_event_source_get_enabled(timer, &enabled);
1883 if (r < 0) {
e167d7fd
LP
1884 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1885 goto fail;
7a24f3bf
VC
1886 }
1887
1888 if (enabled == SD_EVENT_ONESHOT)
1889 return;
1890
1891 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1892 if (r < 0) {
e167d7fd
LP
1893 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1894 goto fail;
7a24f3bf
VC
1895 }
1896
1897 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1898 if (r < 0) {
e167d7fd
LP
1899 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1900 goto fail;
7a24f3bf
VC
1901 }
1902
1903 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1904 if (r < 0) {
e167d7fd
LP
1905 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1906 goto fail;
7a24f3bf 1907 }
e167d7fd
LP
1908
1909 return;
1910
1911fail:
1912 /* On failure, let's simply post the change immediately. */
1913 journal_file_post_change(f);
7a24f3bf
VC
1914}
1915
1916/* Enable coalesced change posting in a timer on the provided sd_event instance */
1917int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1918 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1919 int r;
1920
1921 assert(f);
1922 assert_return(!f->post_change_timer, -EINVAL);
1923 assert(e);
1924 assert(t);
1925
1926 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1927 if (r < 0)
1928 return r;
1929
1930 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1931 if (r < 0)
1932 return r;
1933
1cc6c93a 1934 f->post_change_timer = TAKE_PTR(timer);
7a24f3bf
VC
1935 f->post_change_timer_period = t;
1936
1937 return r;
1938}
1939
1f2da9ec
LP
1940static int entry_item_cmp(const void *_a, const void *_b) {
1941 const EntryItem *a = _a, *b = _b;
1942
1943 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1944 return -1;
1945 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1946 return 1;
1947 return 0;
1948}
1949
de190aef 1950int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1951 unsigned i;
1952 EntryItem *items;
1953 int r;
1954 uint64_t xor_hash = 0;
de190aef 1955 struct dual_timestamp _ts;
cec736d2
LP
1956
1957 assert(f);
c88cc6af 1958 assert(f->header);
cec736d2
LP
1959 assert(iovec || n_iovec == 0);
1960
de190aef
LP
1961 if (!ts) {
1962 dual_timestamp_get(&_ts);
1963 ts = &_ts;
1964 }
1965
349cc4a5 1966#if HAVE_GCRYPT
7560fffc
LP
1967 r = journal_file_maybe_append_tag(f, ts->realtime);
1968 if (r < 0)
1969 return r;
feb12d3e 1970#endif
7560fffc 1971
64825d3c 1972 /* alloca() can't take 0, hence let's allocate at least one */
cf409d15 1973 items = newa(EntryItem, MAX(1u, n_iovec));
cec736d2
LP
1974
1975 for (i = 0; i < n_iovec; i++) {
1976 uint64_t p;
1977 Object *o;
1978
1979 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1980 if (r < 0)
cf244689 1981 return r;
cec736d2
LP
1982
1983 xor_hash ^= le64toh(o->data.hash);
1984 items[i].object_offset = htole64(p);
de7b95cd 1985 items[i].hash = o->data.hash;
cec736d2
LP
1986 }
1987
1f2da9ec
LP
1988 /* Order by the position on disk, in order to improve seek
1989 * times for rotating media. */
7ff7394d 1990 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1991
de190aef 1992 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1993
fa6ac760
LP
1994 /* If the memory mapping triggered a SIGBUS then we return an
1995 * IO error and ignore the error code passed down to us, since
1996 * it is very likely just an effect of a nullified replacement
1997 * mapping page */
1998
be7cdd8e 1999 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
2000 r = -EIO;
2001
7a24f3bf
VC
2002 if (f->post_change_timer)
2003 schedule_post_change(f);
2004 else
2005 journal_file_post_change(f);
50f20cfd 2006
cec736d2
LP
2007 return r;
2008}
2009
/* Remembers how far into a chain of entry arrays a previous lookup got, so that later lookups
 * on the same chain can skip ahead instead of starting at the first array again. Items are
 * stored in an OrderedHashmap keyed by 'first'. */
a4bcff5b 2010typedef struct ChainCacheItem {
fb099c8d 2011 uint64_t first; /* offset of the array at the beginning of the chain; doubles as the hashmap key */
a4bcff5b
LP
2012 uint64_t array; /* offset of the cached array within the chain */
2013 uint64_t begin; /* the first item stored in the cached array */
2014 uint64_t total; /* the total number of items in all arrays before the cached one in the chain */
f268980d 2015 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
a4bcff5b
LP
2016} ChainCacheItem;
2017
2018static void chain_cache_put(
4743015d 2019 OrderedHashmap *h,
a4bcff5b
LP
2020 ChainCacheItem *ci,
2021 uint64_t first,
2022 uint64_t array,
2023 uint64_t begin,
f268980d
LP
2024 uint64_t total,
2025 uint64_t last_index) {
a4bcff5b
LP
2026
2027 if (!ci) {
34741aa3
LP
2028 /* If the chain item to cache for this chain is the
2029 * first one it's not worth caching anything */
2030 if (array == first)
2031 return;
2032
29433089 2033 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2034 ci = ordered_hashmap_steal_first(h);
29433089
LP
2035 assert(ci);
2036 } else {
a4bcff5b
LP
2037 ci = new(ChainCacheItem, 1);
2038 if (!ci)
2039 return;
2040 }
2041
2042 ci->first = first;
2043
4743015d 2044 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2045 free(ci);
2046 return;
2047 }
2048 } else
2049 assert(ci->first == first);
2050
2051 ci->array = array;
2052 ci->begin = begin;
2053 ci->total = total;
f268980d 2054 ci->last_index = last_index;
a4bcff5b
LP
2055}
2056
f268980d
LP
2057static int generic_array_get(
2058 JournalFile *f,
2059 uint64_t first,
2060 uint64_t i,
2061 Object **ret, uint64_t *offset) {
de190aef 2062
cec736d2 2063 Object *o;
a4bcff5b 2064 uint64_t p = 0, a, t = 0;
cec736d2 2065 int r;
a4bcff5b 2066 ChainCacheItem *ci;
cec736d2
LP
2067
2068 assert(f);
2069
de190aef 2070 a = first;
a4bcff5b
LP
2071
2072 /* Try the chain cache first */
4743015d 2073 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2074 if (ci && i > ci->total) {
2075 a = ci->array;
2076 i -= ci->total;
2077 t = ci->total;
2078 }
2079
de190aef 2080 while (a > 0) {
a4bcff5b 2081 uint64_t k;
cec736d2 2082
de190aef
LP
2083 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2084 if (r < 0)
2085 return r;
cec736d2 2086
a4bcff5b
LP
2087 k = journal_file_entry_array_n_items(o);
2088 if (i < k) {
de190aef 2089 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2090 goto found;
cec736d2
LP
2091 }
2092
a4bcff5b
LP
2093 i -= k;
2094 t += k;
de190aef
LP
2095 a = le64toh(o->entry_array.next_entry_array_offset);
2096 }
2097
a4bcff5b
LP
2098 return 0;
2099
2100found:
2101 /* Let's cache this item for the next invocation */
af13a6b0 2102 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2103
2104 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2105 if (r < 0)
2106 return r;
2107
2108 if (ret)
2109 *ret = o;
2110
2111 if (offset)
2112 *offset = p;
2113
2114 return 1;
2115}
2116
f268980d
LP
2117static int generic_array_get_plus_one(
2118 JournalFile *f,
2119 uint64_t extra,
2120 uint64_t first,
2121 uint64_t i,
2122 Object **ret, uint64_t *offset) {
de190aef
LP
2123
2124 Object *o;
2125
2126 assert(f);
2127
2128 if (i == 0) {
2129 int r;
2130
2131 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2132 if (r < 0)
2133 return r;
2134
de190aef
LP
2135 if (ret)
2136 *ret = o;
cec736d2 2137
de190aef
LP
2138 if (offset)
2139 *offset = extra;
cec736d2 2140
de190aef 2141 return 1;
cec736d2
LP
2142 }
2143
de190aef
LP
2144 return generic_array_get(f, first, i-1, ret, offset);
2145}
cec736d2 2146
de190aef
LP
/* Results of the test_object_xxx() callbacks used by the bisection code below. */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object is smaller than the needle */
        TEST_RIGHT = 2, /* the tested object is larger than the needle */
};
cec736d2 2152
f268980d
LP
2153static int generic_array_bisect(
2154 JournalFile *f,
2155 uint64_t first,
2156 uint64_t n,
2157 uint64_t needle,
2158 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2159 direction_t direction,
2160 Object **ret,
2161 uint64_t *offset,
2162 uint64_t *idx) {
2163
2164 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
2165 bool subtract_one = false;
2166 Object *o, *array = NULL;
2167 int r;
a4bcff5b 2168 ChainCacheItem *ci;
cec736d2 2169
de190aef
LP
2170 assert(f);
2171 assert(test_object);
cec736d2 2172
a4bcff5b 2173 /* Start with the first array in the chain */
de190aef 2174 a = first;
a4bcff5b 2175
4743015d 2176 ci = ordered_hashmap_get(f->chain_cache, &first);
96d4d024 2177 if (ci && n > ci->total && ci->begin != 0) {
a4bcff5b
LP
2178 /* Ah, we have iterated this bisection array chain
2179 * previously! Let's see if we can skip ahead in the
2180 * chain, as far as the last time. But we can't jump
2181 * backwards in the chain, so let's check that
2182 * first. */
2183
2184 r = test_object(f, ci->begin, needle);
2185 if (r < 0)
2186 return r;
2187
2188 if (r == TEST_LEFT) {
f268980d 2189 /* OK, what we are looking for is right of the
a4bcff5b
LP
2190 * begin of this EntryArray, so let's jump
2191 * straight to previously cached array in the
2192 * chain */
2193
2194 a = ci->array;
2195 n -= ci->total;
2196 t = ci->total;
f268980d 2197 last_index = ci->last_index;
a4bcff5b
LP
2198 }
2199 }
2200
de190aef
LP
2201 while (a > 0) {
2202 uint64_t left, right, k, lp;
2203
2204 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
2205 if (r < 0)
2206 return r;
2207
de190aef
LP
2208 k = journal_file_entry_array_n_items(array);
2209 right = MIN(k, n);
2210 if (right <= 0)
2211 return 0;
cec736d2 2212
de190aef
LP
2213 i = right - 1;
2214 lp = p = le64toh(array->entry_array.items[i]);
2215 if (p <= 0)
bee6a291
LP
2216 r = -EBADMSG;
2217 else
2218 r = test_object(f, p, needle);
2219 if (r == -EBADMSG) {
2220 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2221 n = i;
2222 continue;
2223 }
de190aef
LP
2224 if (r < 0)
2225 return r;
cec736d2 2226
de190aef
LP
2227 if (r == TEST_FOUND)
2228 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2229
2230 if (r == TEST_RIGHT) {
2231 left = 0;
2232 right -= 1;
f268980d
LP
2233
2234 if (last_index != (uint64_t) -1) {
2235 assert(last_index <= right);
2236
2237 /* If we cached the last index we
2238 * looked at, let's try to not to jump
2239 * too wildly around and see if we can
2240 * limit the range to look at early to
2241 * the immediate neighbors of the last
2242 * index we looked at. */
2243
2244 if (last_index > 0) {
2245 uint64_t x = last_index - 1;
2246
2247 p = le64toh(array->entry_array.items[x]);
2248 if (p <= 0)
2249 return -EBADMSG;
2250
2251 r = test_object(f, p, needle);
2252 if (r < 0)
2253 return r;
2254
2255 if (r == TEST_FOUND)
2256 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2257
2258 if (r == TEST_RIGHT)
2259 right = x;
2260 else
2261 left = x + 1;
2262 }
2263
2264 if (last_index < right) {
2265 uint64_t y = last_index + 1;
2266
2267 p = le64toh(array->entry_array.items[y]);
2268 if (p <= 0)
2269 return -EBADMSG;
2270
2271 r = test_object(f, p, needle);
2272 if (r < 0)
2273 return r;
2274
2275 if (r == TEST_FOUND)
2276 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2277
2278 if (r == TEST_RIGHT)
2279 right = y;
2280 else
2281 left = y + 1;
2282 }
f268980d
LP
2283 }
2284
de190aef
LP
2285 for (;;) {
2286 if (left == right) {
2287 if (direction == DIRECTION_UP)
2288 subtract_one = true;
2289
2290 i = left;
2291 goto found;
2292 }
2293
2294 assert(left < right);
de190aef 2295 i = (left + right) / 2;
f268980d 2296
de190aef
LP
2297 p = le64toh(array->entry_array.items[i]);
2298 if (p <= 0)
bee6a291
LP
2299 r = -EBADMSG;
2300 else
2301 r = test_object(f, p, needle);
2302 if (r == -EBADMSG) {
2303 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2304 right = n = i;
2305 continue;
2306 }
de190aef
LP
2307 if (r < 0)
2308 return r;
cec736d2 2309
de190aef
LP
2310 if (r == TEST_FOUND)
2311 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2312
2313 if (r == TEST_RIGHT)
2314 right = i;
2315 else
2316 left = i + 1;
2317 }
2318 }
2319
2173cbf8 2320 if (k >= n) {
cbdca852
LP
2321 if (direction == DIRECTION_UP) {
2322 i = n;
2323 subtract_one = true;
2324 goto found;
2325 }
2326
cec736d2 2327 return 0;
cbdca852 2328 }
cec736d2 2329
de190aef
LP
2330 last_p = lp;
2331
2332 n -= k;
2333 t += k;
f268980d 2334 last_index = (uint64_t) -1;
de190aef 2335 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2336 }
2337
2338 return 0;
de190aef
LP
2339
2340found:
2341 if (subtract_one && t == 0 && i == 0)
2342 return 0;
2343
a4bcff5b 2344 /* Let's cache this item for the next invocation */
af13a6b0 2345 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 2346
de190aef
LP
2347 if (subtract_one && i == 0)
2348 p = last_p;
2349 else if (subtract_one)
2350 p = le64toh(array->entry_array.items[i-1]);
2351 else
2352 p = le64toh(array->entry_array.items[i]);
2353
2354 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2355 if (r < 0)
2356 return r;
2357
2358 if (ret)
2359 *ret = o;
2360
2361 if (offset)
2362 *offset = p;
2363
2364 if (idx)
cbdca852 2365 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2366
2367 return 1;
cec736d2
LP
2368}
2369
f268980d
LP
2370static int generic_array_bisect_plus_one(
2371 JournalFile *f,
2372 uint64_t extra,
2373 uint64_t first,
2374 uint64_t n,
2375 uint64_t needle,
2376 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2377 direction_t direction,
2378 Object **ret,
2379 uint64_t *offset,
2380 uint64_t *idx) {
de190aef 2381
cec736d2 2382 int r;
cbdca852
LP
2383 bool step_back = false;
2384 Object *o;
cec736d2
LP
2385
2386 assert(f);
de190aef 2387 assert(test_object);
cec736d2 2388
de190aef
LP
2389 if (n <= 0)
2390 return 0;
cec736d2 2391
de190aef
LP
2392 /* This bisects the array in object 'first', but first checks
2393 * an extra */
de190aef
LP
2394 r = test_object(f, extra, needle);
2395 if (r < 0)
2396 return r;
a536e261
LP
2397
2398 if (r == TEST_FOUND)
2399 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2400
cbdca852
LP
2401 /* if we are looking with DIRECTION_UP then we need to first
2402 see if in the actual array there is a matching entry, and
2403 return the last one of that. But if there isn't any we need
2404 to return this one. Hence remember this, and return it
2405 below. */
2406 if (r == TEST_LEFT)
2407 step_back = direction == DIRECTION_UP;
de190aef 2408
cbdca852
LP
2409 if (r == TEST_RIGHT) {
2410 if (direction == DIRECTION_DOWN)
2411 goto found;
2412 else
2413 return 0;
a536e261 2414 }
cec736d2 2415
de190aef
LP
2416 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2417
cbdca852
LP
2418 if (r == 0 && step_back)
2419 goto found;
2420
ecf68b1d 2421 if (r > 0 && idx)
313cefa1 2422 (*idx)++;
de190aef
LP
2423
2424 return r;
cbdca852
LP
2425
2426found:
2427 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2428 if (r < 0)
2429 return r;
2430
2431 if (ret)
2432 *ret = o;
2433
2434 if (offset)
2435 *offset = extra;
2436
2437 if (idx)
2438 *idx = 0;
2439
2440 return 1;
2441}
2442
44a6b1b6 2443_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2444 assert(f);
2445 assert(p > 0);
2446
2447 if (p == needle)
2448 return TEST_FOUND;
2449 else if (p < needle)
2450 return TEST_LEFT;
2451 else
2452 return TEST_RIGHT;
2453}
2454
de190aef
LP
2455static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2456 Object *o;
2457 int r;
2458
2459 assert(f);
2460 assert(p > 0);
2461
2462 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2463 if (r < 0)
2464 return r;
2465
de190aef
LP
2466 if (le64toh(o->entry.seqnum) == needle)
2467 return TEST_FOUND;
2468 else if (le64toh(o->entry.seqnum) < needle)
2469 return TEST_LEFT;
2470 else
2471 return TEST_RIGHT;
2472}
cec736d2 2473
de190aef
LP
2474int journal_file_move_to_entry_by_seqnum(
2475 JournalFile *f,
2476 uint64_t seqnum,
2477 direction_t direction,
2478 Object **ret,
2479 uint64_t *offset) {
c88cc6af
VC
2480 assert(f);
2481 assert(f->header);
de190aef
LP
2482
2483 return generic_array_bisect(f,
2484 le64toh(f->header->entry_array_offset),
2485 le64toh(f->header->n_entries),
2486 seqnum,
2487 test_object_seqnum,
2488 direction,
2489 ret, offset, NULL);
2490}
cec736d2 2491
de190aef
LP
2492static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2493 Object *o;
2494 int r;
2495
2496 assert(f);
2497 assert(p > 0);
2498
2499 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2500 if (r < 0)
2501 return r;
2502
2503 if (le64toh(o->entry.realtime) == needle)
2504 return TEST_FOUND;
2505 else if (le64toh(o->entry.realtime) < needle)
2506 return TEST_LEFT;
2507 else
2508 return TEST_RIGHT;
cec736d2
LP
2509}
2510
de190aef
LP
2511int journal_file_move_to_entry_by_realtime(
2512 JournalFile *f,
2513 uint64_t realtime,
2514 direction_t direction,
2515 Object **ret,
2516 uint64_t *offset) {
c88cc6af
VC
2517 assert(f);
2518 assert(f->header);
de190aef
LP
2519
2520 return generic_array_bisect(f,
2521 le64toh(f->header->entry_array_offset),
2522 le64toh(f->header->n_entries),
2523 realtime,
2524 test_object_realtime,
2525 direction,
2526 ret, offset, NULL);
2527}
2528
2529static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2530 Object *o;
2531 int r;
2532
2533 assert(f);
2534 assert(p > 0);
2535
2536 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2537 if (r < 0)
2538 return r;
2539
2540 if (le64toh(o->entry.monotonic) == needle)
2541 return TEST_FOUND;
2542 else if (le64toh(o->entry.monotonic) < needle)
2543 return TEST_LEFT;
2544 else
2545 return TEST_RIGHT;
2546}
2547
2a560338 2548static int find_data_object_by_boot_id(
47838ab3
ZJS
2549 JournalFile *f,
2550 sd_id128_t boot_id,
2551 Object **o,
2552 uint64_t *b) {
2a560338 2553
fbd0b64f 2554 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
47838ab3
ZJS
2555
2556 sd_id128_to_string(boot_id, t + 9);
2557 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2558}
2559
de190aef
LP
2560int journal_file_move_to_entry_by_monotonic(
2561 JournalFile *f,
2562 sd_id128_t boot_id,
2563 uint64_t monotonic,
2564 direction_t direction,
2565 Object **ret,
2566 uint64_t *offset) {
2567
de190aef
LP
2568 Object *o;
2569 int r;
2570
cbdca852 2571 assert(f);
de190aef 2572
47838ab3 2573 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2574 if (r < 0)
2575 return r;
cbdca852 2576 if (r == 0)
de190aef
LP
2577 return -ENOENT;
2578
2579 return generic_array_bisect_plus_one(f,
2580 le64toh(o->data.entry_offset),
2581 le64toh(o->data.entry_array_offset),
2582 le64toh(o->data.n_entries),
2583 monotonic,
2584 test_object_monotonic,
2585 direction,
2586 ret, offset, NULL);
2587}
2588
1fc605b0 2589void journal_file_reset_location(JournalFile *f) {
6573ef05 2590 f->location_type = LOCATION_HEAD;
1fc605b0 2591 f->current_offset = 0;
6573ef05
MS
2592 f->current_seqnum = 0;
2593 f->current_realtime = 0;
2594 f->current_monotonic = 0;
2595 zero(f->current_boot_id);
2596 f->current_xor_hash = 0;
2597}
2598
950c07d4 2599void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2600 f->location_type = LOCATION_SEEK;
2601 f->current_offset = offset;
2602 f->current_seqnum = le64toh(o->entry.seqnum);
2603 f->current_realtime = le64toh(o->entry.realtime);
2604 f->current_monotonic = le64toh(o->entry.monotonic);
2605 f->current_boot_id = o->entry.boot_id;
2606 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2607}
2608
d8ae66d7
MS
2609int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2610 assert(af);
c88cc6af 2611 assert(af->header);
d8ae66d7 2612 assert(bf);
c88cc6af 2613 assert(bf->header);
d8ae66d7
MS
2614 assert(af->location_type == LOCATION_SEEK);
2615 assert(bf->location_type == LOCATION_SEEK);
2616
2617 /* If contents and timestamps match, these entries are
2618 * identical, even if the seqnum does not match */
2619 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2620 af->current_monotonic == bf->current_monotonic &&
2621 af->current_realtime == bf->current_realtime &&
2622 af->current_xor_hash == bf->current_xor_hash)
2623 return 0;
2624
2625 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2626
2627 /* If this is from the same seqnum source, compare
2628 * seqnums */
2629 if (af->current_seqnum < bf->current_seqnum)
2630 return -1;
2631 if (af->current_seqnum > bf->current_seqnum)
2632 return 1;
2633
2634 /* Wow! This is weird, different data but the same
2635 * seqnums? Something is borked, but let's make the
2636 * best of it and compare by time. */
2637 }
2638
2639 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2640
2641 /* If the boot id matches, compare monotonic time */
2642 if (af->current_monotonic < bf->current_monotonic)
2643 return -1;
2644 if (af->current_monotonic > bf->current_monotonic)
2645 return 1;
2646 }
2647
2648 /* Otherwise, compare UTC time */
2649 if (af->current_realtime < bf->current_realtime)
2650 return -1;
2651 if (af->current_realtime > bf->current_realtime)
2652 return 1;
2653
2654 /* Finally, compare by contents */
2655 if (af->current_xor_hash < bf->current_xor_hash)
2656 return -1;
2657 if (af->current_xor_hash > bf->current_xor_hash)
2658 return 1;
2659
2660 return 0;
2661}
2662
aa598ba5
LP
2663static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2664
2665 /* Increase or decrease the specified index, in the right direction. */
2666
2667 if (direction == DIRECTION_DOWN) {
2668 if (*i >= n - 1)
2669 return 0;
2670
2671 (*i) ++;
2672 } else {
2673 if (*i <= 0)
2674 return 0;
2675
2676 (*i) --;
2677 }
2678
2679 return 1;
2680}
2681
b6da4ed0
LP
2682static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2683
2684 /* Consider it an error if any of the two offsets is uninitialized */
2685 if (old_offset == 0 || new_offset == 0)
2686 return false;
2687
2688 /* If we go down, the new offset must be larger than the old one. */
2689 return direction == DIRECTION_DOWN ?
2690 new_offset > old_offset :
2691 new_offset < old_offset;
2692}
2693
de190aef
LP
2694int journal_file_next_entry(
2695 JournalFile *f,
f534928a 2696 uint64_t p,
de190aef
LP
2697 direction_t direction,
2698 Object **ret, uint64_t *offset) {
2699
fb099c8d 2700 uint64_t i, n, ofs;
cec736d2
LP
2701 int r;
2702
2703 assert(f);
c88cc6af 2704 assert(f->header);
de190aef
LP
2705
2706 n = le64toh(f->header->n_entries);
2707 if (n <= 0)
2708 return 0;
cec736d2 2709
f534928a 2710 if (p == 0)
de190aef 2711 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2712 else {
de190aef
LP
2713 r = generic_array_bisect(f,
2714 le64toh(f->header->entry_array_offset),
2715 le64toh(f->header->n_entries),
2716 p,
2717 test_object_offset,
2718 DIRECTION_DOWN,
2719 NULL, NULL,
2720 &i);
2721 if (r <= 0)
2722 return r;
2723
aa598ba5
LP
2724 r = bump_array_index(&i, direction, n);
2725 if (r <= 0)
2726 return r;
cec736d2
LP
2727 }
2728
de190aef 2729 /* And jump to it */
989793d3
LP
2730 for (;;) {
2731 r = generic_array_get(f,
2732 le64toh(f->header->entry_array_offset),
2733 i,
2734 ret, &ofs);
2735 if (r > 0)
2736 break;
2737 if (r != -EBADMSG)
2738 return r;
2739
2740 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2741 * the next one might work for us instead. */
2742 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2743
2744 r = bump_array_index(&i, direction, n);
2745 if (r <= 0)
2746 return r;
caeab8f6 2747 }
fb099c8d 2748
b6da4ed0
LP
2749 /* Ensure our array is properly ordered. */
2750 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2751 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
fb099c8d
ZJS
2752 return -EBADMSG;
2753 }
2754
2755 if (offset)
2756 *offset = ofs;
2757
2758 return 1;
de190aef 2759}
cec736d2 2760
de190aef
LP
2761int journal_file_next_entry_for_data(
2762 JournalFile *f,
2763 Object *o, uint64_t p,
2764 uint64_t data_offset,
2765 direction_t direction,
2766 Object **ret, uint64_t *offset) {
2767
ded5034e 2768 uint64_t i, n, ofs;
de190aef 2769 Object *d;
989793d3 2770 int r;
cec736d2
LP
2771
2772 assert(f);
de190aef 2773 assert(p > 0 || !o);
cec736d2 2774
de190aef 2775 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2776 if (r < 0)
de190aef 2777 return r;
cec736d2 2778
de190aef
LP
2779 n = le64toh(d->data.n_entries);
2780 if (n <= 0)
2781 return n;
cec736d2 2782
de190aef
LP
2783 if (!o)
2784 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2785 else {
2786 if (o->object.type != OBJECT_ENTRY)
2787 return -EINVAL;
cec736d2 2788
de190aef
LP
2789 r = generic_array_bisect_plus_one(f,
2790 le64toh(d->data.entry_offset),
2791 le64toh(d->data.entry_array_offset),
2792 le64toh(d->data.n_entries),
2793 p,
2794 test_object_offset,
2795 DIRECTION_DOWN,
2796 NULL, NULL,
2797 &i);
2798
2799 if (r <= 0)
cec736d2
LP
2800 return r;
2801
aa598ba5
LP
2802 r = bump_array_index(&i, direction, n);
2803 if (r <= 0)
2804 return r;
de190aef 2805 }
cec736d2 2806
989793d3
LP
2807 for (;;) {
2808 r = generic_array_get_plus_one(f,
2809 le64toh(d->data.entry_offset),
2810 le64toh(d->data.entry_array_offset),
2811 i,
2812 ret, &ofs);
2813 if (r > 0)
2814 break;
2815 if (r != -EBADMSG)
2816 return r;
2817
2818 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2819
2820 r = bump_array_index(&i, direction, n);
2821 if (r <= 0)
2822 return r;
2823 }
ded5034e
LP
2824
2825 /* Ensure our array is properly ordered. */
2826 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2827 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2828 return -EBADMSG;
2829 }
2830
2831 if (offset)
2832 *offset = ofs;
2833
2834 return 1;
de190aef 2835}
cec736d2 2836
cbdca852
LP
2837int journal_file_move_to_entry_by_offset_for_data(
2838 JournalFile *f,
2839 uint64_t data_offset,
2840 uint64_t p,
2841 direction_t direction,
2842 Object **ret, uint64_t *offset) {
2843
2844 int r;
2845 Object *d;
2846
2847 assert(f);
2848
2849 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2850 if (r < 0)
2851 return r;
2852
2853 return generic_array_bisect_plus_one(f,
2854 le64toh(d->data.entry_offset),
2855 le64toh(d->data.entry_array_offset),
2856 le64toh(d->data.n_entries),
2857 p,
2858 test_object_offset,
2859 direction,
2860 ret, offset, NULL);
2861}
2862
2863int journal_file_move_to_entry_by_monotonic_for_data(
2864 JournalFile *f,
2865 uint64_t data_offset,
2866 sd_id128_t boot_id,
2867 uint64_t monotonic,
2868 direction_t direction,
2869 Object **ret, uint64_t *offset) {
2870
cbdca852
LP
2871 Object *o, *d;
2872 int r;
2873 uint64_t b, z;
2874
2875 assert(f);
2876
2877 /* First, seek by time */
47838ab3 2878 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2879 if (r < 0)
2880 return r;
2881 if (r == 0)
2882 return -ENOENT;
2883
2884 r = generic_array_bisect_plus_one(f,
2885 le64toh(o->data.entry_offset),
2886 le64toh(o->data.entry_array_offset),
2887 le64toh(o->data.n_entries),
2888 monotonic,
2889 test_object_monotonic,
2890 direction,
2891 NULL, &z, NULL);
2892 if (r <= 0)
2893 return r;
2894
2895 /* And now, continue seeking until we find an entry that
2896 * exists in both bisection arrays */
2897
2898 for (;;) {
2899 Object *qo;
2900 uint64_t p, q;
2901
2902 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2903 if (r < 0)
2904 return r;
2905
2906 r = generic_array_bisect_plus_one(f,
2907 le64toh(d->data.entry_offset),
2908 le64toh(d->data.entry_array_offset),
2909 le64toh(d->data.n_entries),
2910 z,
2911 test_object_offset,
2912 direction,
2913 NULL, &p, NULL);
2914 if (r <= 0)
2915 return r;
2916
2917 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2918 if (r < 0)
2919 return r;
2920
2921 r = generic_array_bisect_plus_one(f,
2922 le64toh(o->data.entry_offset),
2923 le64toh(o->data.entry_array_offset),
2924 le64toh(o->data.n_entries),
2925 p,
2926 test_object_offset,
2927 direction,
2928 &qo, &q, NULL);
2929
2930 if (r <= 0)
2931 return r;
2932
2933 if (p == q) {
2934 if (ret)
2935 *ret = qo;
2936 if (offset)
2937 *offset = q;
2938
2939 return 1;
2940 }
2941
2942 z = q;
2943 }
cbdca852
LP
2944}
2945
de190aef
LP
2946int journal_file_move_to_entry_by_seqnum_for_data(
2947 JournalFile *f,
2948 uint64_t data_offset,
2949 uint64_t seqnum,
2950 direction_t direction,
2951 Object **ret, uint64_t *offset) {
cec736d2 2952
de190aef
LP
2953 Object *d;
2954 int r;
cec736d2 2955
91a31dde
LP
2956 assert(f);
2957
de190aef 2958 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2959 if (r < 0)
de190aef 2960 return r;
cec736d2 2961
de190aef
LP
2962 return generic_array_bisect_plus_one(f,
2963 le64toh(d->data.entry_offset),
2964 le64toh(d->data.entry_array_offset),
2965 le64toh(d->data.n_entries),
2966 seqnum,
2967 test_object_seqnum,
2968 direction,
2969 ret, offset, NULL);
2970}
cec736d2 2971
de190aef
LP
2972int journal_file_move_to_entry_by_realtime_for_data(
2973 JournalFile *f,
2974 uint64_t data_offset,
2975 uint64_t realtime,
2976 direction_t direction,
2977 Object **ret, uint64_t *offset) {
2978
2979 Object *d;
2980 int r;
2981
91a31dde
LP
2982 assert(f);
2983
de190aef 2984 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2985 if (r < 0)
de190aef
LP
2986 return r;
2987
2988 return generic_array_bisect_plus_one(f,
2989 le64toh(d->data.entry_offset),
2990 le64toh(d->data.entry_array_offset),
2991 le64toh(d->data.n_entries),
2992 realtime,
2993 test_object_realtime,
2994 direction,
2995 ret, offset, NULL);
cec736d2
LP
2996}
2997
0284adc6 2998void journal_file_dump(JournalFile *f) {
7560fffc 2999 Object *o;
7560fffc 3000 int r;
0284adc6 3001 uint64_t p;
7560fffc
LP
3002
3003 assert(f);
c88cc6af 3004 assert(f->header);
7560fffc 3005
0284adc6 3006 journal_file_print_header(f);
7560fffc 3007
0284adc6
LP
3008 p = le64toh(f->header->header_size);
3009 while (p != 0) {
d05089d8 3010 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
3011 if (r < 0)
3012 goto fail;
7560fffc 3013
0284adc6 3014 switch (o->object.type) {
d98cc1f2 3015
0284adc6
LP
3016 case OBJECT_UNUSED:
3017 printf("Type: OBJECT_UNUSED\n");
3018 break;
d98cc1f2 3019
0284adc6
LP
3020 case OBJECT_DATA:
3021 printf("Type: OBJECT_DATA\n");
3022 break;
7560fffc 3023
3c1668da
LP
3024 case OBJECT_FIELD:
3025 printf("Type: OBJECT_FIELD\n");
3026 break;
3027
0284adc6 3028 case OBJECT_ENTRY:
507f22bd
ZJS
3029 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3030 le64toh(o->entry.seqnum),
3031 le64toh(o->entry.monotonic),
3032 le64toh(o->entry.realtime));
0284adc6 3033 break;
7560fffc 3034
0284adc6
LP
3035 case OBJECT_FIELD_HASH_TABLE:
3036 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3037 break;
7560fffc 3038
0284adc6
LP
3039 case OBJECT_DATA_HASH_TABLE:
3040 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3041 break;
7560fffc 3042
0284adc6
LP
3043 case OBJECT_ENTRY_ARRAY:
3044 printf("Type: OBJECT_ENTRY_ARRAY\n");
3045 break;
7560fffc 3046
0284adc6 3047 case OBJECT_TAG:
507f22bd
ZJS
3048 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3049 le64toh(o->tag.seqnum),
3050 le64toh(o->tag.epoch));
0284adc6 3051 break;
3c1668da
LP
3052
3053 default:
8facc349 3054 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 3055 break;
0284adc6 3056 }
7560fffc 3057
d89c8fdf
ZJS
3058 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3059 printf("Flags: %s\n",
3060 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3061
0284adc6
LP
3062 if (p == le64toh(f->header->tail_object_offset))
3063 p = 0;
3064 else
3065 p = p + ALIGN64(le64toh(o->object.size));
3066 }
7560fffc 3067
0284adc6
LP
3068 return;
3069fail:
3070 log_error("File corrupt");
7560fffc
LP
3071}
3072
718fe4b1
ZJS
3073static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3074 const char *x;
3075
3076 x = format_timestamp(buf, l, t);
3077 if (x)
3078 return x;
3079 return " --- ";
3080}
3081
0284adc6 3082void journal_file_print_header(JournalFile *f) {
2765b7bb 3083 char a[33], b[33], c[33], d[33];
ed375beb 3084 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
3085 struct stat st;
3086 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
3087
3088 assert(f);
c88cc6af 3089 assert(f->header);
7560fffc 3090
0284adc6
LP
3091 printf("File Path: %s\n"
3092 "File ID: %s\n"
3093 "Machine ID: %s\n"
3094 "Boot ID: %s\n"
3095 "Sequential Number ID: %s\n"
3096 "State: %s\n"
3097 "Compatible Flags:%s%s\n"
d89c8fdf 3098 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
3099 "Header size: %"PRIu64"\n"
3100 "Arena size: %"PRIu64"\n"
3101 "Data Hash Table Size: %"PRIu64"\n"
3102 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 3103 "Rotate Suggested: %s\n"
0808b92f
LP
3104 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3105 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3106 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3107 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3108 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
507f22bd
ZJS
3109 "Objects: %"PRIu64"\n"
3110 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
3111 f->path,
3112 sd_id128_to_string(f->header->file_id, a),
3113 sd_id128_to_string(f->header->machine_id, b),
3114 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 3115 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
3116 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3117 f->header->state == STATE_ONLINE ? "ONLINE" :
3118 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 3119 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
3120 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3121 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3122 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3123 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
3124 le64toh(f->header->header_size),
3125 le64toh(f->header->arena_size),
3126 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3127 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 3128 yes_no(journal_file_rotate_suggested(f, 0)),
0808b92f
LP
3129 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3130 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3131 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3132 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3133 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
507f22bd
ZJS
3134 le64toh(f->header->n_objects),
3135 le64toh(f->header->n_entries));
7560fffc 3136
0284adc6 3137 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 3138 printf("Data Objects: %"PRIu64"\n"
0284adc6 3139 "Data Hash Table Fill: %.1f%%\n",
507f22bd 3140 le64toh(f->header->n_data),
0284adc6 3141 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 3142
0284adc6 3143 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 3144 printf("Field Objects: %"PRIu64"\n"
0284adc6 3145 "Field Hash Table Fill: %.1f%%\n",
507f22bd 3146 le64toh(f->header->n_fields),
0284adc6 3147 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
3148
3149 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
3150 printf("Tag Objects: %"PRIu64"\n",
3151 le64toh(f->header->n_tags));
3223f44f 3152 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
3153 printf("Entry Array Objects: %"PRIu64"\n",
3154 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
3155
3156 if (fstat(f->fd, &st) >= 0)
59f448cf 3157 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
3158}
3159
fc68c929
LP
3160static int journal_file_warn_btrfs(JournalFile *f) {
3161 unsigned attrs;
3162 int r;
3163
3164 assert(f);
3165
3166 /* Before we write anything, check if the COW logic is turned
3167 * off on btrfs. Given our write pattern that is quite
3168 * unfriendly to COW file systems this should greatly improve
3169 * performance on COW file systems, such as btrfs, at the
3170 * expense of data integrity features (which shouldn't be too
3171 * bad, given that we do our own checksumming). */
3172
3173 r = btrfs_is_filesystem(f->fd);
3174 if (r < 0)
3175 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3176 if (!r)
3177 return 0;
3178
3179 r = read_attr_fd(f->fd, &attrs);
3180 if (r < 0)
3181 return log_warning_errno(r, "Failed to read file attributes: %m");
3182
3183 if (attrs & FS_NOCOW_FL) {
3184 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3185 return 0;
3186 }
3187
3188 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3189 "This is likely to slow down journal access substantially, please consider turning "
3190 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3191
3192 return 1;
3193}
3194
0284adc6 3195int journal_file_open(
5d1ce257 3196 int fd,
0284adc6
LP
3197 const char *fname,
3198 int flags,
3199 mode_t mode,
3200 bool compress,
57850536 3201 uint64_t compress_threshold_bytes,
baed47c3 3202 bool seal,
0284adc6
LP
3203 JournalMetrics *metrics,
3204 MMapCache *mmap_cache,
b58c888f 3205 Set *deferred_closes,
0284adc6
LP
3206 JournalFile *template,
3207 JournalFile **ret) {
7560fffc 3208
fa6ac760 3209 bool newly_created = false;
0284adc6 3210 JournalFile *f;
fa6ac760 3211 void *h;
0284adc6 3212 int r;
57850536 3213 char bytes[FORMAT_BYTES_MAX];
7560fffc 3214
0559d3a5 3215 assert(ret);
5d1ce257 3216 assert(fd >= 0 || fname);
7560fffc 3217
ec2ce0c5 3218 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3219 return -EINVAL;
7560fffc 3220
6eda13d3
LP
3221 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3222 return -EINVAL;
7560fffc 3223
0284adc6
LP
3224 f = new0(JournalFile, 1);
3225 if (!f)
3226 return -ENOMEM;
7560fffc 3227
5d1ce257 3228 f->fd = fd;
0284adc6 3229 f->mode = mode;
7560fffc 3230
0284adc6
LP
3231 f->flags = flags;
3232 f->prot = prot_from_flags(flags);
3233 f->writable = (flags & O_ACCMODE) != O_RDONLY;
349cc4a5 3234#if HAVE_LZ4
d89c8fdf 3235 f->compress_lz4 = compress;
349cc4a5 3236#elif HAVE_XZ
d89c8fdf 3237 f->compress_xz = compress;
48b61739 3238#endif
57850536
AG
3239
3240 if (compress_threshold_bytes == (uint64_t) -1)
3241 f->compress_threshold_bytes = DEFAULT_COMPRESS_THRESHOLD;
3242 else
3243 f->compress_threshold_bytes = MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes);
3244
349cc4a5 3245#if HAVE_GCRYPT
baed47c3 3246 f->seal = seal;
49a32d43 3247#endif
7560fffc 3248
57850536
AG
3249 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3250 yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3251 format_bytes(bytes, sizeof(bytes), f->compress_threshold_bytes));
3252
0284adc6
LP
3253 if (mmap_cache)
3254 f->mmap = mmap_cache_ref(mmap_cache);
3255 else {
84168d80 3256 f->mmap = mmap_cache_new();
0284adc6
LP
3257 if (!f->mmap) {
3258 r = -ENOMEM;
3259 goto fail;
3260 }
3261 }
7560fffc 3262
7645c77b 3263 if (fname) {
5d1ce257 3264 f->path = strdup(fname);
7645c77b
ZJS
3265 if (!f->path) {
3266 r = -ENOMEM;
3267 goto fail;
3268 }
3269 } else {
817b1c5b
LP
3270 assert(fd >= 0);
3271
7645c77b
ZJS
3272 /* If we don't know the path, fill in something explanatory and vaguely useful */
3273 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3274 r = -ENOMEM;
3275 goto fail;
3276 }
0284adc6 3277 }
7560fffc 3278
4743015d 3279 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3280 if (!f->chain_cache) {
3281 r = -ENOMEM;
3282 goto fail;
3283 }
3284
0284adc6 3285 if (f->fd < 0) {
817b1c5b
LP
3286 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3287 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3288 * it doesn't hurt in that case. */
3289
3290 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
5d1ce257
LP
3291 if (f->fd < 0) {
3292 r = -errno;
3293 goto fail;
3294 }
3295
3296 /* fds we opened here by us should also be closed by us. */
3297 f->close_fd = true;
817b1c5b
LP
3298
3299 r = fd_nonblock(f->fd, false);
3300 if (r < 0)
3301 goto fail;
7560fffc 3302 }
7560fffc 3303
be7cdd8e
VC
3304 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3305 if (!f->cache_fd) {
3306 r = -ENOMEM;
3307 goto fail;
3308 }
3309
2678031a
LP
3310 r = journal_file_fstat(f);
3311 if (r < 0)
0284adc6 3312 goto fail;
7560fffc 3313
0284adc6 3314 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3315
fc68c929 3316 (void) journal_file_warn_btrfs(f);
11689d2a 3317
4c2e1b39
LP
3318 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3319 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3320 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3321 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3322 * solely on mtime/atime/ctime of the file. */
3323 (void) fd_setcrtime(f->fd, 0);
7560fffc 3324
349cc4a5 3325#if HAVE_GCRYPT
0284adc6 3326 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3327 * just don't do sealing */
49a32d43
LP
3328 if (f->seal) {
3329 r = journal_file_fss_load(f);
3330 if (r < 0)
3331 f->seal = false;
3332 }
feb12d3e 3333#endif
7560fffc 3334
0284adc6
LP
3335 r = journal_file_init_header(f, template);
3336 if (r < 0)
3337 goto fail;
7560fffc 3338
2678031a
LP
3339 r = journal_file_fstat(f);
3340 if (r < 0)
0284adc6 3341 goto fail;
fb0951b0
LP
3342
3343 newly_created = true;
0284adc6 3344 }
7560fffc 3345
0284adc6 3346 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3347 r = -ENODATA;
0284adc6
LP
3348 goto fail;
3349 }
7560fffc 3350
b42549ad 3351 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
977eaa1e 3352 if (r < 0)
0284adc6 3353 goto fail;
7560fffc 3354
fa6ac760
LP
3355 f->header = h;
3356
0284adc6 3357 if (!newly_created) {
f9168190 3358 set_clear_with_destructor(deferred_closes, journal_file_close);
b58c888f 3359
0284adc6
LP
3360 r = journal_file_verify_header(f);
3361 if (r < 0)
3362 goto fail;
3363 }
7560fffc 3364
349cc4a5 3365#if HAVE_GCRYPT
0284adc6 3366 if (!newly_created && f->writable) {
baed47c3 3367 r = journal_file_fss_load(f);
0284adc6
LP
3368 if (r < 0)
3369 goto fail;
3370 }
feb12d3e 3371#endif
cec736d2
LP
3372
3373 if (f->writable) {
4a92baf3
LP
3374 if (metrics) {
3375 journal_default_metrics(metrics, f->fd);
3376 f->metrics = *metrics;
3377 } else if (template)
3378 f->metrics = template->metrics;
3379
cec736d2
LP
3380 r = journal_file_refresh_header(f);
3381 if (r < 0)
3382 goto fail;
3383 }
3384
349cc4a5 3385#if HAVE_GCRYPT
baed47c3 3386 r = journal_file_hmac_setup(f);
14d10188
LP
3387 if (r < 0)
3388 goto fail;
feb12d3e 3389#endif
14d10188 3390
cec736d2 3391 if (newly_created) {
de190aef 3392 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3393 if (r < 0)
3394 goto fail;
3395
de190aef 3396 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3397 if (r < 0)
3398 goto fail;
7560fffc 3399
349cc4a5 3400#if HAVE_GCRYPT
7560fffc
LP
3401 r = journal_file_append_first_tag(f);
3402 if (r < 0)
3403 goto fail;
feb12d3e 3404#endif
cec736d2
LP
3405 }
3406
be7cdd8e 3407 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
fa6ac760
LP
3408 r = -EIO;
3409 goto fail;
3410 }
3411
7a24f3bf 3412 if (template && template->post_change_timer) {
e167d7fd
LP
3413 r = journal_file_enable_post_change_timer(
3414 f,
3415 sd_event_source_get_event(template->post_change_timer),
3416 template->post_change_timer_period);
7a24f3bf 3417
7a24f3bf
VC
3418 if (r < 0)
3419 goto fail;
3420 }
3421
f8e2f4d6 3422 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3423 f->close_fd = true;
3424
0559d3a5 3425 *ret = f;
cec736d2
LP
3426 return 0;
3427
3428fail:
be7cdd8e 3429 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
3430 r = -EIO;
3431
69a3a6fd 3432 (void) journal_file_close(f);
cec736d2
LP
3433
3434 return r;
3435}
0ac38b70 3436
57850536 3437int journal_file_rotate(JournalFile **f, bool compress, uint64_t compress_threshold_bytes, bool seal, Set *deferred_closes) {
57535f47 3438 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3439 size_t l;
3440 JournalFile *old_file, *new_file = NULL;
3441 int r;
3442
3443 assert(f);
3444 assert(*f);
3445
3446 old_file = *f;
3447
3448 if (!old_file->writable)
3449 return -EINVAL;
3450
5d1ce257 3451 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3452 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
5d1ce257
LP
3453 if (path_startswith(old_file->path, "/proc/self/fd"))
3454 return -EINVAL;
3455
0ac38b70
LP
3456 if (!endswith(old_file->path, ".journal"))
3457 return -EINVAL;
3458
3459 l = strlen(old_file->path);
57535f47
ZJS
3460 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3461 (int) l - 8, old_file->path,
3462 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3463 le64toh((*f)->header->head_entry_seqnum),
3464 le64toh((*f)->header->head_entry_realtime));
3465 if (r < 0)
0ac38b70
LP
3466 return -ENOMEM;
3467
2678031a
LP
3468 /* Try to rename the file to the archived version. If the file
3469 * already was deleted, we'll get ENOENT, let's ignore that
3470 * case. */
0ac38b70 3471 r = rename(old_file->path, p);
2678031a 3472 if (r < 0 && errno != ENOENT)
0ac38b70
LP
3473 return -errno;
3474
1fcefd88
LP
3475 /* Sync the rename to disk */
3476 (void) fsync_directory_of_file(old_file->fd);
3477
8eb85171
VC
3478 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3479 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3480 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3481 * would result in the rotated journal never getting fsync() called before closing.
3482 * Now we simply queue the archive state by setting an archive bit, leaving the state
3483 * as STATE_ONLINE so proper offlining occurs. */
3484 old_file->archive = true;
0ac38b70 3485
f27a3864
LP
3486 /* Currently, btrfs is not very good with out write patterns
3487 * and fragments heavily. Let's defrag our journal files when
3488 * we archive them */
3489 old_file->defrag_on_close = true;
3490
57850536
AG
3491 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress,
3492 compress_threshold_bytes, seal, NULL, old_file->mmap, deferred_closes,
3493 old_file, &new_file);
b58c888f
VC
3494
3495 if (deferred_closes &&
3496 set_put(deferred_closes, old_file) >= 0)
3497 (void) journal_file_set_offline(old_file, false);
3498 else
3499 (void) journal_file_close(old_file);
0ac38b70
LP
3500
3501 *f = new_file;
3502 return r;
3503}
3504
9447a7f1
LP
3505int journal_file_open_reliably(
3506 const char *fname,
3507 int flags,
3508 mode_t mode,
7560fffc 3509 bool compress,
57850536 3510 uint64_t compress_threshold_bytes,
baed47c3 3511 bool seal,
4a92baf3 3512 JournalMetrics *metrics,
27370278 3513 MMapCache *mmap_cache,
b58c888f 3514 Set *deferred_closes,
9447a7f1
LP
3515 JournalFile *template,
3516 JournalFile **ret) {
3517
3518 int r;
3519 size_t l;
ed375beb 3520 _cleanup_free_ char *p = NULL;
9447a7f1 3521
57850536
AG
3522 r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3523 deferred_closes, template, ret);
288359db 3524 if (!IN_SET(r,
b288cdeb
ZJS
3525 -EBADMSG, /* Corrupted */
3526 -ENODATA, /* Truncated */
3527 -EHOSTDOWN, /* Other machine */
3528 -EPROTONOSUPPORT, /* Incompatible feature */
3529 -EBUSY, /* Unclean shutdown */
3530 -ESHUTDOWN, /* Already archived */
288359db 3531 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3532 -EIDRM, /* File has been deleted */
3533 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3534 return r;
3535
3536 if ((flags & O_ACCMODE) == O_RDONLY)
3537 return r;
3538
3539 if (!(flags & O_CREAT))
3540 return r;
3541
7560fffc
LP
3542 if (!endswith(fname, ".journal"))
3543 return r;
3544
5c70eab4
LP
3545 /* The file is corrupted. Rotate it away and try it again (but only once) */
3546
9447a7f1 3547 l = strlen(fname);
d587eca5 3548 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 3549 (int) l - 8, fname,
d587eca5 3550 now(CLOCK_REALTIME),
9bf3b535 3551 random_u64()) < 0)
9447a7f1
LP
3552 return -ENOMEM;
3553
65089b82 3554 if (rename(fname, p) < 0)
9447a7f1
LP
3555 return -errno;
3556
f27a3864
LP
3557 /* btrfs doesn't cope well with our write pattern and
3558 * fragments heavily. Let's defrag all files we rotate */
11689d2a 3559
a67d68b8 3560 (void) chattr_path(p, 0, FS_NOCOW_FL);
f27a3864
LP
3561 (void) btrfs_defrag(p);
3562
65089b82 3563 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3564
57850536
AG
3565 return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3566 deferred_closes, template, ret);
9447a7f1
LP
3567}
3568
cf244689
LP
3569int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3570 uint64_t i, n;
3571 uint64_t q, xor_hash = 0;
3572 int r;
3573 EntryItem *items;
3574 dual_timestamp ts;
3575
3576 assert(from);
3577 assert(to);
3578 assert(o);
3579 assert(p);
3580
3581 if (!to->writable)
3582 return -EPERM;
3583
3584 ts.monotonic = le64toh(o->entry.monotonic);
3585 ts.realtime = le64toh(o->entry.realtime);
3586
cf244689 3587 n = journal_file_entry_n_items(o);
4faa7004 3588 /* alloca() can't take 0, hence let's allocate at least one */
cf409d15 3589 items = newa(EntryItem, MAX(1u, n));
cf244689
LP
3590
3591 for (i = 0; i < n; i++) {
4fd052ae
FC
3592 uint64_t l, h;
3593 le64_t le_hash;
cf244689
LP
3594 size_t t;
3595 void *data;
3596 Object *u;
3597
3598 q = le64toh(o->entry.items[i].object_offset);
3599 le_hash = o->entry.items[i].hash;
3600
3601 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3602 if (r < 0)
3603 return r;
3604
3605 if (le_hash != o->data.hash)
3606 return -EBADMSG;
3607
3608 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3609 t = (size_t) l;
3610
3611 /* We hit the limit on 32bit machines */
3612 if ((uint64_t) t != l)
3613 return -E2BIG;
3614
d89c8fdf 3615 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 3616#if HAVE_XZ || HAVE_LZ4
a7f7d1bd 3617 size_t rsize = 0;
cf244689 3618
d89c8fdf
ZJS
3619 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3620 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3621 if (r < 0)
3622 return r;
cf244689
LP
3623
3624 data = from->compress_buffer;
3625 l = rsize;
3b1a55e1
ZJS
3626#else
3627 return -EPROTONOSUPPORT;
3628#endif
cf244689
LP
3629 } else
3630 data = o->data.payload;
3631
3632 r = journal_file_append_data(to, data, l, &u, &h);
3633 if (r < 0)
3634 return r;
3635
3636 xor_hash ^= le64toh(u->data.hash);
3637 items[i].object_offset = htole64(h);
3638 items[i].hash = u->data.hash;
3639
3640 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3641 if (r < 0)
3642 return r;
3643 }
3644
fa6ac760
LP
3645 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3646
be7cdd8e 3647 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
fa6ac760
LP
3648 return -EIO;
3649
3650 return r;
cf244689 3651}
babfc091 3652
8580d1f7
LP
3653void journal_reset_metrics(JournalMetrics *m) {
3654 assert(m);
3655
3656 /* Set everything to "pick automatic values". */
3657
3658 *m = (JournalMetrics) {
3659 .min_use = (uint64_t) -1,
3660 .max_use = (uint64_t) -1,
3661 .min_size = (uint64_t) -1,
3662 .max_size = (uint64_t) -1,
3663 .keep_free = (uint64_t) -1,
3664 .n_max_files = (uint64_t) -1,
3665 };
3666}
3667
babfc091 3668void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3669 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3670 struct statvfs ss;
8580d1f7 3671 uint64_t fs_size;
babfc091
LP
3672
3673 assert(m);
3674 assert(fd >= 0);
3675
3676 if (fstatvfs(fd, &ss) >= 0)
3677 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7 3678 else {
8fc58f1a 3679 log_debug_errno(errno, "Failed to determine disk size: %m");
8580d1f7
LP
3680 fs_size = 0;
3681 }
babfc091
LP
3682
3683 if (m->max_use == (uint64_t) -1) {
3684
3685 if (fs_size > 0) {
3686 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3687
3688 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3689 m->max_use = DEFAULT_MAX_USE_UPPER;
3690
3691 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3692 m->max_use = DEFAULT_MAX_USE_LOWER;
3693 } else
3694 m->max_use = DEFAULT_MAX_USE_LOWER;
3695 } else {
3696 m->max_use = PAGE_ALIGN(m->max_use);
3697
8580d1f7 3698 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3699 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3700 }
3701
8580d1f7
LP
3702 if (m->min_use == (uint64_t) -1)
3703 m->min_use = DEFAULT_MIN_USE;
3704
3705 if (m->min_use > m->max_use)
3706 m->min_use = m->max_use;
3707
babfc091
LP
3708 if (m->max_size == (uint64_t) -1) {
3709 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3710
3711 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3712 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3713 } else
3714 m->max_size = PAGE_ALIGN(m->max_size);
3715
8580d1f7
LP
3716 if (m->max_size != 0) {
3717 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3718 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3719
8580d1f7
LP
3720 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3721 m->max_use = m->max_size*2;
3722 }
babfc091
LP
3723
3724 if (m->min_size == (uint64_t) -1)
3725 m->min_size = JOURNAL_FILE_SIZE_MIN;
3726 else {
3727 m->min_size = PAGE_ALIGN(m->min_size);
3728
3729 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3730 m->min_size = JOURNAL_FILE_SIZE_MIN;
3731
8580d1f7 3732 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3733 m->max_size = m->min_size;
3734 }
3735
3736 if (m->keep_free == (uint64_t) -1) {
3737
3738 if (fs_size > 0) {
8621b110 3739 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3740
3741 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3742 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3743
3744 } else
3745 m->keep_free = DEFAULT_KEEP_FREE;
3746 }
3747
8580d1f7
LP
3748 if (m->n_max_files == (uint64_t) -1)
3749 m->n_max_files = DEFAULT_N_MAX_FILES;
3750
3751 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3752 format_bytes(a, sizeof(a), m->min_use),
3753 format_bytes(b, sizeof(b), m->max_use),
3754 format_bytes(c, sizeof(c), m->max_size),
3755 format_bytes(d, sizeof(d), m->min_size),
3756 format_bytes(e, sizeof(e), m->keep_free),
3757 m->n_max_files);
babfc091 3758}
08984293
LP
3759
3760int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 3761 assert(f);
c88cc6af 3762 assert(f->header);
08984293
LP
3763 assert(from || to);
3764
3765 if (from) {
162566a4
LP
3766 if (f->header->head_entry_realtime == 0)
3767 return -ENOENT;
08984293 3768
162566a4 3769 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3770 }
3771
3772 if (to) {
162566a4
LP
3773 if (f->header->tail_entry_realtime == 0)
3774 return -ENOENT;
08984293 3775
162566a4 3776 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3777 }
3778
3779 return 1;
3780}
3781
3782int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3783 Object *o;
3784 uint64_t p;
3785 int r;
3786
3787 assert(f);
3788 assert(from || to);
3789
47838ab3 3790 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3791 if (r <= 0)
3792 return r;
3793
3794 if (le64toh(o->data.n_entries) <= 0)
3795 return 0;
3796
3797 if (from) {
3798 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3799 if (r < 0)
3800 return r;
3801
3802 *from = le64toh(o->entry.monotonic);
3803 }
3804
3805 if (to) {
3806 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3807 if (r < 0)
3808 return r;
3809
3810 r = generic_array_get_plus_one(f,
3811 le64toh(o->data.entry_offset),
3812 le64toh(o->data.entry_array_offset),
3813 le64toh(o->data.n_entries)-1,
3814 &o, NULL);
3815 if (r <= 0)
3816 return r;
3817
3818 *to = le64toh(o->entry.monotonic);
3819 }
3820
3821 return 1;
3822}
dca6219e 3823
fb0951b0 3824bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 3825 assert(f);
c88cc6af 3826 assert(f->header);
dca6219e
LP
3827
3828 /* If we gained new header fields we gained new features,
3829 * hence suggest a rotation */
361f9cbc
LP
3830 if (le64toh(f->header->header_size) < sizeof(Header)) {
3831 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3832 return true;
361f9cbc 3833 }
dca6219e
LP
3834
3835 /* Let's check if the hash tables grew over a certain fill
3836 * level (75%, borrowing this value from Java's hash table
3837 * implementation), and if so suggest a rotation. To calculate
3838 * the fill level we need the n_data field, which only exists
3839 * in newer versions. */
3840
3841 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3842 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3843 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3844 f->path,
3845 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3846 le64toh(f->header->n_data),
3847 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3848 (unsigned long long) f->last_stat.st_size,
3849 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3850 return true;
361f9cbc 3851 }
dca6219e
LP
3852
3853 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3854 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3855 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3856 f->path,
3857 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3858 le64toh(f->header->n_fields),
3859 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3860 return true;
361f9cbc 3861 }
dca6219e 3862
0598fd4a
LP
3863 /* Are the data objects properly indexed by field objects? */
3864 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3865 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3866 le64toh(f->header->n_data) > 0 &&
3867 le64toh(f->header->n_fields) == 0)
3868 return true;
3869
fb0951b0
LP
3870 if (max_file_usec > 0) {
3871 usec_t t, h;
3872
3873 h = le64toh(f->header->head_entry_realtime);
3874 t = now(CLOCK_REALTIME);
3875
3876 if (h > 0 && t > h + max_file_usec)
3877 return true;
3878 }
3879
dca6219e
LP
3880 return false;
3881}